
# --- Chinese Text Processing: the Sheng Xuanhuai Collection --- #

# This script was used to create the post on the RSDI blog.


# Import Text ---------------------------------------------------

# point R at the folder that holds the data
# (replace the placeholder with your own path)
setwd("YOUR WORKING DIRECTORY")

# read the spreadsheet into a data frame: the first row supplies the
# column names, the first column supplies the row names, strings are
# declared as UTF-8 and are kept as character (not converted to factors)
v36 <- read.csv(
  file = "v36.csv",
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)

# show the first two rows of the data
head(v36, n = 2)


# Segmenting Chinese Text ---------------------------------------

# load the "jiebaR" and "stringr" packages/libraries
library(jiebaR)
library(stringr)

# initialize jiebaR worker
cutter <- worker()

# test the worker
cutter["今天的天氣真好"]

# define the segmenting function: pass one document to the worker and
# join the pieces it returns into a single string separated by spaces
seg_x <- function(x) {
  str_c(cutter[x], collapse = " ")
}

# apply the function to each document (row of ltext);
# vapply() checks that every document yields exactly one string and always
# returns a character vector, whereas sapply() returns a list when ltext
# is empty or when a result does not have length one
x.out <- vapply(v36$ltext, seg_x, character(1), USE.NAMES = FALSE)

# attach the segmented text back to the data frame
v36$ltext.seg <- x.out

# view the first two rows of the data frame
head(v36, 2)


# Create corpus and document-term/feature-matrix ----------------

# load the library
library(quanteda)

# build a corpus from the segmented text
lcorpus <- corpus(v36$ltext.seg)

# summarize the lcorpus object (n = 5, metadata shown)
summary(lcorpus, n = 5, showmeta = TRUE)

# print the text of the 1st document in lcorpus
texts(lcorpus)[1]

# The dfm is built with "terms/features" split by whitespace,
# i.e. it preserves the segmentation already done by jiebaR.

# tokenize: the "tokens" of doc 1, split by whitespace
tokens(lcorpus, what = "fasterword")[1]

# tokenize the segmented text, then build the document-feature matrix
ltokens <- tokens(
  v36$ltext.seg,
  what = "fasterword"
)
ldfm <- dfm(ltokens)

# a dfm with 245 documents and 8052 features
ldfm

# list the 20 top features
topfeatures(ldfm, n = 20)

# plot wordcloud
# (the font family sets a Chinese font on Mac;
#  you may not need to set a font on Windows)
par(family = "Kaiti TC")
textplot_wordcloud(
  ldfm,
  colors = RColorBrewer::brewer.pal(n = 8, name = "Dark2"),
  min.freq = 30,
  random.order = FALSE
)


# tidy up: remove every object from the R environment
rm(list = ls())

# Combine multiple data files -----------------------------------

# define the function of combining multiple files
#
# Arguments:
#   mypath  - path of the folder that holds the files to combine.
#   pattern - regular expression, matched case-insensitively against the
#             file names; only matching files are imported. The default
#             keeps files whose names end in ".csv".
#
# Value:
#   one data frame holding the rows of all imported files, stacked in the
#   order list.files() returns them; NULL when no file matches.
#
# Each file is read with a header row, its first column as row names, and
# strings kept as character. An error is raised if the folder does not exist.
multcomb <- function(mypath, pattern = "\\.csv$") {
  # fail early with a clear message; list.files() on a missing folder
  # would just return nothing and the function would silently give NULL
  if (!dir.exists(mypath)) {
    stop("folder not found: ", mypath, call. = FALSE)
  }
  # save the matching file names (with path) in an object "filenames"
  filenames <- list.files(path = mypath, pattern = pattern,
                          full.names = TRUE, ignore.case = TRUE)
  # a non-recursive listing also contains sub-folders; drop them so that
  # read.csv() is only ever given files
  filenames <- filenames[!dir.exists(filenames)]
  # import all files and save them as "datalist"
  datalist <- lapply(filenames, function(x) {
    read.csv(file = x, encoding = "UTF-8", header = TRUE,
             row.names = 1, stringsAsFactors = FALSE)
  })
  # combine the files (data frames in "datalist") in a single rbind() call
  # rather than re-copying the growing result once per file
  do.call(rbind, datalist)
}

# Combine the files in the folder with the function multcomb:
# before executing the function, save all the .csv files in one folder
# (ideally a folder that contains nothing else)
mydata <- multcomb(mypath = "YOUR PATH OF THE FOLDER")

# show the first two rows of mydata
head(mydata, n = 2)


# Segmenting: stopwords and dictionary --------------------------

# see the stopwords and dictionary
readLines('sheng_stop.txt', encoding = 'UTF-8')
readLines('sheng_dic.txt', encoding = 'UTF-8')

# set up the worker with the stop-word file and the user dictionary file
cutter <- worker(stop_word = 'sheng_stop.txt', user = 'sheng_dic.txt')

# segmenting function: pass one document to the worker and join the
# pieces it returns into a single string separated by spaces
seg_x <- function(x) {
  str_c(cutter[x], collapse = " ")
}

# segment every document (row of ltext);
# vapply() checks that every document yields exactly one string and always
# returns a character vector, whereas sapply() returns a list when ltext
# is empty or when a result does not have length one
mydata$ltext.seg <- vapply(mydata$ltext, seg_x, character(1),
                           USE.NAMES = FALSE)

# view the first few rows
head(mydata, 2)


# Create corpus, DFM and feature frequency tab ------------------

# build the corpus from the segmented text and summarize it
# (n = 5, metadata shown)
mycorpus <- corpus(mydata$ltext.seg)
summary(mycorpus, n = 5, showmeta = TRUE)

# print the text of the first document of the corpus
texts(mycorpus)[1]

# tokenize the segmented text on whitespace, build the DFM, and print it
mydfm <- dfm(
  tokens(mydata$ltext.seg, what = "fasterword")
)
mydfm

# the 20 top features
topfeatures(mydfm, n = 20)

# tabulate feature frequency and look at the head of the table
dfmtab <- textstat_frequency(mydfm)
head(dfmtab)

# keep only the features that are at least 2 characters long
mydfm2 <- dfm_select(mydfm, min_nchar = 2)
topfeatures(mydfm2, n = 20)

# plot wordcloud
par(family = "Kaiti TC")
textplot_wordcloud(
  mydfm2,
  colors = RColorBrewer::brewer.pal(n = 8, name = "Dark2"),
  min.freq = 5,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.25,
  scale = c(2.8, 0.5)
)


# Explore more about the data -----------------------------------

# tabulate the 10 top features
textstat_frequency(mydfm2, n = 10)

# plot frequency against rank for the 50 most frequent features
library(ggplot2)
theme_set(theme_minimal())
ggplot(
  data = textstat_frequency(mydfm2, n = 50),
  mapping = aes(x = rank, y = frequency)
) +
  geom_point() +
  labs(x = "Frequency rank", y = "Term frequency")

# create a dfm weighted with the "relfreq" scheme
# (relative term frequencies)
dfmpct <- dfm_weight(mydfm2, type = "relfreq")

# bar chart of the 10 top features of the weighted dfm, bars ordered by
# rank and drawn horizontally; the text uses the 'STKaiti' font family
ggplot(
  data = textstat_frequency(dfmpct, n = 10),
  mapping = aes(x = reorder(feature, -rank), y = frequency)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "", y = "Relative Term Frequency") +
  theme(text = element_text(family = "STKaiti"))

