In the last class, we learned how to scrape posts on the Gossiping board of PTT, specifically how to take advantage of a session to set the age-verification cookie to “yes” in order to bypass the age check. In this class, we will continue scraping Gossiping posts, this time those relevant to the three candidates in the 2024 Taiwan presidential election. We will also analyze the linguistic and psychological features of the comments on these posts with LIWC. First, let’s scrape the posts.
library(rvest)
library(dplyr)
url<-"https://www.ptt.cc/bbs/Gossiping/index.html"
gossip.session<-session(url)
gossip.form<-gossip.session %>%
html_node("form") %>%
html_form()
# Answer "yes" to the age-verification form; the session keeps the resulting cookie
gossiping<-session_submit(x=gossip.session,form=gossip.form,submit="yes")
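To confirm that the age check really was passed, a quick sanity check is to revisit the index page through the verified session and look for post titles; this sketch uses the same “.title” selector as the crawler below.
# Sanity check: a verified session should now see post titles rather than the age-verification page
gossiping %>% session_jump_to(url) %>%
html_nodes(".title") %>% html_text(trim=T) %>% head()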
Suppose we plan to scrape the most recent 20 pages (index pages 39123 down to 39104), which amount to roughly 400 posts.
postsinfo<-lapply(seq(39123,39104),function(x){
page<-paste0("https://www.ptt.cc/bbs/Gossiping/index",x,".html")
info<-tryCatch({
# All titles on this index page, including the placeholders of deleted posts
titles<-gossiping %>% session_jump_to(page) %>%
html_nodes(".title") %>% html_text(trim=T)
# Links exist only for posts that have not been deleted
links<-gossiping %>% session_jump_to(page) %>%
html_nodes(".title a") %>% html_attrs()
links<-unlist(links)
names(links)<-NULL
# Drop the titles of deleted posts so that titles and links stay aligned
tag<-sapply(titles,function(y)grepl("已被",y) & grepl("刪除",y))
titles<-titles[!tag]
temp<-list(title=titles,link=links)
return(temp)
},
error=function(cond){
message("Error on ",page)
return(NA)
},
warning=function(cond){
message("Warning on ",page)
return(NULL)
},
finally={
message("Done with ",page)
})
return(info)
})
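Before sorting out the results, it is worth peeking at the structure of what the crawler returned: each element should be a list with a vector of titles and a vector of links (or NA for a page on which an error occurred).
# Inspect the results of the first page and the number of pages collected
str(postsinfo[[1]])
length(postsinfo)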
Now let’s sort out the collected posts. We extract the titles and links from the object returned by our web crawler and put them together in a data frame with two columns, one for the titles and the other for the links. In case any unexpected problem occurs later, it is better to export this data frame to a file on your hard drive.
Title<-NULL
Link<-NULL
for (i in 1:length(postsinfo)){
# Skip pages for which the crawler returned NA instead of a list
if(!is.list(postsinfo[[i]]))next
Title<-c(Title,postsinfo[[i]]$title)
Link<-c(Link,postsinfo[[i]]$link)
}
gossip.dta<-data.frame(Title=Title,Link=Link)
write.table(gossip.dta,"gossip_dta.txt")
We now focus on each presidential candidate separately. It turns out that there are 17 posts exclusively about 賴 and 11 posts about 侯, but 30 posts about 柯, suggesting that the PTT volume for 柯 is still the highest among the three.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
gossip.dta<-read.table("gossip_dta.txt",header=T)
gossip.dta<-as_tibble(gossip.dta)
dta_Ly<-gossip.dta %>% filter(grepl("賴",Title) & !grepl("柯",Title) & !grepl("侯",Title))
dta_Kp<-gossip.dta %>% filter(grepl("柯",Title) & !grepl("賴",Title) & !grepl("侯",Title))
dta_Ho<-gossip.dta %>% filter(grepl("侯",Title) & !grepl("賴",Title) & !grepl("柯",Title))
c(nrow(dta_Ly),nrow(dta_Kp),nrow(dta_Ho))
## [1] 17 30 11
Now we start analyzing the sentiment of the posts about each candidate. For every post, we extract the numbers of likes and hates, the repliers’ IDs, and their comments.
# For 賴
replies_Ly<-sapply(dta_Ly$Link,function(URL){
p<-paste0("https://www.ptt.cc",URL)
push_tags<-gossiping %>% session_jump_to(p) %>%
html_nodes(".hl.push-tag") %>% html_text()
push_users<-gossiping %>% session_jump_to(p) %>%
html_nodes(".hl.push-userid") %>% html_text()
push_comments<-gossiping %>% session_jump_to(p) %>%
html_nodes(".push-content") %>% html_text()
push_tags<-paste(push_tags,collapse=",")
push_users<-paste(push_users,collapse=",")
push_comments<-paste(push_comments,collapse=",")
temp<-c(push_tags,push_users,push_comments)
return(temp)
})
repliesLy.dta<-tibble(tag=replies_Ly[1,],id=replies_Ly[2,],comment=replies_Ly[3,])
# For 柯
replies_Kp<-sapply(dta_Kp$Link,function(URL){
p<-paste0("https://www.ptt.cc",URL)
push_tags<-gossiping %>% session_jump_to(p) %>%
html_nodes(".hl.push-tag") %>% html_text()
push_users<-gossiping %>% session_jump_to(p) %>%
html_nodes(".hl.push-userid") %>% html_text()
push_comments<-gossiping %>% session_jump_to(p) %>%
html_nodes(".push-content") %>% html_text()
push_tags<-paste(push_tags,collapse=",")
push_users<-paste(push_users,collapse=",")
push_comments<-paste(push_comments,collapse=",")
temp<-c(push_tags,push_users,push_comments)
return(temp)
})
repliesKp.dta<-tibble(tag=replies_Kp[1,],id=replies_Kp[2,],comment=replies_Kp[3,])
# For 侯
replies_Ho<-sapply(dta_Ho$Link,function(URL){
p<-paste0("https://www.ptt.cc",URL)
push_tags<-gossiping %>% session_jump_to(p) %>%
html_nodes(".hl.push-tag") %>% html_text()
push_users<-gossiping %>% session_jump_to(p) %>%
html_nodes(".hl.push-userid") %>% html_text()
push_comments<-gossiping %>% session_jump_to(p) %>%
html_nodes(".push-content") %>% html_text()
push_tags<-paste(push_tags,collapse=",")
push_users<-paste(push_users,collapse=",")
push_comments<-paste(push_comments,collapse=",")
temp<-c(push_tags,push_users,push_comments)
return(temp)
})
repliesHo.dta<-tibble(tag=replies_Ho[1,],id=replies_Ho[2,],comment=replies_Ho[3,])
write.table(repliesLy.dta,"Ly.txt")
write.table(repliesKp.dta,"Kp.txt")
write.table(repliesHo.dta,"Ho.txt")
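The three scraping blocks above are identical except for the data frame they loop over. If you prefer, this step can be wrapped into a single helper function; the sketch below uses the same selectors and the same gossiping session as the code above.
# A reusable sketch: collect the push tags, replier ids, and comments of every post in dta
scrape_replies<-function(dta){
replies<-sapply(dta$Link,function(URL){
p<-paste0("https://www.ptt.cc",URL)
doc<-gossiping %>% session_jump_to(p)
push_tags<-doc %>% html_nodes(".hl.push-tag") %>% html_text()
push_users<-doc %>% html_nodes(".hl.push-userid") %>% html_text()
push_comments<-doc %>% html_nodes(".push-content") %>% html_text()
return(c(paste(push_tags,collapse=","),
paste(push_users,collapse=","),
paste(push_comments,collapse=",")))
})
return(tibble(tag=replies[1,],id=replies[2,],comment=replies[3,]))
}
# e.g., repliesLy.dta<-scrape_replies(dta_Ly)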
Let’s do a quick check of the attitudes of PTT users toward these candidates. We count the numbers of “推” and “噓” tags as likes and hates, respectively.
repliesLy.dta<-read.table("Ly.txt",header=T)
repliesKp.dta<-read.table("Kp.txt",header=T)
repliesHo.dta<-read.table("Ho.txt",header=T)
LH_Ly<-sapply(repliesLy.dta$tag,function(x){
temp<-unlist(strsplit(x,","))
names(temp)<-NULL
return(c(sum(temp=="推 "),sum(temp=="噓 ")))
})
colnames(LH_Ly)<-NULL
LH_Kp<-sapply(repliesKp.dta$tag,function(x){
temp<-unlist(strsplit(x,","))
names(temp)<-NULL
return(c(sum(temp=="推 "),sum(temp=="噓 ")))
})
colnames(LH_Kp)<-NULL
LH_Ho<-sapply(repliesHo.dta$tag,function(x){
temp<-unlist(strsplit(x,","))
names(temp)<-NULL
return(c(sum(temp=="推 "),sum(temp=="噓 ")))
})
colnames(LH_Ho)<-NULL
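Each of LH_Ly, LH_Kp, and LH_Ho is a 2-by-n matrix whose first row holds the number of likes and whose second row holds the number of hates of each post, so the per-candidate totals can be eyeballed quickly:
# Total likes (row 1) and hates (row 2) for each candidate
rowSums(LH_Ly)
rowSums(LH_Kp)
rowSums(LH_Ho)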
One-way between-subjects ANOVAs are conducted for likes and for hates separately, along with a two-way ANOVA crossing valence (Like vs. Hate) and candidate. None of the effects is statistically significant. However, the pattern seems to show that 賴 gets the most positive replies, 柯 the second most, and 侯 the fewest. Interestingly, 賴 also gets the most negative replies, followed by the other two candidates. The figure below suggests that PTT users have a weaker impression of 侯 than of the other two candidates. The t-test results show that only for 柯 are the likes significantly more numerous than the hates; for the other two candidates the difference is not significant. To sum up, PTT users might be relatively more positive toward 柯 than toward the other two.
LH.dta<-data.frame(score=c(LH_Ly[1,],LH_Kp[1,],LH_Ho[1,],LH_Ly[2,],LH_Kp[2,],LH_Ho[2,]),
pn=rep(c("Like","Hate"),c(17+30+11,17+30+11)),
cand=rep(c("L","K","H"),c(17,30,11)))
summary(aov(score~pn*cand,LH.dta))
## Df Sum Sq Mean Sq F value Pr(>F)
## pn 1 18000 18000 3.366 0.0693 .
## cand 2 17141 8570 1.602 0.2061
## pn:cand 2 4767 2384 0.446 0.6416
## Residuals 110 588326 5348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(score~cand,subset(LH.dta,pn=="Like")))
## Df Sum Sq Mean Sq F value Pr(>F)
## cand 2 19685 9842 0.995 0.376
## Residuals 55 543852 9888
summary(aov(score~cand,subset(LH.dta,pn=="Hate")))
## Df Sum Sq Mean Sq F value Pr(>F)
## cand 2 2223 1111.5 1.375 0.261
## Residuals 55 44475 808.6
library(ggplot2)
LH.fg.dta<-data.frame(mean=c(with(LH.dta,tapply(score,list(pn,cand),mean))),
ses=c(with(LH.dta,tapply(score,list(pn,cand),sd))/sqrt(matrix(c(11,30,17),2,3,byrow=T))),
cand=c("H","H","K","K","L","L"),
pn=rep(c("Hate","Like"),3))
LH.fg.dta %>% ggplot(aes(cand,mean,fill=cand))+
geom_bar(stat="identity")+geom_errorbar(aes(ymin=mean-ses,ymax=mean+ses),width=0.2)+
facet_wrap(.~pn)
# Compare the difference between Likes and Hates across all candidates
LH.d.dta<-data.frame(delta=c(LH_Ly[1,]-LH_Ly[2,],LH_Kp[1,]-LH_Kp[2,],LH_Ho[1,]-LH_Ho[2,]),
cand=rep(c("L","K","H"),c(17,30,11)))
summary(aov(delta~cand,LH.d.dta))
## Df Sum Sq Mean Sq F value Pr(>F)
## cand 2 9534 4767 0.701 0.501
## Residuals 55 374172 6803
with(subset(LH.d.dta,cand=="L"),t.test(delta,mu=0))
##
## One Sample t-test
##
## data: delta
## t = 1.2947, df = 16, p-value = 0.2138
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -26.80559 110.92324
## sample estimates:
## mean of x
## 42.05882
with(subset(LH.d.dta,cand=="K"),t.test(delta,mu=0))
##
## One Sample t-test
##
## data: delta
## t = 2.412, df = 29, p-value = 0.02242
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 3.421163 41.578837
## sample estimates:
## mean of x
## 22.5
with(subset(LH.d.dta,cand=="H"),t.test(delta,mu=0))
##
## One Sample t-test
##
## data: delta
## t = 0.49038, df = 10, p-value = 0.6344
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -17.71867 27.71867
## sample estimates:
## mean of x
## 5
LIWC consists of two parts: a dictionary and a word-counting engine. The LIWC dictionary contains more than 70 categories of words, each of which corresponds to a linguistic or psychological feature, for example, the first-person singular pronouns (e.g., I), positive emotional words (e.g., happy), and negative emotional words (e.g., unhappy). The word-counting engine transforms a text into a multinomial distribution of frequencies/probabilities over these word categories. Since the word-counting engine is quite simple, we can build one in R. The dictionary is the core of LIWC. There is a Chinese version of LIWC, CLIWC, which is maintained by Dr. Huang at NTTU. Those who are interested in CLIWC are encouraged to purchase a copy.
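To make the idea of the word-counting engine concrete before we load the real dictionary, here is a minimal sketch built around a toy dictionary; the words and category labels below are made up purely for illustration.
# A toy word-counting engine: look up each word's categories and tally their relative frequencies
toy_dict<-list("我"="i","開心"="posemo","難過"="negemo") # made-up mini dictionary
count_categories<-function(words){
codes<-unlist(toy_dict[words]) # words not in the dictionary simply drop out
return(table(codes)/length(words)) # category counts relative to the total word count
}
count_categories(c("我","今天","很","開心"))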
The codebook of CLIWC is contained in cliwc_v1.1.txt, and the table mapping codes to word categories is contained in cliwc_index.txt. The Chinese descriptions of these word categories are contained in tc-liwc.docx.
Before we transform the comments on these posts into LIWC word distributions, we need to segment the comments into words.
As usual, we use {jiebaR}. First, we import the data from the files. Second, we clean up the comments by removing punctuation and concatenating each post’s comments into one long string without spaces.
repliesLy.dta<-read.table("Ly.txt",header=T)
repliesKp.dta<-read.table("Kp.txt",header=T)
repliesHo.dta<-read.table("Ho.txt",header=T)
# Clean up the comments: strip the comma separators, colons, and spaces
Ly_comments<-sapply(repliesLy.dta$comment,function(x){
x<-gsub(",","",x)
x<-gsub(",","",x)
x<-gsub(":","",x)
x<-gsub(" ","",x)
return(x)
})
Kp_comments<-sapply(repliesKp.dta$comment,function(x){
x<-gsub(",","",x)
x<-gsub(",","",x)
x<-gsub(":","",x)
x<-gsub(" ","",x)
return(x)
})
Ho_comments<-sapply(repliesHo.dta$comment,function(x){
x<-gsub(",","",x)
x<-gsub(",","",x)
x<-gsub(":","",x)
x<-gsub(" ","",x)
return(x)
})
Third, we can now do the Chinese word segmentation. We defined two files in advance. One (user_defined_ptt.txt) contains the user-defined words, i.e., words that must be segmented out as units. The other (user_stopwords_ptt.txt) contains the stop words, i.e., words that should be omitted.
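Both files are plain text with one entry per line, which is the format {jiebaR} expects for user dictionaries and stop-word lists; you can peek at their contents with readLines().
# Inspect the first few entries of each file
head(readLines("user_defined_ptt.txt"))
head(readLines("user_stopwords_ptt.txt"))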
library(jiebaR)
## Loading required package: jiebaRD
wk<-worker(user="user_defined_ptt.txt",stop_word="user_stopwords_ptt.txt")
Ly_comments<-sapply(Ly_comments,function(x){
temp<-wk[x]
temp<-paste(temp,collapse=" ")
return(temp)
})
names(Ly_comments)<-NULL
Kp_comments<-sapply(Kp_comments,function(x){
temp<-wk[x]
temp<-paste(temp,collapse=" ")
return(temp)
})
names(Kp_comments)<-NULL
Ho_comments<-sapply(Ho_comments,function(x){
temp<-wk[x]
temp<-paste(temp,collapse=" ")
return(temp)
})
names(Ho_comments)<-NULL
In addition to checking the sentiment valence of the comments, we can also use the word counts of the comments to gauge the PTT volume of each candidate. Although the word counts of the comments rank 賴 > 柯 > 侯, a one-way between-subjects ANOVA is not significant.
# Count the number of space-separated tokens (i.e., segmented words) in each comment string
WC<-function(v){
counts<-sapply(v,function(x){
return(length(unlist(strsplit(x," "))))
})
return(counts)
}
Ly_wc<-WC(Ly_comments)
names(Ly_wc)<-NULL
Kp_wc<-WC(Kp_comments)
names(Kp_wc)<-NULL
Ho_wc<-WC(Ho_comments)
names(Ho_wc)<-NULL
wc.dta<-data.frame(wc=c(Ly_wc,Kp_wc,Ho_wc),cand=rep(c("L","K","H"),c(17,30,11)))
summary(aov(wc~cand,wc.dta))
## Df Sum Sq Mean Sq F value Pr(>F)
## cand 2 3591978 1795989 0.891 0.416
## Residuals 55 110841688 2015303
means<-with(wc.dta,tapply(wc,cand,mean))
# Group sizes in the alphabetical order of cand: H=11, K=30, L=17
ses<-with(wc.dta,tapply(wc,cand,sd))/sqrt(c(11,30,17))
wc.fig.dta<-data.frame(mean=means,ses=ses,cand=c("H","K","L"))
wc.fig.dta %>% ggplot(aes(cand,mean,fill=cand))+geom_bar(stat="identity")+
geom_errorbar(aes(ymin=mean-ses,ymax=mean+ses),width=0.2)
We can now look up the CLIWC word categories of these comments. The idea is to check each word in the comments against the dictionary and retrieve its corresponding CLIWC category codes.
# Read the CLIWC dictionary and replace tabs with spaces, so that each line becomes a
# space-separated record whose first field is the word
cliwc_codes<-scan("cliwc_v1.1.txt",what=character(),sep="\n")
cliwc_codes<-sapply(cliwc_codes,function(x){
return(gsub("\t"," ",x))
})
# Extract the word (the first field) of each dictionary line
cliwc_ws<-sapply(cliwc_codes,function(x)unlist(strsplit(x," "))[1])
names(cliwc_codes)<-NULL
CLIWC_codes<-function(v){
One_codes<-sapply(v,function(x){
temp<-unlist(strsplit(x," "))
codes<-lapply(temp,function(y){
# Look up this word in the CLIWC dictionary
flag<-which(y==cliwc_ws)
if(length(flag)==0){
# Words not in the dictionary get the dummy code 0
return(0)
}else{
# Drop the first two fields of the line and keep the category codes
tt<-unlist(strsplit(cliwc_codes[flag]," "))[-c(1,2)]
return(tt)
}
})
return(paste(unlist(codes),collapse=" "))
})
names(One_codes)<-NULL
return(One_codes)
}
Ly_codes<-CLIWC_codes(Ly_comments)
Kp_codes<-CLIWC_codes(Kp_comments)
Ho_codes<-CLIWC_codes(Ho_comments)
We can now convert these codes into category frequencies across all words in the comments. Suppose we focus on the CLIWC categories i, negate, affect, cogmech, swear, discrep, cause, insight, work, and achieve. The bar plot below suggests that, no matter which candidate, swear is the most frequent of these categories. This is not surprising, as PTT users are notorious for their harsh attitude toward public affairs. However, compared with the other two candidates, the comments on posts about 侯 contain fewer work-related words and fewer first-person singular pronouns. Perhaps the posts related to 侯 have less to do with employment policy. Nonetheless, the LIWC analysis shows that PTT users used many swear words when replying to posts about all three presidential candidates, implying a strong emotional reaction toward them.
Ly_codes<-unlist(strsplit(paste(Ly_codes,collapse=" ")," "))
Kp_codes<-unlist(strsplit(paste(Kp_codes,collapse=" ")," "))
Ho_codes<-unlist(strsplit(paste(Ho_codes,collapse=" ")," "))
cliwc_indices<-scan("cliwc_index.txt",what=character(),sep="\n")
cliwc_indices<-sapply(cliwc_indices,function(x)gsub("\t"," ",x))
names(cliwc_indices)<-NULL
tLy_codes<-table(Ly_codes)
tKp_codes<-table(Kp_codes)
tHo_codes<-table(Ho_codes)
all.codes<-unique(c(names(tLy_codes),names(tKp_codes),names(tHo_codes)))
all.codes<-all.codes[-c(1,72)]
# The LIWC codes of i, negate, affect, cogmech, swear, discrep, cause, insight, work, achieve
# are 4, 19, 22, 125, 131, 132, 133, 134, 354, 355, respectively
Probs<-sapply(c(4, 19, 22, 125, 131, 132, 133, 134, 354, 355),function(x){
# For each code, the relative frequency in the comments for 賴, 柯, and 侯 (in that order)
return(c(tLy_codes[which(names(tLy_codes)==x)]/sum(tLy_codes),
tKp_codes[which(names(tKp_codes)==x)]/sum(tKp_codes),
tHo_codes[which(names(tHo_codes)==x)]/sum(tHo_codes)))
})
data.frame(p=c(Probs),cand=rep(c("L","K","H"),10),
LIWC=rep(c("i","negate","affect","cogmech","swear",
"discrep","cause","insight","work","achieve"),each=3)) %>%
ggplot(aes(LIWC,p,fill=cand))+geom_bar(stat="identity")+
scale_x_discrete(guide=guide_axis(angle=45))