12  Natural language processing

This section deals with the handling of text data. R has several excellent libraries for this purpose, such as tm, tidytext, textmineR and quanteda.

In the Data-Wrangling chapter, we introduced a section on regular expressions. Much of the work here involves cleaning documents so that unnecessary content such as stop words and extra white space is removed. Examples of stop words include “I”, “he”, “she” and “they”. It is also important to convert words to lower case; otherwise “London” and “london” are treated as two different words.
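
As a minimal sketch of these cleaning steps (using base R and the tm package, with an invented example sentence):

library(tm)

txt <- "I think London is larger than london.  "
txt <- tolower(txt)                            #"London" and "london" become the same token
txt <- removeWords(txt, stopwords("english"))  #drop stop words such as "i", "is" and "than"
txt <- stripWhitespace(trimws(txt))            #remove the extra white space
txt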

12.1 Bag of words

Bag of words or unigram analysis describes data in which the words in a sentence are separated or tokenised. Later we will demonstrate bigram analysis. Within a bag of words, the order of words in the document is not retained. Depending on how this process is performed, a negative connotation may be lost: consider “not green”, where after cleaning of the document only the color “green” remains.
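
A minimal sketch of unigram tokenisation with the tidytext package (the one-sentence document is invented) illustrates how the negation is lost:

library(dplyr)
library(tidytext)

#invented single-sentence document
doc <- data.frame(id = 1, text = "The wall is not green")

#tokenise into unigrams and remove stop words
doc %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")
#only "wall" and "green" remain; the negation "not" is lost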

12.2 Extracting data from PubMed

Data can be extracted directly from PubMed.

library(PubMedWordcloud)

#can only extract one word at a time, limit 100 abstracts
pmid = getPMIDsByKeyWords(keys = "breast cancer", 
                          dFrom = 2021, dTo = 2021, n=100)
pmid1 = getPMIDsByKeyWords(keys = "stroke", 
                          dFrom = 2021, dTo = 2021,n=100)

#combine 2 search words
abstracts = getAbstracts(c(pmid,pmid1)) 
There are total 200 PMIDs
downloading abstracts for PMIDs from 1 to 100 ...
downloading abstracts for PMIDs from 101 to 200 ...
#remove stop words
cleanAbs = cleanAbstracts(abstracts)

#plot
plotWordCloud(cleanAbs)

This is an example using the RISmed library to extract data from PubMed on electronic medical records and text mining for 2021.

#library(adjutant)
library(RISmed)
library(ggplot2)
library(dplyr)

library(SnowballC)
library(wordcloud)
library(lattice)
library(tm)
library(tidytext)
library(tidyr)
library(stringr)

#search 25/9/21
#query<-"electronic medical record + text mining"
#ngs_search <- EUtilsSummary(query, type="esearch",db = "pubmed",mindate=2016, maxdate=2018, retmax=30000)
#summary(ngs_search)
#QueryCount(ngs_search)
#ngs_records <- EUtilsGet(ngs_search)
#save(ngs_records,file="ngs_records.Rda")

#bug in system and EUtilsSummary is not working
#ISOAbbreviation

#reload saved search
load("EMR_Textmiing_ngs_records.Rda")

12.2.1 Obtain abstracts from PubMed

#extract title and abstract
pubmed_data <- data.frame('Pmid'=PMID(ngs_records),
        'Year'=YearPubmed(ngs_records),
        'Title'=ArticleTitle(ngs_records),
        'Journal'=MedlineTA(ngs_records),
        'Abstract'=AbstractText(ngs_records))

pubmed_data$Abstract <- as.character(pubmed_data$Abstract)
pubmed_data$Abstract <- gsub(",", " ", pubmed_data$Abstract, fixed = TRUE)

####

#partition data
#high impact factor journals
#| is or
#note Lancet includes Lancet Neurology etc
hi <- pubmed_data[grepl("Lancet|Neurology|N Engl J Med|Ann Neurol", pubmed_data$Journal),]
 #hi1 <- hi$Pmid %>% list
 #pubmed_data <- pubmed_data %>% 
  # mutate(Pmid  = as.character(Pmid)) 

#low impact factor journals
li <- pubmed_data[grepl("Mult Scler|Int J MS Care|J Neurol|Cochrane|BMC|PLoS|BMJ Open", pubmed_data$Journal),]
#li1 <- li$Pmid %>% list
 #pubmed_data <- pubmed_data %>% 
  # mutate(Pmid  = as.character(Pmid)) 
 
#join
hia<-paste(hi$Abstract, collapse="")
lia<-paste(li$Abstract,collapse="")
 
#combine
all<-c(hia,lia)

This creates separate collections of abstracts from high and low impact factor journals.

12.2.2 Clean text data

This is an example of cleaning text data. First the corpus is created.

#create a list of stop words
mystopwords = bind_rows(
  data.frame(word = c("It","mg","kg","journals","medline","embase","ebsco","cinahl",
                      "background","method","results","conclusion","http","web",
                      "i","ii","iii","ci","jan","january","feb","february","march",
                      "april","may","june","july","august","sept","september","oct",
                      "october","nov","november","dec","december"),
             lexicon = c("custom")),
  stop_words)
 
#abstract
abs<-pubmed_data$Abstract
abs<-iconv(abs, to = 'utf-8')
abs <- (abs[!is.na(abs)])
abCorpus<-VCorpus(VectorSource(abs))
ab<-tidy(abCorpus)

#token words
ab_word<-ab %>% unnest_tokens(word,text) %>%
  mutate(word = gsub("[^A-Za-z ]","",word)) %>% 
  filter(word != "") %>%
  #anti_join(stop_words) %>%
  #use customised stopwords
  anti_join(mystopwords) 
Joining with `by = join_by(word)`
colnames(ab_word)
[1] "author"        "datetimestamp" "description"   "heading"      
[5] "id"            "language"      "origin"        "word"         

Plot wordcloud

#plot wordcloud
ab_word %>% count(word) %>% 
  with(wordcloud(word, n, min.freq = 50, max.words = 200, 
       colors = brewer.pal(8, "Dark2"), scale = c(6, .3), 
       rot.per = 0.4))

Note the effect on the shape of the word cloud as we change the word scale and the maximum number of words.

#plot wordcloud
ab_word %>% count(word) %>% 
  with(wordcloud(word, n, min.freq = 50, max.words = 500, 
       colors = brewer.pal(8, "Dark2"), scale = c(4, .2), 
       rot.per = 0.4))

12.2.3 Sentiment in text data

Plot a word cloud with negative and positive sentiment from the Bing lexicon. Other sentiment lexicons include AFINN, Loughran and NRC.

library(reshape2)

ab_word %>% inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort=TRUE) %>%
  acast(word~sentiment,value.var = "n",fill=0) %>%
  comparison.cloud(colors = c("blue","red"),max.words = 100)
Joining with `by = join_by(word)`

Find pairs of words that tend to occur together.

library(extrafont)
library(igraph)

library(ggraph)
library(widyr)
library(viridis)
#abstract
ab_word_cors <- 
  ab_word %>% 
  mutate(section = row_number() %/% 10) %>%
  filter(section > 0) %>%
  filter(!word %in% stop_words$word) %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, section, sort = TRUE)

Plot pairs of words

ab_word_cors %>%
  filter(correlation > .5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), 
    show.legend = FALSE) + 
    geom_node_point(color ="#27408b", size = 5) +
    geom_node_text(aes(label = name), repel = TRUE) +
    theme_void(base_family="Roboto")+
    labs(title="Pairs of words in publications on electronic medical record and Text mining ")

12.3 Bigram analysis

Previously we treated words as unigrams. Here we group pairs of adjacent words as bigrams.

ab_bigrams <- ab %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  mutate(bigram = gsub("[^A-Za-z ]","", bigram)) %>% 
  filter(bigram != "") 
Warning: Outer names are only allowed for unnamed scalar atomic inputs
bigrams_separated <- ab_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% mystopwords$word) %>%
  filter(!word2 %in% mystopwords$word)


bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")
bigrams_united
# A tibble: 21,793 × 8
   author datetimestamp       description heading id    language origin bigram  
   <lgl>  <dttm>              <lgl>       <lgl>   <chr> <chr>    <lgl>  <chr>   
 1 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     growing…
 2 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     elderly…
 3 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     populat…
 4 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     incurab…
 5 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     chronic…
 6 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     continu…
 7 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     medical…
 8 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     service…
 9 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     mental …
10 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     impairm…
# ℹ 21,783 more rows
bigram_graph <- bigram_counts %>%
  filter(n > 10) %>%
  graph_from_data_frame()
bigram_graph
IGRAPH f45d754 DN-- 107 97 -- 
+ attr: name (v/c), n (e/n)
+ edges from f45d754 (vertex names):
 [1]           ->            electronic->health      machine   ->learning   
 [4] health    ->records     text      ->mining      data      ->mining     
 [7] natural   ->language    language  ->processing  medical   ->records    
[10]           ->patients    electronic->medical     health    ->care       
[13] ehr       ->data        health    ->record      free      ->text       
[16] clinical  ->notes       deep      ->learning    clinical  ->data       
[19] entity    ->recognition medical   ->record      processing->nlp        
[22] neural    ->network     records   ->ehrs        logistic  ->regression 
+ ... omitted several edges

12.3.1 Network of bigrams

The relationships among the bigrams are illustrated here.

library(tidygraph)

as_tbl_graph(bigram_graph)
# A tbl_graph: 107 nodes and 97 edges
#
# A directed multigraph with 20 components
#
# A tibble: 107 × 1
  name        
  <chr>       
1 ""          
2 "electronic"
3 "machine"   
4 "health"    
5 "text"      
6 "data"      
# ℹ 101 more rows
#
# A tibble: 97 × 3
   from    to     n
  <int> <int> <int>
1     1     1   533
2     2     4   173
3     3    27   119
# ℹ 94 more rows
set.seed(2017)
#plot(bigram_graph)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point(color = "red") +
  geom_node_text(aes(label = name), size=3,vjust = 1, hjust = 1)

12.4 Trigram

ab_trigrams <- ab %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  mutate(trigram = gsub("[^A-Za-z ]","", trigram)) %>% 
  filter(trigram != "") 
Warning: Outer names are only allowed for unnamed scalar atomic inputs
trigrams_separated <- ab_trigrams %>%
  separate(trigram, c("word1", "word2","word3"), sep = " ")
Warning: Expected 3 pieces. Missing pieces filled with `NA` in 72149 rows [1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
trigrams_filtered <- trigrams_separated %>%
  filter(!word1 %in% mystopwords$word) %>%
  filter(!word2 %in% mystopwords$word) %>%
filter(!word3 %in% mystopwords$word)

trigram_counts <- trigrams_filtered %>% 
  count(word1, word2, word3, sort = TRUE)

trigram_counts
# A tibble: 14,068 × 4
   word1        word2        word3     n
   <chr>        <chr>        <chr> <int>
 1 ""           ""           <NA>    533
 2 "electronic" "health"     <NA>    173
 3 "machine"    "learning"   <NA>    119
 4 "health"     "records"    <NA>    108
 5 "text"       "mining"     <NA>     91
 6 "data"       "mining"     <NA>     76
 7 "natural"    "language"   <NA>     68
 8 "language"   "processing" <NA>     66
 9 "medical"    "records"    <NA>     66
10 ""           "patients"   <NA>     64
# ℹ 14,058 more rows
trigrams_united <- trigrams_filtered %>%
  unite(trigram, word1, word2, word3, sep = " ")
trigrams_united
# A tibble: 21,793 × 8
   author datetimestamp       description heading id    language origin trigram 
   <lgl>  <dttm>              <lgl>       <lgl>   <chr> <chr>    <lgl>  <chr>   
 1 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     growing…
 2 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     elderly…
 3 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     populat…
 4 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     incurab…
 5 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     chronic…
 6 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     continu…
 7 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     medical…
 8 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     service…
 9 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     mental …
10 NA     2025-02-11 02:43:08 NA          NA      1     en       NA     impairm…
# ℹ 21,783 more rows
trigram_graph <- trigram_counts %>%
  filter(n > 10) %>%
  graph_from_data_frame()
trigram_graph
IGRAPH f62da48 DN-- 107 97 -- 
+ attr: name (v/c), word3 (e/c), n (e/n)
+ edges from f62da48 (vertex names):
 [1]           ->            electronic->health      machine   ->learning   
 [4] health    ->records     text      ->mining      data      ->mining     
 [7] natural   ->language    language  ->processing  medical   ->records    
[10]           ->patients    electronic->medical     health    ->care       
[13] ehr       ->data        health    ->record      free      ->text       
[16] clinical  ->notes       deep      ->learning    clinical  ->data       
[19] entity    ->recognition medical   ->record      processing->nlp        
[22] neural    ->network     records   ->ehrs        logistic  ->regression 
+ ... omitted several edges

12.4.1 Network of trigrams

The relationships among the trigrams are illustrated here.

as_tbl_graph(trigram_graph)
# A tbl_graph: 107 nodes and 97 edges
#
# A directed multigraph with 20 components
#
# A tibble: 107 × 1
  name        
  <chr>       
1 ""          
2 "electronic"
3 "machine"   
4 "health"    
5 "text"      
6 "data"      
# ℹ 101 more rows
#
# A tibble: 97 × 4
   from    to word3     n
  <int> <int> <chr> <int>
1     1     1 <NA>    533
2     2     4 <NA>    173
3     3    27 <NA>    119
# ℹ 94 more rows
set.seed(2017)
#plot(trigram_graph)
ggraph(trigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point(color = "red") +
  geom_node_text(aes(label = name), size=3,vjust = 1, hjust = 1)

12.5 Topic modeling or thematic analysis

Two methods for unsupervised thematic analysis, NMF and probabilistic topic modeling, are illustrated.

12.5.1 TFIDF

Term frequency is the frequency of a term within a document. Document frequency is how often a term is used across documents. The inverse document frequency can be seen as a weight that decreases the importance of words used commonly across documents. Term frequency-inverse document frequency (tf-idf) is therefore used to down-weight common terms and highlight terms that are important within a document. The topic modeling example below shows how a tf-idf can be created. Other packages such as tidytext and textmineR also have functions for creating tf-idf.
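
As a brief sketch, tf-idf can also be computed with tidytext::bind_tf_idf on the tokenised ab_word data from earlier; treating the corpus id as the document identifier here is an assumption:

library(dplyr)
library(tidytext)

ab_tfidf <- ab_word %>%
  count(id, word, sort = TRUE) %>%  #term frequency of each word per abstract
  bind_tf_idf(word, id, n) %>%      #add tf, idf and tf_idf columns
  arrange(desc(tf_idf))

head(ab_tfidf)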

library(slam)
library(topicmodels)

ab2<-ab %>% select(text) %>% as.data.frame()

corpus = Corpus(VectorSource(ab2))
myCorpus = VCorpus(VectorSource(ab2))
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removeNumbers)
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(myCorpus, removeWords, stopwords 
                   ("english"),lazy=TRUE) 
myCorpus <- tm_map(myCorpus, stripWhitespace, lazy=TRUE)

# create term-document matrix
dtm <- DocumentTermMatrix(myCorpus,control = 
                            list(wordLengths=c(3, 20)))
tdm <- TermDocumentMatrix(myCorpus,control = 
                            list(wordLengths=c(3, 20)))

#convert to matrix
tdm = as.matrix(tdm)


#ensure each document has at least one non-zero entry
rowTotals <- apply(dtm , 1, sum) #Find the sum of words in each Document
dtm1   <- dtm[rowTotals> 0, ]  

#create tfidf using slam library
term_tfidf <- tapply(dtm$v / row_sums(dtm)[dtm$i], dtm$j, mean) * 
  log2(nDocs(dtm) / col_sums(dtm > 0))

#remove frequent words
dtm1 <-dtm1[,term_tfidf>=median(term_tfidf)] 
#dtm <-dtm[,term_tfidf>=0.0015]

12.5.2 Latent Dirichlet Allocation

Probabilistic topic modeling is a machine learning method that generates topics or discovers themes among a collection of documents. This step is performed using the Latent Dirichlet Allocation (LDA) algorithm via the topicmodels package in R. An issue with topic modeling is that the number of topics is not known in advance. It can be estimated empirically, for example by examining the harmonic mean of the log likelihood of P(topics | documents) for models fitted with different numbers of topics.

#find k
harmonicMean <- function(logLikelihoods, precision=2000L) {
  library("Rmpfr")
  llMed <- median(logLikelihoods)
  as.double(llMed - log(mean(exp(-mpfr(logLikelihoods,
                                       prec = precision) + llMed))))
}

Determine the optimal number of topics based on harmonic means.

## estimate k
k = 20
burnin = 1000
iter = 1000
keep=50
fitted <- LDA(dtm1, k = k, 
              method = "Gibbs",control = list(burnin = burnin, 
                  iter = iter, keep = keep) )

# where keep indicates that every keep iteration the log-likelihood is evaluated and stored. This returns all log-likelihood values including burnin, i.e., these need to be omitted before calculating the harmonic mean:
logLiks <- fitted@logLiks[-c(1:(burnin/keep))]

# assuming that burnin is a multiple of keep
harmonicMean(logLiks)
[1] -320584.7
# generate numerous topic models with different numbers of topics
sequ <- seq(5, 50, 5) # in this case a sequence of numbers from 5 to 50, by 5.
fitted_many <- lapply(sequ, function(k) LDA(dtm1, k = k, method = "Gibbs",
        control = list(burnin = burnin, iter = iter, keep = keep) ))
# extract logliks from each topic
logLiks_many <- lapply(fitted_many, function(L)  L@logLiks[-c(1:(burnin/keep))])

# compute harmonic means
hm_many <- sapply(logLiks_many, function(h) harmonicMean(h))

# inspect
plot(sequ, hm_many, type = "l")

# compute optimum number of topics
sequ[which.max(hm_many)]
[1] 5

The previous analysis estimates the optimal number of topics (5 in this run). LDA is performed with this number of topics.

#perform LDA
lda_EHR <- LDA(dtm1, k = sequ[which.max(hm_many)], 
         method="Gibbs", 
         control=list(seed=1234,burnin=1000,thin=100,iter=1000))

#extract topics terms and beta weights
EHR_topics <- tidy(lda_EHR, matrix = "beta")

#view data by topics
EHR_top_terms <- EHR_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>% 
  ungroup() %>%
  arrange(topic, -beta)

EHR_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

Compare differences in words between topics.

beta_wide <- EHR_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  pivot_wider(names_from = topic, values_from = beta) %>% 
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_wide
# A tibble: 450 × 7
   term          topic1     topic2    topic3    topic4    topic5 log_ratio
   <chr>          <dbl>      <dbl>     <dbl>     <dbl>     <dbl>     <dbl>
 1 access     0.00386   0.00000374 0.0000212 0.0000185 0.000212     -10.0 
 2 accuracy   0.0000202 0.00281    0.00257   0.0000185 0.000981       7.12
 3 accurately 0.00244   0.00000374 0.0000212 0.0000185 0.000212      -9.35
 4 achieved   0.0000202 0.00154    0.0000212 0.0000185 0.000212       6.25
 5 achieves   0.00103   0.00000374 0.000446  0.0000185 0.0000192     -8.10
 6 across     0.0000202 0.00199    0.0000212 0.0000185 0.0000192      6.62
 7 address    0.00486   0.0000412  0.0000212 0.0000185 0.0000192     -6.88
 8 adoption   0.00143   0.00000374 0.0000212 0.0000185 0.000597      -8.58
 9 adrs       0.00163   0.00000374 0.0000212 0.0000185 0.000597      -8.77
10 adult      0.00143   0.00000374 0.0000212 0.0000185 0.0000192     -8.58
# ℹ 440 more rows

12.5.3 NMF for topic model

In the previous chapter, NMF was used as a method to cluster data. Here, it can be framed as a method for topic modeling. NMF is analogous to multinomial PCA or probabilistic latent semantic analysis.

library(NMF)
res <- nmf(as.matrix(tdm), sequ[which.max(hm_many)], "lee") 
Warning in validityMethod(object): Dimensions of W and H look strange [ncol(W)=
5 > ncol(H)= 1 ]
Warning in validityMethod(object): Dimensions of W and H look strange [ncol(W)=
5 > ncol(H)= 1 ]

Warning in validityMethod(object): Dimensions of W and H look strange [ncol(W)=
5 > ncol(H)= 1 ]
w <- basis(res@fit)

Turn the basis matrix into a data frame and order words by their total weight across the topics.

df<-as.data.frame(w)
df$total <- rowSums(df)
df$word<-rownames(df)
colnames(df) <- c("Topic1","Topic2","Topic3","Topic4",
                  "Topic5","total","word")
df <-df[order(-df$total),] 
head(df,10)
                  Topic1       Topic2       Topic3       Topic4      Topic5
data        0.0159593720 0.0282241995 0.0100832967 0.0261308474 0.009280678
health      0.0009194075 0.0181108217 0.0043249907 0.0200148184 0.003487443
clinical    0.0112117568 0.0017558221 0.0179071444 0.0119635890 0.001939906
records     0.0062592077 0.0068971277 0.0004992859 0.0116797218 0.015787157
patients    0.0135102987 0.0172660375 0.0015100847 0.0038115459 0.001999944
medical     0.0087560848 0.0036455271 0.0118192185 0.0005089656 0.009446978
using       0.0002018923 0.0073490332 0.0127421083 0.0062441034 0.007323748
electronic  0.0107998221 0.0004045812 0.0076442762 0.0063438294 0.005712474
model       0.0005598215 0.0010319464 0.0055814410 0.0153828778 0.008074580
information 0.0036473004 0.0061528286 0.0052446082 0.0074690427 0.006044581
                 total        word
data        0.08967839        data
health      0.04685748      health
clinical    0.04477822    clinical
records     0.04112250     records
patients    0.03809791    patients
medical     0.03417677     medical
using       0.03386089       using
electronic  0.03090498  electronic
model       0.03063067       model
information 0.02855836 information

Order frequent words in the topics

wordMatrix = as.data.frame(w)
wordMatrix$word<-rownames(wordMatrix)
colnames(wordMatrix) <- c("Topic1","Topic2","Topic3",
                          "Topic4","Topic5","word")


# Topic 1
newdata <-wordMatrix[order(-wordMatrix$Topic1),] 
head(newdata)
                Topic1       Topic2      Topic3       Topic4      Topic5
data       0.015959372 0.0282241995 0.010083297 0.0261308474 0.009280678
patients   0.013510299 0.0172660375 0.001510085 0.0038115459 0.001999944
clinical   0.011211757 0.0017558221 0.017907144 0.0119635890 0.001939906
electronic 0.010799822 0.0004045812 0.007644276 0.0063438294 0.005712474
medical    0.008756085 0.0036455271 0.011819218 0.0005089656 0.009446978
based      0.006544277 0.0001106726 0.001261345 0.0035722560 0.007048943
                 word
data             data
patients     patients
clinical     clinical
electronic electronic
medical       medical
based           based

Plot topic 1

d <- newdata

#build a data frame of the top 10 words for topic 1, keeping Frequency numeric
df <- data.frame(Word = d$word[1:10],
                 Frequency = as.numeric(d$Topic1[1:10]))

# for ggplot to understand the order of words, you need to specify factor order
df$Word <- factor(df$Word, levels = df$Word[order(df$Frequency)])

ggplot(df, aes(x=Word, y=Frequency)) + 
  geom_bar(stat="identity", fill="lightgreen", color="grey50")+
  coord_flip()+
  ggtitle("Topic 1")

12.6 Word embedding

Word embedding refers to methods that represent words as numeric vectors, allowing similarities among words in a corpus to be identified based on co-occurrence. This can be performed using word2vec or the widyr package.
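
As a minimal sketch with the word2vec package (an assumption; the remainder of this section uses widyr), a skip-gram model can be fitted directly on the abstract text:

library(word2vec)

#fit a small skip-gram model on the abstracts (ab$text from earlier)
set.seed(1234)
w2v_model <- word2vec(x = tolower(ab$text), type = "skip-gram", dim = 50, iter = 10)

#find the words with embeddings closest to "prediction"
predict(w2v_model, newdata = "prediction", type = "nearest", top_n = 5)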

library(widyr)

#create a document id for each abstract
ab$postID<-row.names(ab)

skipgrams <- ab %>%
  unnest_tokens(ngram, text, token = "ngrams", n = 8) %>%
  mutate(ngramID = row_number()) %>% 
    tidyr::unite(skipgramID, postID, ngramID) %>%
    unnest_tokens(word, ngram)
Warning: Outer names are only allowed for unnamed scalar atomic inputs
  #mutate(ngram = gsub("[^A-Za-z ]","", ngram)) %>% 
  #unnest_tokens(word, ngram)
  
#calculate unigram probabilities (used to normalize skipgram probabilities later)
unigram_probs <- ab %>%  unnest_tokens(word, text) %>%
    count(word, sort = TRUE) %>%
    mutate(p = n / sum(n))
Warning: Outer names are only allowed for unnamed scalar atomic inputs
#calculate probabilities
skipgram_probs <- skipgrams %>%
    pairwise_count(word, skipgramID, diag = TRUE, sort = TRUE) %>%
    mutate(p = n / sum(n))

#normalize probabilities
normalized_prob <- skipgram_probs %>%
    filter(n > 20) %>%
    rename(word1 = item1, word2 = item2) %>%
    left_join(unigram_probs %>%
                  select(word1 = word, p1 = p),
              by = "word1") %>%
    left_join(unigram_probs %>%
                  select(word2 = word, p2 = p),
              by = "word2") %>%
    mutate(p_together = p / p1 / p2)

normalized_prob[2005:2010,]
# A tibble: 6 × 7
  word1 word2          n         p      p1       p2 p_together
  <chr> <chr>      <dbl>     <dbl>   <dbl>    <dbl>      <dbl>
1 in    model        148 0.0000341 0.0190  0.00295       0.607
2 of    within       148 0.0000341 0.0377  0.000759      1.19 
3 to    results      148 0.0000341 0.0246  0.00233       0.594
4 the   prediction   148 0.0000341 0.0468  0.000966      0.755
5 as    clinical     148 0.0000341 0.00497 0.00634       1.08 
6 is    there        148 0.0000341 0.00744 0.000870      5.27 

Explore the word “prediction”.

normalized_prob %>% 
    filter(word1 == "prediction") %>%
    arrange(-p_together)
# A tibble: 31 × 7
   word1      word2          n          p       p1       p2 p_together
   <chr>      <chr>      <dbl>      <dbl>    <dbl>    <dbl>      <dbl>
 1 prediction prediction   541 0.000125   0.000966 0.000966     134.  
 2 prediction aki           34 0.00000784 0.000966 0.000124      65.3 
 3 prediction bg            41 0.00000945 0.000966 0.000235      41.7 
 4 prediction risk          87 0.0000201  0.000966 0.00210        9.89
 5 prediction tasks         22 0.00000507 0.000966 0.000607       8.64
 6 prediction algorithms    26 0.00000599 0.000966 0.000759       8.17
 7 prediction models        78 0.0000180  0.000966 0.00242        7.70
 8 prediction disease       43 0.00000991 0.000966 0.00171        5.99
 9 prediction different     26 0.00000599 0.000966 0.00141        4.41
10 prediction model         51 0.0000118  0.000966 0.00295        4.12
# ℹ 21 more rows

Plot the words along the first two dimensions obtained from singular value decomposition (SVD).

pmi_matrix <- normalized_prob %>%
    mutate(pmi = log10(p_together)) %>%
    cast_sparse(word1, word2, pmi)

#remove missing data
pmi_matrix@x[is.na(pmi_matrix@x)] <- 0

#library for svd and pca
library(irlba)

#run SVD
pmi_svd <- svd(pmi_matrix)
#pmi_svd <- irlba(pmi_matrix, 256, maxit = 500)

#next we output the word vectors:
word_vectors <- pmi_svd$u
rownames(word_vectors) <- rownames(pmi_matrix)

library(broom)
#rank every word by its similarity (dot product) to a selected word vector
search_synonyms <- function(word_vectors, selected_vector) {

    similarities <- word_vectors %*% selected_vector %>%
        tidy() %>%
        as_tibble() %>%
        rename(token = .rownames,
               similarity = unrowname.x.)

    similarities %>%
        arrange(-similarity)    
}


#pres_synonym <- search_synonyms(word_vectors,word_vectors["prediction",])

#grab a subset of word vectors (rows 200 to 300)
forplot<-as.data.frame(word_vectors[200:300,])
forplot$word<-rownames(forplot)

#now plot
library(ggplot2)
ggplot(forplot, aes(x=V1, y=V2, label=word))+
  geom_text(aes(label=word),hjust=0, vjust=0, color="blue")+
  theme_minimal()+
  xlab("First Dimension Created by SVD")+
  ylab("Second Dimension Created by SVD")

12.7 Deep Learning for text

12.7.1 Transformers

We can use the reticulate package to install the transformers library from Hugging Face. The example below uses the DistilBERT model from Hugging Face. For this Python work, a virtual environment has been created.

##########
#the following installation files have been commented out
#the files are stored under huggingfaceR envs

library(reticulate)
#devtools::install_github("farach/huggingfaceR")
#hf_python_depends('transformers') 

##########
#source huggingfaceR/bin/activate

library(huggingfaceR)

distilBERT <- hf_load_pipeline(
  model_id = "distilbert-base-uncased-finetuned-sst-2-english", 
  task = "text-classification"
  )
#> 
#> 
#> distilbert-base-uncased-finetuned-sst-2-english is ready for text-classification

distilBERT
#> <transformers.pipelines.text_classification.TextClassificationPipeline object at 0x000001D0A8F71510>