library(dplyr)      # %>%, mutate, select, count (readr, here, tibble, reshape2 are called with ::)
library(tidytext)   # unnest_tokens, stop_words, get_sentiments
library(ggplot2)    # bar charts and violin plots
library(wordcloud)  # comparison.cloud
library(coefplot)   # coefficient plot for the lm fit

# data is stored as tab delimited
reviews <- readr::read_tsv(here::here('data', 'pizza_reviews.tsv')) %>% 
  # convert the timestamp to a Date so daily trends are easier to work with
  mutate(date=as.Date(time_created)) %>% 
  select(text, date, user.name, Name, rating)
Parsed with column specification:
cols(
  ID = col_double(),
  id = col_character(),
  url = col_character(),
  text = col_character(),
  rating = col_double(),
  time_created = col_datetime(format = ""),
  user.id = col_character(),
  user.profile_url = col_character(),
  user.image_url = col_character(),
  user.name = col_character(),
  Name = col_character()
)

Tokenize

An example sentence to tokenize: "Ron Hextall was my model goalie in the nineties."

ourSentence <- tibble::tibble(text='Ron Hextall was my model goalie in the nineties')
sentenceTokens <- ourSentence %>% unnest_tokens(output=word, input=text)
sentenceTokens
# A tibble: 9 x 1
  word    
  <chr>   
1 ron     
2 hextall 
3 was     
4 my      
5 model   
6 goalie  
7 in      
8 the     
9 nineties

sentenceTokens %>% 
  anti_join(stop_words, by='word')
# A tibble: 5 x 1
  word    
  <chr>   
1 ron     
2 hextall 
3 model   
4 goalie  
5 nineties
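
unnest_tokens is not limited to single words. As a minimal sketch (not part of the original walkthrough), the same sentence can be split into bigrams with the token and n arguments:

# split the example sentence into two-word phrases instead of single words
ourSentence %>% 
  unnest_tokens(output=bigram, input=text, token='ngrams', n=2)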

review_tokens <- reviews %>% 
  unnest_tokens(output=word, input=text) %>% 
  anti_join(stop_words, by='word') %>% 
  filter(word != 'pizza')
review_tokens
# A tibble: 1,432 x 5
   date       user.name Name            rating word    
   <date>     <chr>     <chr>            <dbl> <chr>   
 1 2019-02-09 Jeff B.   Juliana's Pizza      5 love    
 2 2019-02-09 Jeff B.   Juliana's Pizza      5 coal    
 3 2019-02-09 Jeff B.   Juliana's Pizza      5 fired   
 4 2019-02-09 Jeff B.   Juliana's Pizza      5 gem     
 5 2019-02-09 Jeff B.   Juliana's Pizza      5 watching
 6 2019-02-09 Jeff B.   Juliana's Pizza      5 sunset  
 7 2019-02-09 Jeff B.   Juliana's Pizza      5 dumbo   
 8 2019-02-09 Jeff B.   Juliana's Pizza      5 dinner  
 9 2019-02-09 Jeff B.   Juliana's Pizza      5 time    
10 2019-02-09 Jeff B.   Juliana's Pizza      5 wait    
# ... with 1,422 more rows
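
As a quick sanity check (a sketch, not part of the original pipeline), both of these should return zero rows if the anti_join and the filter did their jobs:

# stop words and the word 'pizza' should both be gone
review_tokens %>% filter(word == 'pizza')
review_tokens %>% semi_join(stop_words, by='word')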

Word Counts

review_tokens %>% 
  count(word, sort=TRUE)
# A tibble: 781 x 2
   word          n
   <chr>     <int>
 1 slice        24
 2 ny           17
 3 time         15
 4 nyc          13
 5 crust        11
 6 food         11
 7 amazing      10
 8 cheese       10
 9 nice         10
10 pepperoni    10
# ... with 771 more rows

review_tokens %>% 
  count(word, sort=TRUE) %>% 
  filter(n >= 10) %>% 
  mutate(word=reorder(word, n)) %>% 
  ggplot(aes(x=word, y=n)) +
  geom_col() + 
  xlab(NULL) + 
  coord_flip()
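
The same counting idea extends to individual pizzerias. A minimal sketch (not in the original analysis), assuming Name identifies the pizzeria as in the output above; ties can push a group past five rows:

# five most frequent words at each pizzeria
review_tokens %>% 
  count(Name, word, sort=TRUE) %>% 
  group_by(Name) %>% 
  top_n(5, n) %>% 
  ungroup()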

Sentiments

get_sentiments('nrc')
# A tibble: 13,901 x 2
   word        sentiment
   <chr>       <chr>    
 1 abacus      trust    
 2 abandon     fear     
 3 abandon     negative 
 4 abandon     sadness  
 5 abandoned   anger    
 6 abandoned   fear     
 7 abandoned   negative 
 8 abandoned   sadness  
 9 abandonment anger    
10 abandonment fear     
# ... with 13,891 more rows

get_sentiments('bing')
# A tibble: 6,788 x 2
   word        sentiment
   <chr>       <chr>    
 1 2-faced     negative 
 2 2-faces     negative 
 3 a+          positive 
 4 abnormal    negative 
 5 abolish     negative 
 6 abominable  negative 
 7 abominably  negative 
 8 abominate   negative 
 9 abomination negative 
10 abort       negative 
# ... with 6,778 more rows

get_sentiments('afinn')
# A tibble: 2,476 x 2
   word       score
   <chr>      <int>
 1 abandon       -2
 2 abandoned     -2
 3 abandons      -2
 4 abducted      -2
 5 abduction     -2
 6 abductions    -2
 7 abhor         -3
 8 abhorred      -3
 9 abhorrent     -3
10 abhors        -3
# ... with 2,466 more rows

get_sentiments('loughran')
# A tibble: 4,149 x 2
   word         sentiment
   <chr>        <chr>    
 1 abandon      negative 
 2 abandoned    negative 
 3 abandoning   negative 
 4 abandonment  negative 
 5 abandonments negative 
 6 abandons     negative 
 7 abdicated    negative 
 8 abdicates    negative 
 9 abdicating   negative 
10 abdication   negative 
# ... with 4,139 more rows
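
The analysis below sticks with bing, which is simply positive/negative. As an alternative sketch, afinn's integer scores can be summed into a per-review score; the column is called score here, matching the output above, though newer tidytext/textdata releases rename it to value:

# signed sentiment score per review using the afinn lexicon
review_tokens %>% 
  inner_join(get_sentiments('afinn'), by='word') %>% 
  group_by(Name, user.name) %>% 
  summarize(review_score=sum(score)) %>% 
  ungroup()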

review_tokens <- review_tokens %>% 
  inner_join(get_sentiments('bing'), by='word')
review_tokens
# A tibble: 236 x 6
   date       user.name   Name                rating word       sentiment
   <date>     <chr>       <chr>                <dbl> <chr>      <chr>    
 1 2019-02-09 Jeff B.     Juliana's Pizza          5 love       positive 
 2 2019-02-09 Jeff B.     Juliana's Pizza          5 gem        positive 
 3 2019-02-01 Tiffany H.  Juliana's Pizza          4 famous     positive 
 4 2019-01-22 Princess P. Juliana's Pizza          5 worth      positive 
 5 2019-02-10 Jeff B.     Lombardi's Pizza         5 worth      positive 
 6 2019-02-20 Eemi M.     Lombardi's Pizza         3 fast       positive 
 7 2019-02-20 Eemi M.     Lombardi's Pizza         3 reputation positive 
 8 2019-02-25 Rits M.     Prince Street Pizza      5 easy       positive 
 9 2019-02-17 Paul R.     Lucali                   5 celebrate  positive 
10 2019-02-17 Paul R.     Lucali                   5 pretty     positive 
# ... with 226 more rows

ggplot(review_tokens, aes(x=sentiment)) + 
  geom_bar(aes(fill=sentiment), show.legend=FALSE)
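
A variant of that chart splits the positive/negative share out by pizzeria; a sketch using ggplot2's position='fill' to normalize each bar:

# share of positive vs negative sentiment words within each pizzeria
ggplot(review_tokens, aes(x=Name, fill=sentiment)) + 
  geom_bar(position='fill') + 
  coord_flip() + 
  ylab('share of sentiment words')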

review_tokens %>% 
  count(word, sentiment, sort=TRUE) %>% 
  group_by(sentiment) %>% 
  top_n(5) %>% 
  ungroup() %>% 
  mutate(word=reorder(word, n)) %>% 
  ggplot(aes(x=word, y=n, fill=sentiment)) +
  geom_col(show.legend=FALSE) +
  facet_wrap(~sentiment, scales='free_y') + 
  coord_flip()

pos_neg <- review_tokens %>% 
  count(Name, sentiment) %>% 
  group_by(Name) %>% 
  # get percent of positive/negative at each place
  mutate(percent=n/sum(n)) %>% 
  # greater percentage wins
  slice(which.max(percent)) %>% 
  ungroup()

# average rating of each place
avg_rating <- review_tokens %>% 
  distinct(Name, user.name, .keep_all=TRUE) %>% 
  group_by(Name) %>% 
  summarize(Rating=mean(rating))

# join together
rating_sentiment <- pos_neg %>% 
  inner_join(avg_rating, by='Name')

ggplot(rating_sentiment, aes(x=sentiment, fill=sentiment)) + 
  geom_bar(show.legend=FALSE)

rating_sentiment %>% 
  ggplot(aes(x=sentiment, y=Rating, fill=sentiment)) + 
  geom_violin(show.legend=FALSE) + 
  geom_jitter(show.legend=FALSE)

rating_vs_sentiment <- lm(Rating ~ sentiment, data=rating_sentiment)
coefplot(rating_vs_sentiment, sort='magnitude')
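
For the numbers behind the coefficient plot, summary() on the fitted model shows the estimate for the positive level; with a single two-level predictor this is just the difference in mean rating between positive-leaning and negative-leaning places.

# coefficient table for the sentiment effect
summary(rating_vs_sentiment)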

Word Clouds

review_tokens %>% 
  count(word, sentiment, sort=TRUE) %>% 
  reshape2::acast(word ~ sentiment, value.var='n', fill=0) %>% 
  comparison.cloud(colors=c('gray80', 'gray20'), max.words=100)
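
For a cloud that ignores the positive/negative split, the plain wordcloud() function from the same package works on the raw counts; a minimal sketch:

# sentiment-bearing words sized by frequency, no positive/negative split
review_tokens %>% 
  count(word) %>% 
  with(wordcloud(word, n, max.words=100))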

Resources

  • Ticket Pricing
  • Marketing
  • Point-of-Sale Analytics
  • Sales Forecasting
  • Automated Survey Analysis
  • Automated Report Generation

Thank You