# Data is stored as a tab-delimited file.
reviews <- readr::read_tsv(here::here("data", "pizza_reviews.tsv")) %>%
  # Make a Date out of time_created for better use of daily trends.
  mutate(date = as.Date(time_created)) %>%
  # Keep only the review text, date, reviewer name, place name and rating.
  select(text, date, user.name, Name, rating)
Parsed with column specification: cols( ID = col_double(), id = col_character(), url = col_character(), text = col_character(), rating = col_double(), time_created = col_datetime(format = ""), user.id = col_character(), user.profile_url = col_character(), user.image_url = col_character(), user.name = col_character(), Name = col_character() )
Ron Hextall was my model goalie in the nineties
# One-row tibble holding the example sentence.
ourSentence <- tibble::tibble(
  text = "Ron Hextall was my model goalie in the nineties"
)

# Split the sentence into one word per row, stored in a `word` column.
sentenceTokens <- ourSentence %>%
  unnest_tokens(output = word, input = text)

# Print the tokens.
sentenceTokens
# A tibble: 9 x 1 word <chr> 1 ron 2 hextall 3 was 4 my 5 model 6 goalie 7 in 8 the 9 nineties
# Drop every token that appears in the stop_words table.
anti_join(sentenceTokens, stop_words, by = "word")
# A tibble: 5 x 1 word <chr> 1 ron 2 hextall 3 model 4 goalie 5 nineties
review_tokens <- reviews %>%
  # One row per word; the remaining review columns are carried along.
  unnest_tokens(output = word, input = text) %>%
  # Remove stop words.
  anti_join(stop_words, by = "word") %>%
  # Drop the word "pizza" itself.
  filter(word != "pizza")

# Print the tokenized reviews.
review_tokens
# A tibble: 1,432 x 5 date user.name Name rating word <date> <chr> <chr> <dbl> <chr> 1 2019-02-09 Jeff B. Juliana's Pizza 5 love 2 2019-02-09 Jeff B. Juliana's Pizza 5 coal 3 2019-02-09 Jeff B. Juliana's Pizza 5 fired 4 2019-02-09 Jeff B. Juliana's Pizza 5 gem 5 2019-02-09 Jeff B. Juliana's Pizza 5 watching 6 2019-02-09 Jeff B. Juliana's Pizza 5 sunset 7 2019-02-09 Jeff B. Juliana's Pizza 5 dumbo 8 2019-02-09 Jeff B. Juliana's Pizza 5 dinner 9 2019-02-09 Jeff B. Juliana's Pizza 5 time 10 2019-02-09 Jeff B. Juliana's Pizza 5 wait # ... with 1,422 more rows
# Frequency of each word across all reviews, most common first.
count(review_tokens, word, sort = TRUE)
# A tibble: 781 x 2 word n <chr> <int> 1 slice 24 2 ny 17 3 time 15 4 nyc 13 5 crust 11 6 food 11 7 amazing 10 8 cheese 10 9 nice 10 10 pepperoni 10 # ... with 771 more rows
# Horizontal bar chart of the words that occur at least 10 times.
review_tokens %>%
  count(word, sort = TRUE) %>%
  filter(n >= 10) %>%
  # Order the factor levels by count so the bars are sorted by frequency.
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_col() +
  # Suppress the axis title for the word axis.
  labs(x = NULL) +
  coord_flip()
# NRC lexicon: one row per (word, sentiment) pair.
get_sentiments(lexicon = "nrc")
# A tibble: 13,901 x 2 word sentiment <chr> <chr> 1 abacus trust 2 abandon fear 3 abandon negative 4 abandon sadness 5 abandoned anger 6 abandoned fear 7 abandoned negative 8 abandoned sadness 9 abandonment anger 10 abandonment fear # ... with 13,891 more rows
# Bing lexicon: each word is labelled with a sentiment such as
# positive or negative.
get_sentiments(lexicon = "bing")
# A tibble: 6,788 x 2 word sentiment <chr> <chr> 1 2-faced negative 2 2-faces negative 3 a+ positive 4 abnormal negative 5 abolish negative 6 abominable negative 7 abominably negative 8 abominate negative 9 abomination negative 10 abort negative # ... with 6,778 more rows
# AFINN lexicon: each word carries an integer sentiment score.
get_sentiments(lexicon = "afinn")
# A tibble: 2,476 x 2 word score <chr> <int> 1 abandon -2 2 abandoned -2 3 abandons -2 4 abducted -2 5 abduction -2 6 abductions -2 7 abhor -3 8 abhorred -3 9 abhorrent -3 10 abhors -3 # ... with 2,466 more rows
# Loughran lexicon: one row per (word, sentiment) pair.
get_sentiments(lexicon = "loughran")
# A tibble: 4,149 x 2 word sentiment <chr> <chr> 1 abandon negative 2 abandoned negative 3 abandoning negative 4 abandonment negative 5 abandonments negative 6 abandons negative 7 abdicated negative 8 abdicates negative 9 abdicating negative 10 abdication negative # ... with 4,139 more rows
# Keep only the tokens found in the Bing lexicon and attach each word's
# sentiment. This overwrites review_tokens, so tokens without a match are
# gone from here on.
review_tokens <- review_tokens %>%
  inner_join(get_sentiments("bing"), by = "word")

# Print the tokens with their sentiment.
review_tokens
# A tibble: 236 x 6 date user.name Name rating word sentiment <date> <chr> <chr> <dbl> <chr> <chr> 1 2019-02-09 Jeff B. Juliana's Pizza 5 love positive 2 2019-02-09 Jeff B. Juliana's Pizza 5 gem positive 3 2019-02-01 Tiffany H. Juliana's Pizza 4 famous positive 4 2019-01-22 Princess P. Juliana's Pizza 5 worth positive 5 2019-02-10 Jeff B. Lombardi's Pizza 5 worth positive 6 2019-02-20 Eemi M. Lombardi's Pizza 3 fast positive 7 2019-02-20 Eemi M. Lombardi's Pizza 3 reputation positive 8 2019-02-25 Rits M. Prince Street Pizza 5 easy positive 9 2019-02-17 Paul R. Lucali 5 celebrate positive 10 2019-02-17 Paul R. Lucali 5 pretty positive # ... with 226 more rows
# Bar chart of how many tokens fall under each sentiment.
review_tokens %>%
  ggplot(aes(x = sentiment, fill = sentiment)) +
  geom_bar(show.legend = FALSE)
# Most frequent words within each sentiment, one facet per sentiment.
review_tokens %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  # Rank explicitly by the count column instead of relying on top_n()'s
  # implicit "last column" default (which also emits a message).
  # Ties at the cutoff are kept, so a facet may show more than 5 words.
  top_n(5, wt = n) %>%
  ungroup() %>%
  # Order the factor levels by count so the bars are sorted by frequency.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  # Each facet gets its own word axis after the flip.
  facet_wrap(~sentiment, scales = "free_y") +
  coord_flip()
# Dominant sentiment of each place.
pos_neg <- review_tokens %>%
  count(Name, sentiment) %>%
  group_by(Name) %>%
  # Get percent of positive/negative at each place.
  mutate(percent = n / sum(n)) %>%
  # Greater percentage wins; which.max() keeps the first row on an exact tie.
  slice(which.max(percent)) %>%
  ungroup()

# Average rating of each place.
# NOTE(review): this is computed from review_tokens, so a review only
# contributes if at least one of its words is still present in that table.
# Confirm that is intended rather than averaging over `reviews`.
avg_rating <- review_tokens %>%
  # One row per place and reviewer, so a rating is not repeated per token.
  distinct(Name, user.name, .keep_all = TRUE) %>%
  group_by(Name) %>%
  summarize(Rating = mean(rating))

# Join together.
rating_sentiment <- pos_neg %>%
  inner_join(avg_rating, by = "Name")
# Bar chart of how many rows of rating_sentiment carry each sentiment.
rating_sentiment %>%
  ggplot(aes(x = sentiment)) +
  geom_bar(aes(fill = sentiment), show.legend = FALSE)
# Distribution of Rating by sentiment: violins with the individual
# points jittered on top.
ggplot(
  rating_sentiment,
  aes(x = sentiment, y = Rating, fill = sentiment)
) +
  geom_violin(show.legend = FALSE) +
  geom_jitter(show.legend = FALSE)
# Linear model of Rating on sentiment.
rating_vs_sentiment <- lm(Rating ~ sentiment, data = rating_sentiment)

# Plot the fitted coefficients, sorted by magnitude.
coefplot(rating_vs_sentiment, sort = "magnitude")
# Word cloud comparing word frequencies between sentiments.
review_tokens %>%
  count(word, sentiment, sort = TRUE) %>%
  # Reshape to a word-by-sentiment matrix of counts; missing pairs become 0.
  reshape2::acast(
    formula = word ~ sentiment,
    value.var = "n",
    fill = 0
  ) %>%
  # One color per matrix column -- presumably negative first, then positive;
  # verify against the column order of the matrix.
  comparison.cloud(
    max.words = 100,
    colors = c("gray80", "gray20")
  )
Thank You