# Text Searching in Twitter
# Rob Wells
# Feb 17 2019
#
# Credit where it is due:
# Thanks to https://awakeningdatascientist.wordpress.com/2015/07/20/r-of-the-day-grep-and-grepl/
# Thanks to https://haozhu233.github.io/kableExtra/awesome_table_in_html.html

# Packages ----
# dplyr supplies %>%, select() and filter(), which are used from the very
# first transformation below, so it must be attached before anything else.
# kableExtra is installed only when it is missing rather than on every run.
if (!requireNamespace("kableExtra", quietly = TRUE)) {
  install.packages("kableExtra")
}
library(dplyr)
library(knitr)
library(kableExtra)

# Load the AOC tweets ----
AOC <- rio::import("./Data/AOC.csv")
AOC <- janitor::clean_names(AOC)
colnames(AOC)
str(AOC)

#--------------------------------------------------------------------#
#                         TEXT SEARCHING                             #
#--------------------------------------------------------------------#

# Let's focus on the text and time
AOC1 <- AOC %>%
  select(created_at, text)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(AOC1)
}
# What is wrong with this picture?

# Put it in a readable format
# https://haozhu233.github.io/kableExtra/awesome_table_in_html.html
kable(AOC1)

# Better formatting into html. Bonus - you see emojis.
AOC1 %>%
  kable() %>%
  kable_styling()
# You can now see the tweets displayed nicely on the viewer
# Export: Viewer | Export | Save as Web Page
# Open File In Web Browser and read the Tweets

# Find instances of Trump in the Tweets ----

# Primitive Method of Filtering
Test <- filter(AOC, grepl("Trump", text))
Test <- Test %>%
  select(created_at, text)

# Filtered Table with 62 instances of Trump in Tweets
AOC_Trump <- filter(AOC, grepl("Trump", text)) %>%
  select(created_at, text)

# Use | as OR statement in text searches - 73 instances
AOC_Trump2 <- filter(AOC, grepl("Trump|green", text)) %>%
  select(created_at, text)

# Let's Look at GREP
??grepl
# Pattern Matching and Replacement
# grep, grepl, regexpr, gregexpr and regexec search for matches to argument
# pattern within each element of a character vector: they differ in the
# format of and amount of detail in the results.
# grepl(pattern, x, ignore.case = FALSE, perl = FALSE,
#       fixed = FALSE, useBytes = FALSE)
# Another resource:
# https://www.rdocumentation.org/packages/base/versions/3.5.2/topics/grep

# The Power of Grep!
# Attach the packages this section uses (harmless if already attached).
library(dplyr)
library(knitr)
library(kableExtra)

# Or
# filter_for_value <- CO2[grep("non", CO2$Treatment), ]
# head(filter_for_value)
Test1 <- AOC[grep("Trump", AOC$text), ]
head(Test1)

# Filter data set based on values that do not match the specified pattern
# filter_for_not_a_value <- CO2[-(grep("non", CO2$Treatment)), ]
# A logical mask is used instead of AOC[-grep(...), ]: when nothing matches,
# grep() returns integer(0), and negative indexing with an empty vector
# selects zero rows rather than keeping them all.
NotTrump <- AOC[!grepl("Trump", AOC$text), ]
# Source: https://awakeningdatascientist.wordpress.com/2015/07/20/r-of-the-day-grep-and-grepl/

# Write your results into a table
write.csv(AOC_Trump, "AOC_Trump.csv")

# Look at your results in html. Bonus - you see emojis.
AOC_Trump %>%
  kable() %>%
  kable_styling()

#------------------------------------------------------------------------------#
#------------This Section Involves Processing the Twitter Data ----------------#
#------------------------------------------------------------------------------#

# Create a subset
AOC2 <- select(
  AOC,
  user_id, created_at, text, is_retweet, hashtags, urls_expanded_url
)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(AOC2)
}

# Filters to Just Retweets
# Comparing against the string "TRUE" matches both a logical TRUE (coerced to
# "TRUE" for the comparison) and a character "TRUE".
AOC_retweets <- AOC2 %>%
  select(user_id, is_retweet, hashtags) %>%
  filter(is_retweet == "TRUE")

# Create a table with the frequency of each hashtags value among the retweets.
# This must be a plain "<-": "<--" parses as "<- -table(...)" and would
# negate every count.
Hashtags <- table(AOC_retweets$hashtags)
if (interactive()) {
  View(Hashtags)
}

# Write to a CSV
# Output this file to a CSV
write.csv(Hashtags, "Hashtags.csv")

# Who is engaging with AOC?
# Counts rows per expanded URL and keeps the six most frequent.
df3 <- AOC2 %>%
  group_by(urls_expanded_url) %>%
  summarize(total = n()) %>%
  arrange(desc(total)) %>%
  head()

#---------- ANALYZE WORDS ---------------#
# from book: https://www.tidytextmining.com/tidytext.html
#
# from tutorial: http://varianceexplained.org/r/trump-tweets/
# using tidytext: https://cran.r-project.org/web/packages/tidytext/
# https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html
# https://cran.r-project.org/web/packages/tidytext/tidytext.pdf

# Comparison of words
# Now that we're sure there's a difference between these two accounts, what
# can we say about the difference in the content?
# We'll use the tidytext package that Julia Silge and I developed.
# We start by dividing into individual words using the unnest_tokens function
# (see this vignette for more), and removing some common "stopwords":
# Sept. 28 Session
#
# install.packages("tidytext")
# tidyr is installed only when it is missing rather than on every run.
if (!requireNamespace("tidyr", quietly = TRUE)) {
  install.packages("tidyr")
}
library(tidytext)
library(tidyr)
library(dplyr)
# stringr provides str_detect() and str_replace_all() used below.
library(stringr)
library(ggplot2)

# Reference version from the tutorial (kept for comparison):
# reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
# tweet_words <- tweets %>%
#   filter(!str_detect(text, '^"')) %>%
#   mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&", "")) %>%
#   unnest_tokens(word, text, token = "regex", pattern = reg) %>%
#   filter(!word %in% stop_words$word,
#          str_detect(word, "[a-z]"))
# tweet_words
# android_iphone_ratios <- tweet_words %>%
#   count(word, source) %>%
#   filter(sum(n) >= 5) %>%
#   spread(source, n, fill = 0) %>%
#   ungroup() %>%
#   mutate_each(funs((. + 1) / sum(. + 1)), -word) %>%
#   mutate(logratio = log2(Android / iPhone)) %>%
#   arrange(desc(logratio))

# Tokenise the tweets ----
# Splitting pattern passed to unnest_tokens(): any character outside
# letters, digits, #, @ and ', or an apostrophe not followed by one of those.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"

# Drops tweets whose text starts with a double quote, strips t.co links and
# "&", splits into words, then removes stop words and tokens with no
# lower-case letter.
# NOTE(review): the "&" alternative may originally have been the HTML entity
# "&amp;" before this file was extracted - confirm against the tutorial.
tweet_words <- AOC %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(
    !word %in% stop_words$word,
    str_detect(word, "[a-z]")
  )
tweet_words

# Word counts split by source, one column per source.
# count() returns an ungrouped result in current dplyr, so group_by(word) is
# needed for sum(n) to mean "total uses of this word"; without it the test
# compares the grand total of all words against 5.
# Stored under its own name so the next step does not overwrite it.
CommonAOCWordsBySource <- tweet_words %>%
  count(word, source) %>%
  group_by(word) %>%
  filter(sum(n) >= 5) %>%
  ungroup() %>%
  pivot_wider(
    names_from = source,
    values_from = n,
    values_fill = 0L,
    names_sort = TRUE
  )

# Don't count as separate devices
# Removes source from table; keeps words used at least 5 times.
CommonAOCWords <- tweet_words %>%
  count(word) %>%
  filter(n >= 5)

# Write to CSV
write.csv(CommonAOCWords, "CommonAOCWord.csv")

# Filter to Words With More than 100 Occurrences
CommonWord <- CommonAOCWords %>%
  filter(n >= 100) %>%
  group_by(word, n) %>%
  arrange(desc(n))

# Same table but not in a grouped df - do this for ggplot below
CommonWord1 <- filter(CommonAOCWords, n >= 100)

# Convert to data frame
CommonWords <- as.data.frame(CommonAOCWords)
head(CommonWords)

# Visualize it!
# Do the Katie Serrano special and add some killer colors
# Attach the packages this section uses (harmless if already attached).
library(ggplot2)
library(stringr)

# Horizontal bar chart of the words in CommonWord, one bar per word, with
# bar length equal to the count n. Bars are ordered by count, and the legend
# is hidden because colour simply repeats the axis label.
CommonAOCChart <- ggplot(
  CommonWord,
  aes(x = reorder(word, -n), y = n, color = word, fill = word)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top Words In AOC Twitter Feed",
    subtitle = "AOC Twitter Feed",
    caption = "Graphic by Rob Wells",
    x = "Word",
    y = "Count of the word usage"
  ) +
  theme(legend.position = "none")
print(CommonAOCChart)

# For Later, Indexing
# Filters to Just Trump, Dumps into New Column
# Each row gets "Trump" when the tweet text contains it and NA otherwise.
# str_extract() returns a character vector directly; str_match() returns a
# matrix, which would be stored as a matrix column and need a follow-up
# as.character() to flatten it.
AOC$Trump <- str_extract(AOC$text, "Trump")