# Splitting out hashtags
# Feb 25 2019
#
# Lesson script: load a table of tweets, clean the list-style `hashtags`
# column into individual hashtags, total them case-insensitively, export the
# totals, and pull out the tweets that mention #abolishICE.

# install.packages("kableExtra")
library(dplyr)
library(tidyr)
library(knitr)
library(kableExtra)

# Load the AOC tweets ----
AOC <- rio::import("./Data/AOC.csv")
AOC <- janitor::clean_names(AOC)
colnames(AOC)
str(AOC)

#------------------------------------------------#
# CREATE SMALLER TABLE WITH HASHTAGS AND URLS
#------------------------------------------------#

# Create a subset
AOC2 <- select(
  AOC,
  user_id, created_at, text, is_retweet, hashtags, urls_expanded_url
)
View(AOC2)
colnames(AOC2)
# Why are we doing this?

# Total the hashtag column
hashtags <- AOC %>%
  count(hashtags, sort = TRUE)
View(hashtags)
# We have a problem! Cells holding several hashtags are counted as one value.

# Data cleaning - split the hashtags out ----
# Consulted this tutorial:
# https://trendct.org/2015/06/12/r-for-beginners-how-to-transition-from-excel-to-r/#chapterTOC8

# Step 1 - Delete stray characters, c( "" ) etc.
# Put the result in a new column called hashtag1.
# Multi-hashtag cells presumably look like c("a", "b"); the quotes may arrive
# doubled depending on how the CSV reader handled escaping -- verify with
# View(AOC2). The leading "c(" is anchored to the start of the cell so that a
# hashtag ending in the letter "c" (e.g. "nyc") is not truncated.
AOC2$hashtag1 <- sub("^c\\(", "", AOC2$hashtags)
AOC2$hashtag1 <- gsub("[()\"]", "", AOC2$hashtag1)

# Step 2 - Separate hashtag1, which has the comma delimiters, into ten new
# columns. The separator swallows the space after each comma so no hashtag
# keeps a leading blank. fill = "right" pads tweets with fewer than ten
# hashtags with NA without a warning; tweets with more than ten still warn.
hashtag_cols <- paste0("hashtag", seq_len(10))
AOC3 <- separate(
  AOC2,
  hashtag1,
  into = hashtag_cols,
  sep = ",\\s*",
  remove = TRUE,
  fill = "right"
)
View(AOC3)

# Table with only hashtags
AOC4 <- select(AOC3, hashtag1:hashtag10)
count <- table(unlist(AOC4))
hashtag2 <- as.data.frame(count, stringsAsFactors = FALSE)

# Rename columns
colnames(hashtag2) <- c("hashtag", "count")

# Total by column, summarize by string (case-insensitive).
# Blank entries are dropped so they do not show up as a hashtag.
df7 <- hashtag2 %>%
  mutate(hashtag = tolower(trimws(hashtag))) %>%
  filter(hashtag != "") %>%
  group_by(hashtag) %>%
  summarise(Count = n(), ScoreSum = sum(count)) %>%
  ungroup()

hashtags3 <- df7 %>%
  select(hashtag, ScoreSum) %>%
  arrange(desc(ScoreSum))

# Export the cleaned totals (not the raw `hashtags` table from above).
write.csv(hashtags3, "AOC_hashtags.csv")

# Examine the table - what are the major narratives?

# Create a table with the original tweets by hashtag ----

# Filtered table with #abolishICE in the hashtag list (any capitalisation)
AOC_ICE <- AOC2 %>%
  filter(grepl("abolishICE", hashtag1, ignore.case = TRUE)) %>%
  select(created_at, text, hashtags, hashtag1)

# Filtered table with "abolishICE" / "abolish ICE" anywhere in the tweet text
AOC_ICE2 <- AOC2 %>%
  filter(grepl("abolish ?ICE", text, ignore.case = TRUE)) %>%
  select(created_at, text, hashtags, hashtag1)

# Put it in a readable format
# https://haozhu233.github.io/kableExtra/awesome_table_in_html.html
kable(AOC_ICE2)

# Better formatting into html. Bonus - you see emojis.
AOC_ICE2 %>%
  kable() %>%
  kable_styling()

# abolishICE
# Look up AP on ProQuest in NYT when that narrative started
# Was Ocasio-Cortez ahead or behind the news coverage?
# pub.Exact("New York Times") AND abolishICE

# end of lesson for Wednesday