├── .DS_Store ├── README.md ├── img_plots ├── .DS_Store ├── img1.png ├── img10.png ├── img10_es.png ├── img11.png ├── img11_es.png ├── img12.png ├── img12_es.png ├── img13.png ├── img13_es.png ├── img14.png ├── img14_es.png ├── img15.png ├── img16.png ├── img2.png ├── img3.png ├── img4.png ├── img5.png ├── img6.png ├── img7.png ├── img8.png └── img9.png └── main.R /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # r-google-search-history-analysis 2 | Explore your activity on Google with R: How to Analyze and Visualize Your Personal Data Search History Find out how and how much you have used the most popular search engine in the world, using a copy of your personal data. Medium article: https://cosmoduende.medium.com/explore-your-activity-on-google-with-r-how-to-analyze-and-visualize-your-search-history-1fb74e5fb2b6 3 | 4 | ### *Versión en español* 5 | Explora tu actividad en Google con R: Análisis y visualización de datos de tu historial de búsquedas. 
Descubre cómo y cuánto has usado el buscador más popular del mundo, usando una copia de tus datos personales. Artículo en Medium: https://cosmoduende.medium.com/explora-tu-actividad-en-google-con-r-an%C3%A1lisis-y-visualizaci%C3%B3n-de-datos-de-tu-historial-de-a25638881fbd 6 | 7 | #### *Project preview* 8 | https://www.youtube.com/watch?v=1MbXsX9vFzc 9 | -------------------------------------------------------------------------------- /img_plots/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/.DS_Store -------------------------------------------------------------------------------- /img_plots/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img1.png -------------------------------------------------------------------------------- /img_plots/img10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img10.png -------------------------------------------------------------------------------- /img_plots/img10_es.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img10_es.png -------------------------------------------------------------------------------- /img_plots/img11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img11.png 
-------------------------------------------------------------------------------- /img_plots/img11_es.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img11_es.png -------------------------------------------------------------------------------- /img_plots/img12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img12.png -------------------------------------------------------------------------------- /img_plots/img12_es.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img12_es.png -------------------------------------------------------------------------------- /img_plots/img13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img13.png -------------------------------------------------------------------------------- /img_plots/img13_es.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img13_es.png -------------------------------------------------------------------------------- /img_plots/img14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img14.png 
-------------------------------------------------------------------------------- /img_plots/img14_es.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img14_es.png -------------------------------------------------------------------------------- /img_plots/img15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img15.png -------------------------------------------------------------------------------- /img_plots/img16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img16.png -------------------------------------------------------------------------------- /img_plots/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img2.png -------------------------------------------------------------------------------- /img_plots/img3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img3.png -------------------------------------------------------------------------------- /img_plots/img4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img4.png 
-------------------------------------------------------------------------------- /img_plots/img5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img5.png -------------------------------------------------------------------------------- /img_plots/img6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img6.png -------------------------------------------------------------------------------- /img_plots/img7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img7.png -------------------------------------------------------------------------------- /img_plots/img8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img8.png -------------------------------------------------------------------------------- /img_plots/img9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img9.png -------------------------------------------------------------------------------- /main.R: --------------------------------------------------------------------------------
# REQUIRED LIBRARIES
library(wordcloud)
library(lubridate)
library(rvest)
library(tm)
library(tidyverse)
library(plotly)

# READ DATA
# Path of the search-activity HTML export, relative to the working directory.
fileHTML <- "Takeout/My Activity/Search/MyActivity.html"
mySearchFile <- read_html(fileHTML, encoding = "UTF-8")

# SELECT THE ACTIVITY NODES ONCE
# All three scrapers below read the same node set, so query it a single time
# and convert it to a character vector explicitly instead of relying on
# implicit coercion inside stringr.
activityNodes <- mySearchFile %>%
  html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
  as.character()

# SCRAPING SEARCH DATE AND TIME
# Keeps the text that follows a `<br>` tag and ends with the AM/PM marker,
# then parses it as month-day-year hour-minute-second.
# NOTE(review): the `<br>` literal was reconstructed after the tag had been
# stripped from this copy of the script -- confirm against a real export.
# NOTE(review): anything after AM/PM (e.g. a time-zone suffix) is not
# captured, so mdy_hms() applies its default time zone -- confirm this is
# acceptable for the hour-of-day plots.
dateSearch <- activityNodes %>%
  str_extract(pattern = "(?<=<br>)(.*)(?<=PM|AM)") %>%
  mdy_hms()
head(dateSearch, 5)

# SCRAPING SEARCH TEXT
# First pass keeps what lies between `<a` and `</a>`; second pass keeps the
# part after the `">` that closes the opening tag, i.e. the link text.
# NOTE(review): `<a` / `</a>` literals reconstructed as above -- confirm.
textSearch <- activityNodes %>%
  str_extract(pattern = '(?<=<a)(.*)(?=</a>)') %>%
  str_extract(pattern = '(?<=\">)(.*)')
head(textSearch, 5)

# SCRAPING SEARCH TYPE
# Takes the text between the body-1 cell opening and the first link, then
# keeps the first word that is followed by whitespace.
# NOTE(review): the `<a` literal in the look-ahead was reconstructed -- confirm.
searchType <- activityNodes %>%
  str_extract(pattern = "(?<=mdl-typography--body-1\">)(.*)(?=<a)") %>%
  str_extract(pattern = "(\\w+)(?=\\s)")
head(searchType, 5)

# CREATE DATA FRAME USING SCRAPED DATA
# Weekday names are derived from the numeric weekday (1 = Sunday with
# week_start = 7) rather than weekdays(), whose output depends on the session
# locale and would not match these English levels on a non-English system.
dayLevels <- c(
  "Sunday", "Monday", "Tuesday", "Wednesday",
  "Thursday", "Friday", "Saturday"
)

searchedData <- tibble(
  timestamp = dateSearch,
  date = as_date(dateSearch),
  year = year(dateSearch),
  month = month(dateSearch, label = TRUE),
  day = factor(dayLevels[wday(dateSearch, week_start = 7)], levels = dayLevels),
  hour = hour(dateSearch),
  type = searchType,
  search = textSearch
)

# Drop every row in which any of the scraped fields could not be extracted.
searchedData <- na.omit(searchedData)
head(searchedData)

# PLOT SEARCH VOLUME BY YEAR
searchByYear <- ggplot(searchedData, aes(year, fill = after_stat(count))) +
  scale_fill_gradient(low = "yellow", high = "red") +
  geom_bar(width = 0.7) +
  labs(x = "Year", y = "Count") +
  ggtitle(
    "How much your search frequency has changed over time",
    "Search activity by year"
  )
searchByYear
# Pass the plot explicitly instead of depending on the implicit "last plot".
ggplotly(searchByYear)

# PLOT SEARCH VOLUME BY MONTH
# The year window (2008-2020) is hard-coded; adjust it to the range covered
# by your own export.
searchByMonth <- searchedData[(searchedData$year > 2007 & searchedData$year < 2021), ]
searchByMonthPlot <- ggplot(searchByMonth, aes(year, fill = after_stat(count))) +
  scale_fill_gradient(low = "yellow", high = "red") +
  geom_bar(aes(x = month, group = year)) +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_grid(. ~ year, scales = "free") +
  labs(x = "Year / Month", y = "Count") +
  ggtitle(
    "How much your search frequency has changed over time",
    "Month activity on detail"
  )
searchByMonthPlot
ggplotly(searchByMonthPlot)
# PLOT SEARCH VOLUME BY HOUR
# Bar chart of the number of searches per hour of day; bars are coloured by
# their own count (yellow = few, red = many).
seearchByHour <- ggplot(searchedData, aes(hour, fill = after_stat(count))) +
  scale_fill_gradient(low = "yellow", high = "red") +
  geom_bar() +
  labs(x = "Hour", y = "Count") +
  ggtitle(
    "What time of day do you have the highest frequency of searches?",
    "Hour activity on detail"
  )
seearchByHour
# Pass the plot explicitly instead of depending on the implicit "last plot".
ggplotly(seearchByHour)


# PLOT SEARCH VOLUME BY WEEKDAY
seearchByWeekD <- ggplot(searchedData, aes(day, fill = after_stat(count))) +
  scale_fill_gradient(low = "yellow", high = "red") +
  geom_bar() +
  labs(x = "Day", y = "Count") +
  ggtitle(
    "What day of the week do you have the highest frequency of searches?",
    "Weekday activity on detail"
  )
seearchByWeekD
ggplotly(seearchByWeekD)


# PLOT SEARCH VOLUME BY WEEKDAY AND TIME
# One facet per weekday, each showing the per-hour search counts.
searchWdayTime <- ggplot(searchedData) +
  scale_fill_gradient(low = "yellow", high = "red") +
  geom_bar(aes(x = hour, group = day, fill = after_stat(count))) +
  labs(x = "Hour / Day", y = "Count") +
  ggtitle(
    "Relationship between day / time you have a higher frequency of searches",
    "Weekday/Time activity on detail"
  ) +
  facet_grid(. ~ day, scales = "free")
searchWdayTime
ggplotly(searchWdayTime)


# CLEAN AND EXTRACT TEXT TO CREATE A TEXT CORPUS
# NOTE(review): despite the name, this keeps the fixed years 2008 and 2009,
# not the two most recent years in the data -- adjust the bounds to your export.
lastTwoYears <- searchedData[(searchedData$year > 2007 & searchedData$year < 2010), ]

# Lower-case the queries, blank out URLs, #tags/@mentions, newlines and double
# quotes, then ".com" fragments and every remaining non-alphanumeric character.
# Named `searchText` (not `search`) so that base::search() is not masked.
searchText <- tolower(lastTwoYears$search)
searchText <- gsub('(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"', " ", searchText)
searchText <- gsub("(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]", " ", searchText)
searchText <- trimws(searchText)

textCorpus <- Corpus(VectorSource(searchText))
textCorpus <- tm_map(textCorpus, content_transformer(removePunctuation))
textCorpus <- tm_map(textCorpus, content_transformer(removeNumbers))

# English stop words plus a hand-picked list of extra filler words.
# Named `customStopwords` so that tm::stopwords() is not masked by a vector.
customStopwords <- c(
  stopwords("english"),
  "que", "com", "cómo", "como", "para", "con", "qué", "las", "los", "del", "can"
)
textCorpus <- tm_map(textCorpus, removeWords, customStopwords)
# BUILD THE TERM-DOCUMENT MATRIX
# Terms are rows and documents are columns, so row sums give the total
# frequency of each term across the corpus.
searchTDM <- TermDocumentMatrix(textCorpus)
searchMatrix <- as.matrix(searchTDM)

# CREATE DATA FRAME WITH WORDS
# Term totals, most frequent first; the vector names are the terms themselves.
arrange <- searchMatrix %>%
  rowSums() %>%
  sort(decreasing = TRUE)
twNames <- names(arrange)
dataCloud <- data.frame(word = twNames, freq = arrange)

# DRAW THE WORD CLOUD
# Only terms occurring at least 40 times are drawn, capped at 100 words.
wordcloud(
  words = dataCloud$word,
  freq = dataCloud$freq,
  min.freq = 40,
  scale = c(2, 0.5),
  max.words = 100,
  colors = brewer.pal(9, "Paired")
)
--------------------------------------------------------------------------------