├── .DS_Store
├── README.md
├── img_plots
├── .DS_Store
├── img1.png
├── img10.png
├── img10_es.png
├── img11.png
├── img11_es.png
├── img12.png
├── img12_es.png
├── img13.png
├── img13_es.png
├── img14.png
├── img14_es.png
├── img15.png
├── img16.png
├── img2.png
├── img3.png
├── img4.png
├── img5.png
├── img6.png
├── img7.png
├── img8.png
└── img9.png
└── main.R
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # r-google-search-history-analysis
2 | Explore your activity on Google with R: How to Analyze and Visualize Your Personal Data Search History. Find out how and how much you have used the most popular search engine in the world, using a copy of your personal data. Medium article: https://cosmoduende.medium.com/explore-your-activity-on-google-with-r-how-to-analyze-and-visualize-your-search-history-1fb74e5fb2b6
3 |
4 | ### *Versión en español*
5 | Explora tu actividad en Google con R: Análisis y visualización de datos de tu historial de búsquedas. Descubre cómo y cuánto has usado el buscador más popular del mundo, usando una copia de tus datos personales. Artículo en Medium: https://cosmoduende.medium.com/explora-tu-actividad-en-google-con-r-an%C3%A1lisis-y-visualizaci%C3%B3n-de-datos-de-tu-historial-de-a25638881fbd
6 |
7 | #### *Project preview*
8 | https://www.youtube.com/watch?v=1MbXsX9vFzc
9 |
--------------------------------------------------------------------------------
/img_plots/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/.DS_Store
--------------------------------------------------------------------------------
/img_plots/img1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img1.png
--------------------------------------------------------------------------------
/img_plots/img10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img10.png
--------------------------------------------------------------------------------
/img_plots/img10_es.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img10_es.png
--------------------------------------------------------------------------------
/img_plots/img11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img11.png
--------------------------------------------------------------------------------
/img_plots/img11_es.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img11_es.png
--------------------------------------------------------------------------------
/img_plots/img12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img12.png
--------------------------------------------------------------------------------
/img_plots/img12_es.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img12_es.png
--------------------------------------------------------------------------------
/img_plots/img13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img13.png
--------------------------------------------------------------------------------
/img_plots/img13_es.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img13_es.png
--------------------------------------------------------------------------------
/img_plots/img14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img14.png
--------------------------------------------------------------------------------
/img_plots/img14_es.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img14_es.png
--------------------------------------------------------------------------------
/img_plots/img15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img15.png
--------------------------------------------------------------------------------
/img_plots/img16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img16.png
--------------------------------------------------------------------------------
/img_plots/img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img2.png
--------------------------------------------------------------------------------
/img_plots/img3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img3.png
--------------------------------------------------------------------------------
/img_plots/img4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img4.png
--------------------------------------------------------------------------------
/img_plots/img5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img5.png
--------------------------------------------------------------------------------
/img_plots/img6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img6.png
--------------------------------------------------------------------------------
/img_plots/img7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img7.png
--------------------------------------------------------------------------------
/img_plots/img8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img8.png
--------------------------------------------------------------------------------
/img_plots/img9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cosmoduende/r-google-search-history-analysis/0864f4f76b6a521e10840af3e4cb8c62a91992c8/img_plots/img9.png
--------------------------------------------------------------------------------
/main.R:
--------------------------------------------------------------------------------
1 | # REQUIRED LIBRARIES
2 | library(wordcloud)
3 | library(lubridate)
4 | library(rvest)
5 | library(tm)
6 | library(tidyverse)
7 | library(plotly)
8 |
9 | # READ DATA
10 | fileHTML <- "Takeout/My Activity/Search/MyActivity.html" # path is relative to the current working directory
11 | mySearchFile <- read_html(fileHTML, encoding = "UTF-8") # parsed HTML document that every scraping step below queries
12 |
13 | # SCRAPING SEARCH DATE AND TIME (one entry per activity cell, parsed as month-day-year hour-minute-second)
14 | dateSearch <- mySearchFile %>%
15 |   html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
16 |   str_extract(pattern = "(?<=<br>)(.*)(?<=PM|AM)") %>% # text after a <br> tag, up to and including the AM/PM marker
17 |   mdy_hms()
18 | dateSearch[1:5]
19 |
20 | # SCRAPING SEARCH TEXT (the link text of each activity cell)
21 | textSearch <- mySearchFile %>%
22 |   html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
23 |   str_extract(pattern = '(?<=<a)(.*)(?=</a>)') %>% # everything between the opening "<a" and the closing "</a>"
24 |   str_extract(pattern = '(?<=\">)(.*)') # drop the tag attributes: keep only what follows the first '">'
25 | textSearch[1:5]
26 |
27 | # SCRAPING SEARCH TYPE (first word of the cell body that precedes the link)
28 | searchType <- mySearchFile %>%
29 |   html_nodes(xpath = '//div[@class="mdl-grid"]/div/div') %>%
30 |   str_extract(pattern = "(?<=mdl-typography--body-1\">)(.*)(?=<a)") %>% # text between the body-1 cell opening and the link
31 |   str_extract(pattern = "(\\w+)(?=\\s)") # first run of word characters that is followed by whitespace
32 | searchType[1:5]
33 |
34 | # CREATE DATA FRAME USING SCRAPED DATA (one row per scraped cell; rows with any NA are dropped below)
35 | searchedData <- tibble(timestamp = dateSearch,
36 |                        date = as_date(dateSearch),
37 |                        year = year(dateSearch),
38 |                        month = month(dateSearch, label = TRUE),
39 |                        day = wday(dateSearch, week_start = 7), # numeric 1 = Sunday ... 7 = Saturday, so it does not depend on the session locale
40 |                        hour = hour(dateSearch),
41 |                        type = searchType,
42 |                        search = textSearch)
43 |
44 | searchedData$day <- factor(searchedData$day, levels = 1:7, # fixed English labels; weekdays() names would turn into NA on non-English locales
45 |                            labels = c("Sunday", "Monday", "Tuesday", "Wednesday",
46 |                                       "Thursday", "Friday", "Saturday"))
47 | searchedData <- na.omit(searchedData)
48 | head(searchedData)
49 |
50 | # PLOT SEARCH VOLUME BY YEAR: one bar per year, filled according to its own count
51 | searchByYear <- ggplot(data = searchedData, mapping = aes(x = year, fill = ..count..)) +
52 |   geom_bar(width = 0.7) +
53 |   scale_fill_gradient(low = "yellow", high = "red") +
54 |   labs(title = "How much your search frequency has changed over time",
55 |        subtitle = "Search activity by year", x = "Year", y = "Count")
56 | searchByYear
57 | ggplotly() # called without a plot argument, so it falls back to ggplot2::last_plot()
58 |
59 | # PLOT SEARCH VOLUME BY MONTH: monthly bars for the years 2008-2020, one facet per year
60 | searchByMonth <- dplyr::filter(searchedData, year > 2007, year < 2021)
61 | ggplot(data = searchByMonth, mapping = aes(x = year, fill = ..count..)) +
62 |   geom_bar(mapping = aes(x = month, group = year)) + # the layer mapping replaces the plot-level x with month
63 |   scale_fill_gradient(low = "yellow", high = "red") +
64 |   facet_grid(. ~ year, scales = "free") +
65 |   theme(axis.text.x = element_text(angle = 90)) +
66 |   labs(title = "How much your search frequency has changed over time",
67 |        subtitle = "Month activity on detail", x = "Year / Month", y = "Count")
68 | ggplotly()
69 |
70 |
71 | # PLOT SEARCH VOLUME BY HOUR: one bar per hour of the day, filled according to its own count
72 | searchByHour <- ggplot(data = searchedData, mapping = aes(x = hour, fill = ..count..)) +
73 |   geom_bar() +
74 |   scale_fill_gradient(low = "yellow", high = "red") +
75 |   labs(title = "What time of day do you have the highest frequency of searches?",
76 |        subtitle = "Hour activity on detail", x = "Hour", y = "Count")
77 | searchByHour
78 | ggplotly() # called without a plot argument, so it falls back to ggplot2::last_plot()
79 |
80 |
81 | # PLOT SEARCH VOLUME BY WEEKDAY: one bar per level of the day factor, filled according to its own count
82 | searchByWeekD <- ggplot(data = searchedData, mapping = aes(x = day, fill = ..count..)) +
83 |   geom_bar() +
84 |   scale_fill_gradient(low = "yellow", high = "red") +
85 |   labs(title = "What day of the week do you have the highest frequency of searches?",
86 |        subtitle = "Weekday activity on detail", x = "Day", y = "Count")
87 | searchByWeekD
88 | ggplotly() # called without a plot argument, so it falls back to ggplot2::last_plot()
89 |
90 |
91 | # PLOT SEARCH VOLUME BY WEEKDAY AND TIME: hourly bars, one facet per weekday
92 | searchWdayTime <- ggplot(data = searchedData) +
93 |   geom_bar(mapping = aes(x = hour, group = day, fill = ..count..)) +
94 |   scale_fill_gradient(low = "yellow", high = "red") +
95 |   facet_grid(. ~ day, scales = "free") +
96 |   labs(title = "Relationship between day / time you have a higher frequency of searches",
97 |        subtitle = "Weekday/Time activity on detail", x = "Hour / Day", y = "Count")
98 | searchWdayTime
99 | ggplotly()
100 |
101 |
102 | # CLEAN AND EXTRACT TEXT TO CREATE A TEXT CORPUS (ends with a term-document matrix of the cleaned searches)
103 | lastTwoYears <- searchedData[(searchedData$year > 2007 & searchedData$year< 2010), ] # NOTE(review): this keeps 2008-2009 only, despite the name - confirm the intended years
104 |
105 | search <- tolower(lastTwoYears$search)
106 | search <- gsub('(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"', " ", search) # blank out URLs, #/@ tokens, newlines and double quotes
107 | search <- gsub("(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]", " ", search) # blank out spans around ".com" and every non-alphanumeric character
108 | search <- trimws(search)
109 |
110 | textCorpus <- Corpus(VectorSource(search))
111 | textCorpus <- tm_map(textCorpus, content_transformer(removePunctuation))
112 | textCorpus <- tm_map(textCorpus, content_transformer(removeNumbers))
113 | customStopwords <- c(stopwords("english"), "que", "com", "cómo", "como", "para", "con", "qué", "las", "los", "del", "can") # own name so the stopwords() function is not shadowed
114 | textCorpus <- tm_map(textCorpus, removeWords, customStopwords)
115 |
116 | searchTDM <- TermDocumentMatrix(textCorpus)
117 | searchMatrix <- as.matrix(searchTDM)
118 |
119 | # CREATE DATA FRAME WITH WORDS AND DRAW THE WORD CLOUD
120 | wordFreq <- sort(rowSums(searchMatrix), decreasing = TRUE) # per-term totals, most frequent first; named so dplyr's arrange() is not shadowed
121 | twNames <- names(wordFreq)
122 | dataCloud <- data.frame(word = twNames, freq = wordFreq)
123 |
124 | wordcloud(dataCloud$word, dataCloud$freq, min.freq = 40, scale = c(2 , 0.5), max.words = 100, colors=brewer.pal(9, "Paired"))
125 |
--------------------------------------------------------------------------------