├── README.md ├── Startup 100 ├── create_gt_url.js ├── Connect to DB ├── Rescale ├── Google trends batch service.R ├── 2. Scoring GT ├── Daily in percent ├── Most googled by state ├── Searches by state ├── 1. Find winner ├── Execution ├── competitor_tiers.R ├── lynx automate script ├── Google Trends functions ├── Daily data example.R └── Working example /README.md: -------------------------------------------------------------------------------- 1 | Google-Trends 2 | ============= 3 | -------------------------------------------------------------------------------- /Startup 100: -------------------------------------------------------------------------------- 1 | library(XML) 2 | library(RODBC) 3 | startup100_URL="http://www.startup100.net" 4 | tables=readHTMLTable(startup100_URL) 5 | table=tables[[1]] 6 | names(table)=c("Change %", "Company", "S100 Index", "Change", "Index rank", "Category", "Profiles", "Y-tunnus") 7 | table=table[c(-1, -5)] 8 | startup100=table 9 | 10 | connector=odbcConnect("ejjohans_boostery", uid="ejjohans_boost", pwd="***") 11 | 12 | sqlDrop(connector, "startup100", errors=FALSE) 13 | sqlSave(connector, startup100) 14 | 15 | close(connector) 16 | -------------------------------------------------------------------------------- /create_gt_url.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | 4 | function URL_GT(keyword, country, region, year, month, length){ 5 | 6 | var start = "http://www.google.com/trends/trendsReport?hl=en-US&q="; 7 | var end = "&cmpt=q&content=1&export=1"; 8 | var geo = ""; 9 | var date = ""; 10 | var URL = ""; 11 | if(typeof month==="undefined") month = 1; //defaults that no longer clobber caller-supplied values 12 | if(typeof length==="undefined") length = 3; 13 | 14 | 15 | //Geographic restrictions 16 | if(typeof country!=="undefined") { 17 | geo="&geo="; 18 | geo=geo + country; 19 | if(region!==undefined) geo=geo + "-" + region; 20 | } 21 | 22 | if(typeof keyword==="string"){ 23 | var queries=keyword; 24 | } 25 | 26 | if(typeof keyword==="object"){ 27 | var queries=keyword[0]; 28 | for(var i=1; i < keyword.length; i++){ 29 | queries=queries + "%2C" + keyword[i]; 30 | } 31 | } 32 | 33 | //Dates 34 | if(typeof year!=="undefined"){ 35 | date="&date="; 36 | date=date + month + "%2F" + year + "%20" + length + "m"; 37 | } 38 | 39 | URL = start + queries + geo + date + end; 40 | URL = URL.replace(/ /g, "%20"); //replace every space, not just the first match 41 | return(URL); 42 | } 43 | -------------------------------------------------------------------------------- /Connect to DB: -------------------------------------------------------------------------------- 1 | #RODBC documentation: http://cran.r-project.org/web/packages/RODBC/RODBC.pdf 2 | #http://cran.r-project.org/web/packages/RODBC/vignettes/RODBC.pdf 3 | #Check the database connection using run>odbcad32 4 | 5 | library(RODBC) 6 | downloadDir="C:/Users/erik.johansson/Downloads" 7 | setwd(downloadDir) 8 | forever="yes" 9 | 10 | while(forever=="yes") { 11 | gt_connector=odbcConnect("ejjohans_googletrends", uid="ejjohans_gt", pwd="***") 12 | 13 | #Check that the table is available in R 14 | sqlTables(gt_connector, tableType="TABLE") 15 | 16 | #Get the keyword from the database 17 | start_keyword=as.character(sqlFetch(gt_connector, "keyword_table")[1,1]) 18 | keyword=start_keyword 19 | #Poll until a new keyword is written to the table 20 | while(start_keyword==keyword) { 21 | keyword=as.character(sqlFetch(gt_connector, "keyword_table")[1,1]) 22 | Sys.sleep(5) 23 | } 24 | 25 | #Create URL 26 | URL=URL_GT(keyword) 27 | filePath=downloadGT(URL, downloadDir) 28 | data=readGT(filePath) 29 | data=data[,c(3,1,2)] 30 | output=data 31 | 32 | sqlDrop(gt_connector, 
"output", errors=FALSE) 33 | sqlDrop(gt_connector, "geodata", errors=FALSE) 34 | 35 | sqlSave(gt_connector, output) 36 | 37 | geodata=readGeoGT(filePath) 38 | names(geodata)=c("Region", "SVI", "Source") 39 | 40 | sqlSave(gt_connector, geodata) 41 | 42 | } 43 | close(gt_connector) 44 | -------------------------------------------------------------------------------- /Rescale: -------------------------------------------------------------------------------- 1 | library(quantmod) 2 | 3 | downloadDir="C:/Users/erik.johansson/Downloads" 4 | setwd(downloadDir) 5 | year=c(2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013) 6 | 7 | GT.ts=list() 8 | n=1 9 | for(i in year){ 10 | for(j in 1:12){ 11 | URL=URL_GT(keyword="FTSE 100", year=i, month=j, length=3) 12 | tempDir=downloadGT(URL, downloadDir) 13 | data=read.csv(tempDir, header=F, blank.lines.skip=F) 14 | start=which(data[,1]=="")[1]+3 15 | stop=which(data[,1]=="")[2]-1 16 | data=data[start:stop,] 17 | data[,1]=as.Date(data[,1], "%Y-%m-%d") 18 | data[,2]=as.numeric(as.character(data[,2])) 19 | rownames(data)=data[,1] 20 | data[3]=NA 21 | 22 | #Calculate % change 23 | for(k in 2:nrow(data)){ 24 | data[k, 3]=data[k,2]/data[k-1,2] 25 | } 26 | GT.ts[[n]]=data 27 | n=n+1 28 | } 29 | } 30 | 31 | output=as.xts(GT.ts[[1]][3]) 32 | 33 | for(i in 2:length(GT.ts)){ 34 | data=as.xts(GT.ts[[i]][3]) 35 | output=merge(output, data) 36 | output$new=NA 37 | 38 | for(j in 1:nrow(output)){ 39 | if(is.finite(mean(output[j,],na.rm=T))) output[j,3]=mean(output[j,],na.rm=T) 40 | } 41 | output=output[,3] 42 | names(output)="d_svi" 43 | } 44 | 45 | #Start value 46 | startvalue=100 47 | series=vector() 48 | series[1]=100 49 | for(i in 2:nrow(output)){ 50 | series[i]=series[i-1]*output[i,] 51 | } 52 | -------------------------------------------------------------------------------- /Google trends batch service.R: -------------------------------------------------------------------------------- 1 | # Here, assuming we only deal with daily, non-comparable SVI 2 | search_terms = c('euromicron', 'TC Unterhaltungstechnik', 'SGL Carbon', 'zooplus', 'TUI', 'Borussia Dortmund', 'EUCA', 'TCU', 'SGL', 'ZO1', 'TUI1', 'BVB') 3 | frequency = 'daily' 4 | comparable = TRUE 5 | country = NA 6 | region = NA 7 | year = NA 8 | 9 | years = c(2004,2005,2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016) 10 | months = c(1,4,7,10) 11 | length = 3 12 | 13 | url=vector() 14 | counter = 1 15 | for(search_term in search_terms){ 16 | for(year in years){ 17 | for(month in months){ 18 | if(year == as.numeric(substr(as.character(Sys.time()), 1, 4)) & month > as.numeric(substr(as.character(Sys.time()), 6, 7))){ 19 | next() # This stops us from creating URLs for dates that don't exist. 20 | } 21 | url[counter]=URL_GT(keyword=search_term, year=year, month=month) 22 | counter = counter + 1 23 | } 24 | } 25 | } 26 | 27 | for(search_term in search_terms){ 28 | url[counter]=URL_GT(search_term) 29 | counter = counter + 1 30 | } 31 | 32 | for(i in 1:length(url)){ 33 | lynx_commands <- lynx_script(url[i]) # Create the lynx script 34 | write.table(lynx_commands, '/root/gt_download', row.names=F, col.names=F, quote=F) # Save the lynx script 35 | system("lynx -cmd_script=/root/gt_download www.google.com") # Execute the lynx script (takes a while, be patient) 36 | } 37 | -------------------------------------------------------------------------------- /2. 
Scoring GT: -------------------------------------------------------------------------------- 1 | scoringGT=function(players, competitionsGToutput){ 2 | #"players" must be a data frame with at least one column 3 | scoreBoard=data.frame() 4 | 5 | if(nrow(competitionsGToutput[-which(competitionsGToutput[,1]=="Winner"),])>0){ #drop the "Winner" summary rows that competitionGT appends 6 | competitionsGToutput=competitionsGToutput[-which(competitionsGToutput[,1]=="Winner"),] 7 | } 8 | 9 | competitionsGToutput[,1]=as.numeric(competitionsGToutput[,1]) 10 | 11 | 12 | for(i in 1:max(competitionsGToutput[1:(nrow(competitionsGToutput)),1])) { 13 | batch=competitionsGToutput[which(competitionsGToutput[,1]==i),] 14 | highScore=max(batch[,3]) 15 | roundWinner=batch[which(batch[,3]==highScore),] 16 | 17 | batch[,4]=batch[,3]/roundWinner[,3] 18 | colnames(batch)[4]="Multiplier" 19 | 20 | scoreBoard=rbind(scoreBoard, batch) 21 | colnames(scoreBoard)=c("Round", "Player", "Score", "Multiplier") 22 | } 23 | 24 | M=as.data.frame(matrix(NA, nrow(players), nrow(players))) 25 | iterationMax=sum(is.na(M)) 26 | colnames(M)=players[,1] 27 | rownames(M)=players[,1] 28 | 29 | for(j in 1:max(scoreBoard[,1])){ 30 | batch=scoreBoard[which(scoreBoard[,1]==j),] 31 | 32 | for(i in 1:nrow(batch)){ 33 | place=which(tolower(rownames(M))==batch[i,2]) 34 | roundWinner=batch[which(batch[,4]==1),2] 35 | M[place, which(tolower(colnames(M))==roundWinner)]=batch[i,4] 36 | } 37 | } 38 | 39 | for(y in 1:nrow(players)){ #fill in reciprocals: B's score against A is 1/(A's score against B) 40 | for(x in 1:nrow(players)){ 41 | if(is.na(M[y,x])) M[y,x]=1/M[x,y] 42 | M[y,y]=1 43 | } 44 | } 45 | 46 | while((iterationMax-sum(is.na(M)))>0){ #propagate scores transitively (M[y,i]=M[y,x]/M[i,x]) until a full pass fills nothing new 47 | iterationMax=sum(is.na(M)) 48 | for(i in 1:nrow(players)){ 49 | for(x in 1:nrow(players)){ 50 | for(y in 1:nrow(players)){ 51 | if(is.na(M[y,i])) M[y,i]=M[y,x]/M[i,x] 52 | } 53 | } 54 | } 55 | } 56 | 57 | return(M) 58 | } 59 | -------------------------------------------------------------------------------- /Daily in percent: -------------------------------------------------------------------------------- 1 | #Create download paths 2 | 3 | year=c(2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014) 4 | output=vector() 5 | downloadDir="C:/Users/erik.johansson/Downloads" 6 | setwd(downloadDir) 7 | for(i in year){ 8 | for(j in 1:12){ 9 | URL=URL_GT("ftse 100", year=i, month=j, length=2) 10 | output=append(output, URL) 11 | } 12 | } 13 | 14 | #Create a table to store the output 15 | URL=output 16 | gt_results=data.frame(as.Date("10.1.2004", "%d.%m.%Y"), NA, NA, NA, 1) 17 | colnames(gt_results)=c("Date", "SVI", "Company", "Path", "Percentage") 18 | 19 | for(i in 1:length(URL)){ 20 | #Download file 21 | gt_path=downloadGT(URL[i], downloadDir) 22 | 23 | #Format csv 24 | gt_data=readGT(gt_path) 25 | 26 | #Increment all by one to make percentage calculation possible 27 | gt_data[,2]=gt_data[,2]+1 28 | 29 | gt_data[which(is.na(gt_data[,2])),2]=1 30 | gt_data[5]=NA 31 | names(gt_data)[5]="Percentage" 32 | 33 | #Calculate percentage change 34 | for(j in 2:nrow(gt_data)){ 35 | gt_data$Percentage[j]=gt_data$SVI[j]/gt_data$SVI[j-1] 36 | } 37 | 38 | #Find first instance of date overlap in the new file 39 | date_match=which(gt_data$Date==gt_results[nrow(gt_results),1]) 40 | 41 | #To ensure that we haven't skipped a date (since the data might be on a weekly level in some cases) we do the same check for the results data 42 | date_match_results=which(gt_results$Date==gt_results[nrow(gt_results),1]) 43 | 44 | if(length(date_match)>0) { 45 | gt_data_subset=gt_data[(date_match+1):nrow(gt_data),] 46 | gt_results=gt_results[1:date_match_results,] 47 | } else 
{gt_data_subset=gt_data} 48 | colnames(gt_data_subset)=c("Date", "SVI", "Company", "Path", "Percentage") 49 | gt_results=rbind(gt_results, gt_data_subset) 50 | } 51 | -------------------------------------------------------------------------------- /Most googled by state: -------------------------------------------------------------------------------- 1 | queries="" 2 | downloadDir="C:/Users/erik.johansson/Downloads" 3 | country="US" 4 | paths=data.frame() 5 | states=c('AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY') 6 | states2=read.csv("C:/Users/erik.johansson/Dropbox/Google Trends/States/states.csv", sep=";") 7 | GT.raw=list() 8 | 9 | 10 | for(i in 1:length(states)){ 11 | URL=paste("http://www.google.com/trends/trendsReport?hl=en-US", "&geo=", country, "-", states[i], "&q=", queries, "&cmpt=q&content=1&export=1", sep="") 12 | 13 | #Get the file path for the csv 14 | startingFiles=list.files(downloadDir) 15 | endingFiles=list.files(downloadDir) 16 | browseURL(URL) 17 | while(length(setdiff(endingFiles,startingFiles))==0) { 18 | Sys.sleep(3) 19 | endingFiles=list.files(downloadDir) 20 | } 21 | filePath=setdiff(endingFiles,startingFiles) 22 | paths[i,1]=states[i] 23 | paths[i,2]=filePath 24 | } 25 | 26 | for(i in 1:nrow(paths)){ 27 | filePath=paste(downloadDir, "/", paths[i,2], sep="") 28 | GT.raw[[i]]=read.csv(filePath, header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE) 29 | } 30 | 31 | summary=NA 32 | output=as.data.frame(matrix(NA, nrow=0, ncol=3)) 33 | for(i in 1:length(GT.raw)){ 34 | start=which(GT.raw[[i]][,1]=="")[3]+2 35 | stop=which(GT.raw[[i]][,1]=="")[4]-1 36 | data=GT.raw[[i]][start:stop,, drop=F] 37 | topList=as.numeric(as.character(data[,1])) 38 | summary=data[which(is.na(topList)),,drop=F] 39 | summary[2]=topList[-which(is.na(topList))] 40 | summary[3]=tolower(states2[i,1]) 41 | output=rbind(output,summary) 42 | } 43 | -------------------------------------------------------------------------------- /Searches by state: -------------------------------------------------------------------------------- 1 | celebrities=c('Snooki', 'Paris Hilton', 'Nicole Polizzi', 'Miley Cyrus', 'Lindsay Lohan', 'Kris Jenner', 'Kourtney Kardashian', 'Kim Kardashian', 'Khloe Kardashian', 'Justin Bieber', 'Honey Boo Boo') 2 | downloadDir="C:/Users/erik.johansson/Downloads" 3 | 4 | scoringTable=function(data=data.frame()){ 5 | summary=as.data.frame(matrix(NA, nrow=min(1, (ncol(data)-1)), ncol=5)) 6 | for(i in 1:(ncol(data)-1)) summary[i,2]=data[which(data[,1]=="")[1]+2,i+1] 7 | for(i in 1:(ncol(data)-1)) summary[i,3]=data[which(data[,1]=="")[2]-2,i+1] 8 | summary[,3]=as.numeric(as.character(summary[,3])) 9 | highScore=max(summary[,3]) 10 | roundWinner=summary[which(summary[,3]==highScore),1] 11 | summary[,4]=summary[,3]/highScore 12 | summary[which(summary[,3]==highScore),5]="Round winner" 13 | summary[,1]=data[2,1] 14 | 15 | return(summary) 16 | } 17 | 18 | 19 | mostGoogled=function(query, downloadDir){ 20 | country="US" 21 | states=c('AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY') 22 | 
states=states[4:14] #NB: only states 4-14 are used here; remove this line to cover all 50 states 23 | 24 | #Initialize the score board 25 | scoreBoard=as.data.frame(matrix(NA, 0, 6)) #Main score board 26 | 27 | for(i in 1:length(states)){ 28 | #roundWinner requires initial value 29 | roundWinner=query[1] 30 | for(j in seq(1,length(query), by=4)){ #step by 4: the round winner fills the fifth slot, so stepping by 5 would skip every fifth term 31 | queries=roundWinner 32 | for(k in 1:4) if(!is.na(query[j+k])) queries=paste(queries, "%2C ", query[j+k], sep="") 33 | URL=paste("http://www.google.com/trends/trendsReport?hl=en-US", "&geo=", country, "-", states[i], "&q=", queries, "&cmpt=q&content=1&export=1", sep="") 34 | 35 | #Get the file path for the csv 36 | startingFiles=list.files(downloadDir) 37 | endingFiles=list.files(downloadDir) 38 | browseURL(URL) 39 | while(length(setdiff(endingFiles,startingFiles))==0) { 40 | Sys.sleep(3) 41 | endingFiles=list.files(downloadDir) 42 | } 43 | filePath=setdiff(endingFiles,startingFiles) 44 | 45 | #Read the csv 46 | error_handler=tryCatch( 47 | read.csv(paste(downloadDir, "/", filePath, sep=""), header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE), 48 | error=function(e) e 49 | ) 50 | 51 | if(inherits(error_handler, "error")){ 52 | Sys.sleep(8) 53 | } 54 | 55 | trendsData=read.csv(paste(downloadDir, "/", filePath, sep=""), header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE) 56 | score=scoringTable(trendsData) 57 | score[ncol(score)+1]=filePath 58 | roundWinner=score[which(score[,5]=="Round winner"),2][1] 59 | scoreBoard=rbind(scoreBoard, score) 60 | } 61 | } 62 | return(scoreBoard) 63 | } 64 | 65 | scoreBoard=mostGoogled(celebrities, downloadDir) 66 | -------------------------------------------------------------------------------- /1. Find winner: -------------------------------------------------------------------------------- 1 | 2 | competitionGT=function(players, path, download=TRUE){ 3 | 4 | #Set errortest default value 5 | errortest=NA 6 | 7 | #Initialize file count 8 | k=1 9 | 10 | #Create file references 11 | maxFiles=2000 12 | filenames=vector() 13 | filenames[1]=paste(path, "/report.csv", sep="") 14 | for(i in 1:maxFiles) filenames[i+1]=paste(path, "/report(", i, ").csv", sep="") 15 | 16 | scoreBoard=as.data.frame(matrix(NA, 0, 3)) 17 | colnames(scoreBoard)=c("Round", "Player", "Score") 18 | 19 | 20 | 21 | #Set initial value for round winner 22 | roundWinner=as.character(players[1,1]) 23 | 24 | #####START THE COMPETITION 25 | start=2 26 | 27 | if(download==TRUE) {stop=nrow(players) 28 | } else {stop=length(list.files(path))} 29 | 30 | for(i in seq(start, stop, by=4)){ 31 | 32 | #roundWinner always participates in the competition 33 | query=roundWinner 34 | 35 | #Select batch of four players to go up against the roundWinner. The min function ensures that there are no empty slots 36 | batch=players[i:min((i+3),nrow(players)),1] 37 | 38 | #If the files haven't been downloaded already, create Google Trends query, download and import file 39 | if(download==TRUE){ 40 | 41 | #File selector k 42 | k=length(list.files(path))+1 43 | 44 | for(j in 1:length(batch)) query=paste(query, "%2C ", batch[j], sep="") 45 | URL=paste("http://www.google.com/trends/trendsReport?hl=en-US&q=", query,"&cmpt=q&content=1&export=1", sep="") 46 | browseURL(URL) 47 | 48 | 49 | Sys.sleep(4) 50 | 51 | #Error handler. Try to read the file. If it fails, wait 8 seconds. If it fails again, wait 8 seconds more. 
52 | errortest=tryCatch( 53 | read.csv(filenames[k], header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE), 54 | error=function(e) e 55 | ) 56 | 57 | if(inherits(errortest, "error")){ 58 | Sys.sleep(8) 59 | } 60 | 61 | errortest=tryCatch( 62 | read.csv(filenames[k], header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE), 63 | error=function(e) e 64 | ) 65 | 66 | if(inherits(errortest, "error")){ 67 | Sys.sleep(8) 68 | } 69 | } 70 | 71 | #Make sure that the files can be loaded. If not, skip this iteration 72 | if(!inherits(errortest, "error")){ 73 | players.raw=read.csv(filenames[k], header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE) 74 | 75 | 76 | #Initialize results 77 | results=data.frame(matrix(NA, nrow=ncol(players.raw)-1, ncol=3)) 78 | results[,1]=k 79 | for(i in 1:(ncol(players.raw)-1)) results[i,2]=players.raw[5,i+1] 80 | for(i in 1:(ncol(players.raw)-1)) results[i,3]=as.numeric(as.character(players.raw[(which(players.raw=="")[2]-2),i+1])) 81 | 82 | highScore=max(results[,3]) 83 | 84 | colnames(results)=c("Round", "Player", "Score") 85 | 86 | roundWinner=results[which(results[,3]==highScore),2] 87 | scoreBoard=rbind(scoreBoard, results) 88 | 89 | k=k+1 #Move file selector one step forward 90 | 91 | 92 | winner=data.frame("Winner", roundWinner, highScore) 93 | colnames(winner)=c("Round", "Player", "Score") 94 | scoreBoard=rbind(scoreBoard, winner) 95 | } 96 | } 97 | return(scoreBoard) 98 | } 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /Execution: -------------------------------------------------------------------------------- 1 | setwd("C:/Users/erik.johansson/Dropbox/Google Trends") 2 | 3 | pathData.capitals="Capitals/GT data" 4 | pathPlayers.capitals="Capitals/European capitals.csv" 5 | 6 | pathData.brands="Top 10 brands/Data" 7 | pathPlayers.brands="Top 10 brands/500 most valuable.csv" 8 | 9 | pathData.slush="Coming to Slush/Data" 10 | pathPlayers.slush="Coming to Slush/companies.csv" 11 | 12 | #pathPlayers=pathPlayers.capitals 13 | #pathData=pathData.capitals 14 | #pathPlayers=pathPlayers.brands 15 | #pathData=pathData.brands 16 | pathPlayers=pathPlayers.slush 17 | pathData=pathData.slush 18 | 19 | players=read.csv(pathPlayers, sep=";") 20 | podiumSize=3 21 | 22 | 23 | #For downloading the files 24 | round=list() 25 | for(i in 1:podiumSize){ 26 | round[[i]]=competitionGT(players, pathData, download=TRUE) 27 | players=players[-which(tolower(players[,1])==round[[i]][nrow(round[[i]]),2]),] 28 | } 29 | 30 | ###Once the files have been downloaded, we can import them again by setting the download option to FALSE 31 | scoreBoard=data.frame(1:10) 32 | for(i in 1:podiumSize) scoreBoard[i,2]=round[[i]][which(round[[i]][,1]=="Winner"),2] 33 | colnames(scoreBoard)=c("Rank", "Capital") 34 | 35 | #Once the data has been downloaded, we can import and order the data using the scoringGT function 36 | data=competitionGT(players, pathData, download=FALSE) 37 | #winner="London" 38 | winner="Facebook" 39 | 40 | scoringMatrix=scoringGT(players, data) 41 | scoreBoard=scoringMatrix[,which(colnames(scoringMatrix)==winner), drop=FALSE] 42 | scoreBoard=scoreBoard[-which(is.na(scoreBoard[,1])),,drop=FALSE] 43 | colnames(scoreBoard)="Score" 44 | scoreBoard=scoreBoard[order(scoreBoard, decreasing=TRUE),, drop=FALSE] 45 | 46 | #Now we have a ranking of the cities. 
Finally, let's download the individual files and create a nice graph of them all combined 47 | library(quantmod) 48 | path="C:/Users/erik.johansson/Dropbox/Google Trends/Capitals/Individual data" 49 | #downloadWeeklyGT(as.character(players[,1])) 50 | GT.raw=importGT(path) 51 | GT.summary=summaryTable(GT.raw) 52 | GT.f=formatGT(GT.raw, GT.summary) 53 | GT.m=mergeGT(GT.f) 54 | rownames(GT.m)=as.Date(GT.m[,1], "%Y-%m-%d") 55 | GT.date=GT.m[,1] 56 | GT.m=GT.m[-1] 57 | GT.m=as.xts(GT.m) 58 | index(GT.m)=GT.date 59 | GT.m=na.approx(GT.m) 60 | 61 | GT.return=as.data.frame(matrix(NA, nrow(GT.m), ncol(GT.m))) 62 | rownames(GT.return)=index(GT.m) 63 | colnames(GT.return)=colnames(GT.m) 64 | 65 | for(i in 1:ncol(GT.m)){ 66 | GT.return[i]=Delt(GT.m[,i]) 67 | } 68 | 69 | GT.final=as.data.frame(matrix(NA, nrow(GT.m), nrow(scoreBoard))) 70 | colnames(GT.final)=rownames(scoreBoard) 71 | rownames(GT.final)=index(GT.m) 72 | 73 | #Athens got lost along the way, so let's adjust for that 74 | GT.return=GT.return[,-which(colnames(GT.return)=="athens")] 75 | GT.return=GT.return[,order(colnames(GT.return))] 76 | rownames(GT.return)=index(GT.m) 77 | rownames(GT.final)=as.Date(index(GT.m), "%Y-%m-%d") 78 | 79 | GT.final[nrow(GT.final),]=scoreBoard[,1]*100 80 | GT.final=GT.final[,order(colnames(GT.final))] 81 | 82 | for(i in 1:(nrow(GT.final)-1)){ #walk back from the anchored last row; stopping at nrow-1 keeps the index from hitting row 0 83 | GT.final[nrow(GT.final)-i,]=as.numeric(GT.final[nrow(GT.final)-i+1,])/(1+as.numeric(GT.return[nrow(GT.return)-i+1,])) 84 | } 85 | 86 | #Then let's plot the winners 87 | batch=rownames(scoreBoard)[1:10] 88 | numbers=1:nrow(GT.final) 89 | plots=list() 90 | for(i in 1:10){ 91 | winner=which(colnames(GT.final)==batch[i]) 92 | plots[[i]]=smooth.spline(as.Date(rownames(GT.final), "%Y-%m-%d"), GT.final[,winner], spar=0.35) 93 | } 94 | 95 | temp=data.frame(plots[[1]]$x) 96 | temp[,2]=plots[[1]]$y 97 | temp[1,2]=0 98 | plot(as.Date(temp[,1]), temp[,2], type="l", col="white", xlab="Date", ylab="Search Volume", main="Most popular capitals") 99 | for(i in 1:10){ 100 | lines(plots[[i]], col=i) 101 | } 102 | text(locator(), labels = batch[1:5]) #interactive: click the plot to place the labels 103 | -------------------------------------------------------------------------------- /competitor_tiers.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(ggplot2) #needed for the ggplot calls below; URL_GT, downloadGT, readGT and write.cb come from this repo's "Google Trends functions" and the general helper functions (see the lynx automate script for how they are sourced) 3 | company <- 'transferwise' 4 | 5 | competitors <- c('moneygram', 6 | 'western union', 7 | 'fairfx', 8 | 'caxton fx', 9 | 'worldfirst', 10 | 'worldremit', 11 | 'currencyfair', 12 | 'transfergo', 13 | 'tawipay', 14 | 'xoom', 15 | 'transfast', 16 | 'remitly', 17 | 'ria money transfer', 18 | 'azimo', 19 | 'moneycorp', 20 | 'ukforex', 21 | 'hifx', 22 | 'post office money', 23 | 'transferwise', 24 | 'revolut') 25 | 26 | #NB: this second vector overwrites the remittance list above; keep whichever set you want to rank 27 | competitors <- c('ZhongAn', 28 | 'oscar health', # NB: distinct from The Oscars 29 | 'wealthfront', 30 | 'qufenqi', 31 | 'funding circle', 32 | 'kreditech', 33 | 'avant', 34 | 'atom bank', 35 | 'klarna', 36 | 'our crowd', 37 | 'lufax', 38 | 'robinhood', 39 | '%2Fm%2F0by16yq', # Square 40 | 'motif investing', 41 | 'xero', 42 | 'stripe', 43 | 'collective health', 44 | 'credit karma', 45 | 'adyen', 46 | 'personal capital', 47 | 'secure key technologies ', 48 | 'betterment', 49 | 'kabbage', 50 | 'lending club', 51 | 'prosper', 52 | 'coinbase', 53 | 'izettle', 54 | 'policybazaar', 55 | 'knip', 56 | 'affirm', 57 | 'circleup', 58 | 'iex ', 59 | 'prospa', 60 | 'etoro', 61 | 'spotcap', 62 | 'jimubox', 63 | 'transferwise', 64 | 'rong360', 65 | '21inc', 66 | 'coverfox', 67 | 'angellist') 68 | 69 | 
competitors=tolower(competitors) 70 | rank_table <- data.frame(competitors=competitors, batch = ceiling(seq(1, length(competitors),1)/4), stringsAsFactors=F) 71 | downloadDir = '/users/erik.johansson/downloads' 72 | res = list() 73 | for(i in 1:max(rank_table$batch)){ 74 | r = which(rank_table$batch == i) 75 | keywords = c(rank_table$competitors[r], company) 76 | url = URL_GT(keywords, country='GB') 77 | GT_dir = downloadGT(url, downloadDir) 78 | GT_dir = paste(downloadDir, GT_dir, sep='/') 79 | res[[i]] = readGT(GT_dir) 80 | } 81 | res.normalised = list() 82 | for(i in 1:length(res)){ 83 | print(i) 84 | res.normalised[[i]] = res[[i]] 85 | r <- which(res[[i]]$Keyword==company) 86 | res.company <- res[[i]][r,] 87 | keywords = unique(res[[i]]$Keyword) 88 | 89 | for(j in 1:length(keywords)){ 90 | print(paste("j", j)) 91 | s = which(res[[i]]$Keyword == keywords[j]) 92 | res.normalised[[i]]$SVI[s] = res[[i]]$SVI[s] / res.company$SVI 93 | } 94 | } 95 | 96 | df <- do.call("rbind", res.normalised) 97 | 98 | df %>% ggplot(aes(Date, SVI,color=Keyword))+geom_line() 99 | 100 | 101 | df.max <- df[which(df$Date==max(df$Date)),] 102 | df.max <-df.max[!duplicated(df.max[-4]),] 103 | rank_table <- merge(rank_table, df.max[c(3,2)], by.x='competitors', by.y='Keyword') %>% unique 104 | rank_table <- rank_table[order(rank_table$SVI, decreasing=T),] 105 | rank_table <- rank_table[is.finite(rank_table$SVI),] 106 | deviation <- rank_table$SVI-mean(rank_table$SVI) 107 | top_tier <- which(deviation > sd(rank_table$SVI)/2) 108 | bottom_tier <- which(deviation < -sqrt(sd(rank_table$SVI))/5) 109 | rank_table$tier <- 'mid_tier' 110 | rank_table$tier[top_tier] <- 'top_tier' 111 | rank_table$tier[bottom_tier] <- 'bottom_tier' 112 | 113 | top_tier_competitors <- rank_table$competitors[which(rank_table$tier == 'top_tier')] 114 | df[which(df$Keyword %in% top_tier_competitors),] %>% filter(Date > '2015-01-01') %>% ggplot(aes(Date, SVI,color=Keyword))+geom_line() 115 | 116 | mid_tier_competitors <- rank_table$competitors[which(rank_table$tier == 'mid_tier')] 117 | df[which(df$Keyword %in% mid_tier_competitors),] %>% filter(Date > '2015-01-01') %>% ggplot(aes(Date, SVI,color=Keyword))+geom_line() 118 | 119 | bottom_tier_competitors <- rank_table$competitors[which(rank_table$tier == 'bottom_tier')] 120 | df[which(df$Keyword %in% bottom_tier_competitors),] %>% filter(Date > '2015-01-01') %>% ggplot(aes(Date, SVI,color=Keyword))+geom_line() 121 | 122 | df[-which(df$Keyword %in% bottom_tier_competitors),] %>% filter(Date > '2014-01-01') %>% ggplot(aes(Date, SVI,color=Keyword))+geom_line() 123 | 124 | write.cb(rank_table) 125 | -------------------------------------------------------------------------------- /lynx automate script: -------------------------------------------------------------------------------- 1 | 2 | # Remove old files from directory 3 | system('rm /root/downloads/*.*') 4 | library(data.table) 5 | library(RCurl) 6 | library(RMySQL) 7 | library(dplyr) 8 | library(reshape2) 9 | functions <- getURL("https://raw.githubusercontent.com/321k/R-Helper-Functions/master/general%20helper%20functions") 10 | eval(parse(text=functions)) 11 | functions <- getURL("https://raw.githubusercontent.com/321k/Google-Trends/master/Google%20Trends%20functions") 12 | eval(parse(text=functions)) 13 | 14 | 15 | 16 | swap <- function(x, winner, loser){ 17 | x_copy <- x 18 | w <- which(x==winner) 19 | l <- which(x==loser) 20 | x[w] <- x_copy[l] 21 | x[l] <- x_copy[w] 22 | return(x) 23 | } 24 | 25 | 26 | # Function that creates the instructions 
for lynx to fetch data 27 | lynx_script <- function(url){ 28 | file_name <- paste('google_trends_download_', Sys.time(), '.csv.gz', sep='') 29 | path <- paste('/root/downloads/', file_name, sep='') 30 | script <- list() 31 | script[[1]] <- c('A' 32 | ,'Down Arrow' 33 | ,'Down Arrow' 34 | ,'Down Arrow' 35 | ,'Down Arrow' 36 | ,'Down Arrow' 37 | ,'Down Arrow' 38 | ,'Down Arrow' 39 | ,'Down Arrow' 40 | ,'Down Arrow' 41 | ,'Down Arrow' 42 | ,'^J' 43 | ,'A' 44 | ,'a' 45 | ,'n' 46 | ,'t' 47 | ,'o' 48 | ,'n' 49 | ,'.' 50 | ,'m' 51 | ,'a' 52 | ,'i' 53 | ,'n' 54 | ,'h' 55 | ,'o' 56 | ,'f' 57 | ,'@' 58 | ,'g' 59 | ,'m' 60 | ,'a' 61 | ,'i' 62 | ,'l' 63 | ,'.' 64 | ,'c' 65 | ,'o' 66 | ,'m' 67 | ,'' 68 | ,'o' 69 | ,'o' 70 | ,'d' 71 | ,'i' 72 | ,'x' 73 | ,'a' 74 | ,'c' 75 | ,'h' 76 | ,'^J' 77 | ,'^J' 78 | ,'A' 79 | ,'A' 80 | ,'A' 81 | ,'A' 82 | ,'A' 83 | ,'g') 84 | 85 | script[[2]] <- substring(url, 1:nchar(url), 1:nchar(url)) 86 | script[[3]] <- c( 87 | '^J', 88 | 'A', 89 | 'D', 90 | 'Down Arrow', 91 | 'Down Arrow', 92 | 'Down Arrow', 93 | '^J', 94 | '', 95 | '', 96 | '', 97 | '', 98 | '', 99 | '', 100 | '', 101 | '', 102 | '', 103 | '', 104 | '', 105 | '', 106 | '' 107 | ) 108 | 109 | script[[4]] <- substring(path, 1:nchar(path), 1:nchar(path)) 110 | script[[5]] <- c( 111 | '^J', 112 | 'q', 113 | 'y' 114 | ) 115 | 116 | res <- vector() 117 | for(i in 1:length(script)){ 118 | res <- c(res, script[[i]]) 119 | } 120 | for(i in 1:length(res)){ 121 | res[i] <- paste('key', res[i]) 122 | } 123 | res <- paste(res,collapse="\n") 124 | return(res) 125 | } 126 | 127 | 128 | 129 | # Connect to db and get list of competitors to fetch data for 130 | con = dbConnect(MySQL(), user='erik', password='johansson', dbname='gt') 131 | 132 | query = 'select * from gt.keywords;' 133 | competitors <- dbGetQuery(con, query) 134 | competitors <- competitors$competitors 135 | 136 | # Create pairwise combinations of all competitors 137 | pairwise <- combn(competitors, 2) 138 | 139 | # Create download url for all combinations 140 | url=vector() 141 | for(i in 1:ncol(pairwise)){ 142 | url[i] <- URL_GT(pairwise[,i]) 143 | } 144 | 145 | # Download the files 146 | for(i in 1:length(url)){ 147 | lynx_commands <- lynx_script(url[i]) 148 | write.table(lynx_commands, '/root/gt_download', row.names=F, col.names=F, quote=F) 149 | system("lynx -cmd_script=/root/gt_download www.google.com") 150 | } 151 | 152 | # List path to files just downloaded 153 | file_path <- list.files('/root/downloads/') 154 | 155 | # Read files 156 | gt <- readGT(paste('/root/downloads/', file_path, sep='')) 157 | 158 | # Calculate the ratio between the different combinations 159 | paths <- unique(gt$Path) 160 | res <- list() 161 | for(i in 1:length(paths)){ 162 | r <- which(gt$Path==paths[i]) 163 | if(length(unique(gt$Keyword[r]))==1) next() 164 | res[[i]] <- gt[r,] %>% 165 | select(Date, Keyword, SVI) %>% 166 | dcast(Date~Keyword) %>% 167 | mutate(numerator_name=names(.)[3] 168 | , denominator_name=names(.)[2] 169 | , ratio=.[,3]/.[,2]) %>% 170 | setnames(names(.)[2:3], c('numerator', 'denominator')) 171 | } 172 | results <- data.frame(Reduce(rbind, res)) 173 | 174 | dbWriteTable(con, 'competitors', results, overwrite=T) 175 | 176 | max_date=max(results$Date) 177 | x <- results %>% filter(Date==max_date) %>% select(numerator_name, denominator_name, ratio) 178 | tmp <- x %>% select(denominator_name, numerator_name, ratio) %>% mutate(ratio=1/ratio) %>% setnames(names(.), names(x)) 179 | x <- rbind(x, tmp) 180 | 181 | # Ranking algorithm 182 | rank <- competitors 
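# How the ranking pass below works (explanatory comments only, nothing here is executed):
# the rank vector is walked five times in a bubble-sort style; each adjacent pair is
# compared via its pairwise SVI ratio in x, and the pair is swapped whenever the ratio is
# below one, i.e. the trailing competitor is searched more. If no direct ratio exists for
# a pair, the competitor that appears in more comparisons keeps its position.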
183 | for(j in 1:5){ 184 | for(i in 1:(length(rank)-1)){ 185 | participants <- rank[i:(i+1)] 186 | r <- which(x$numerator_name==participants[1] & x$denominator_name==participants[2]) 187 | if(length(r)==0){ 188 | player_ahead = which(x$numerator_name==participants[1]) 189 | player_behind = which(x$numerator_name==participants[2]) 190 | if(length(player_ahead) < length(player_behind)){ 191 | rank <- swap(rank, participants[1], participants[2]) 192 | next() 193 | } else {next()} 194 | } 195 | if(x$ratio[r]<1){ 196 | rank <- swap(rank, participants[1], participants[2]) 197 | print(paste("swapping", participants[1], participants[2])) 198 | print(rank) 199 | } 200 | } 201 | } 202 | 203 | # Create leaderboard 204 | leaderboard=as.data.frame(rank) 205 | leaderboard$pairwise_ratio <- NA 206 | 207 | for(i in 1:(nrow(leaderboard)-1)){ 208 | numerator = as.character(leaderboard$rank[i]) 209 | denominator = as.character(leaderboard$rank[i+1]) 210 | leaderboard$pairwise_ratio[i] <- x %>% filter(numerator_name==numerator, denominator_name==denominator) %>% select(ratio) %>% as.numeric 211 | } 212 | 213 | leaderboard$absolute_ratio <- NA 214 | start <- which(is.na(leaderboard$pairwise_ratio))[1] 215 | leaderboard$absolute_ratio[start]=1 216 | for(i in start:2){ #chain the pairwise ratios upward from the anchor row into absolute ratios 217 | leaderboard$absolute_ratio[i-1] <- leaderboard$absolute_ratio[i] * leaderboard$pairwise_ratio[i-1] 218 | } 219 | leaderboard$absolute_ratio[which(is.na(leaderboard$absolute_ratio))] <- 0 220 | dbWriteTable(con, 'leaderboard', leaderboard) 221 | -------------------------------------------------------------------------------- /Google Trends functions: -------------------------------------------------------------------------------- 1 | #This script automates the downloading of Google Trends data. 2 | #It works best with Firefox in combination with the Tab Mix Plus add-on that is used to automate tab closing. 3 | #Ask Firefox not to prompt for new downloads and this script should run automatically. 4 | #Google Trends restricts the number of downloads to roughly 400 at a time. 
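#Example usage once the functions below have been sourced (the keyword, country and
#download path here are illustrative assumptions, not values fixed by this file):
# url <- URL_GT("FTSE 100", country="GB", year=2012, month=1, length=3)
# filePath <- downloadGT(url, "C:/Users/erik.johansson/Downloads")
# gt <- readGT(paste("C:/Users/erik.johansson/Downloads", filePath, sep="/"))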
5 | 6 | URL_GT=function(keyword="", country=NA, region=NA, year=NA, month=1, length=3){ 7 | 8 | start="http://www.google.com/trends/trendsReport?hl=en-US&q=" 9 | end="&cmpt=q&content=1&export=1" 10 | geo="" 11 | date="" 12 | 13 | #Geographic restrictions 14 | if(!is.na(country)) { 15 | geo="&geo=" 16 | geo=paste(geo, country, sep="") 17 | if(!is.na(region)) geo=paste(geo, "-", region, sep="") 18 | } 19 | 20 | queries=keyword[1] 21 | if(length(keyword)>1) { 22 | for(i in 2:length(keyword)){ 23 | queries=paste(queries, "%2C ", keyword[i], sep="") 24 | } 25 | } 26 | 27 | #Dates 28 | if(!is.na(year)){ 29 | date="&date=" 30 | date=paste(date, month, "%2F", year, "%20", length, "m", sep="") 31 | } 32 | 33 | URL=paste(start, queries, geo, date, end, sep="") 34 | URL <- gsub(" ", "%20", URL) 35 | return(URL) 36 | } 37 | 38 | downloadGT=function(URL, downloadDir){ 39 | 40 | #Determine if download has been completed by comparing the number of files in the download directory to the starting number 41 | startingFiles=list.files(downloadDir) 42 | browseURL(URL) 43 | endingFiles=list.files(downloadDir) 44 | 45 | while(length(setdiff(endingFiles,startingFiles))==0) { 46 | Sys.sleep(3) 47 | endingFiles=list.files(downloadDir) 48 | } 49 | filePath=setdiff(endingFiles,startingFiles) 50 | return(filePath) 51 | } 52 | 53 | 54 | readGT=function(filePath){ 55 | rawFiles=list() 56 | 57 | for(i in 1:length(filePath)){ 58 | if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 59 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 60 | } 61 | 62 | output=data.frame() 63 | name=vector() 64 | 65 | for(i in 1:length(rawFiles)){ 66 | data=rawFiles[[i]] 67 | name=as.character(t(data[5,-1])) 68 | 69 | #Select the time series 70 | start=which(data[,1]=="")[1]+3 71 | stop=which(data[,1]=="")[2]-2 72 | 73 | #Skip to next if file is empty 74 | if(ncol(data)<2) next 75 | if(is.na(which(data[,1]=="")[2]-2)) next 76 | 77 | data=data[start:stop,] 78 | data[,1]=as.character(data[,1]) 79 | 80 | #Convert all columns except date column into numeric 81 | for(j in 2:ncol(data)) data[,j]=as.numeric(as.character(data[,j])) 82 | 83 | #FORMAT DATE 84 | len=nchar(data[1,1]) 85 | 86 | #Monthly data 87 | if(len==7) { 88 | data[,1]=as.Date(paste(data[,1], "-1", sep=""), "%Y-%m-%d") 89 | data[,1]=sapply(data[,1], seq, length=2, by="1 month")[2,]-1 90 | data[,1]=as.Date(data[,1], "%Y-%m-%d", origin="1970-01-01") 91 | } 92 | 93 | #Weekly data 94 | if(len==23){ 95 | data[,1]=sapply(data[,1], substr, start=14, stop=30) 96 | data[,1]=as.Date(data[,1], "%Y-%m-%d") 97 | } 98 | 99 | #Daily data 100 | if(len==10) data[,1]=as.Date(data[,1], "%Y-%m-%d") 101 | 102 | #Structure into panel data format 103 | panelData=data[1:2] 104 | panelData[3]=name[1] 105 | names(panelData)=c("Date", "SVI", "Keyword") 106 | if(ncol(data)>2) { 107 | 108 | for(j in 3:ncol(data)) { 109 | appendData=data[c(1,j)] 110 | appendData[3]=name[j-1] 111 | names(appendData)=c("Date", "SVI", "Keyword") 112 | panelData=rbind(panelData, appendData) 113 | } 114 | } 115 | 116 | #Add file name 117 | panelData[ncol(panelData)+1]=filePath[i] 118 | 119 | #Name the path column 120 | names(panelData)[4]="Path" 121 | 122 | #Merge several files into one 123 | if(i==1) output=panelData 124 | if(i>1) output=rbind(output, panelData) 125 | } 126 | return(output) 127 | } 128 | 129 | readGeoGT=function(filePath){ 130 | output=data.frame() 131 | rawFiles=list() 132 | for(i in 1:length(filePath)){ 133 | 
if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 134 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 135 | } 136 | 137 | for(i in 1:length(rawFiles)){ 138 | data=rawFiles[[i]] 139 | start=which(data[,1]=="")[3]+3 140 | stop=which(data[,1]=="")[4]-1 141 | names=data[start-1,] 142 | 143 | for(j in 1:ncol(names)) names(data)[j]=as.character(names[1,j]) 144 | data=data[start:stop,] 145 | data[,1]=as.character(data[,1]) 146 | data[,-1]=as.numeric(as.character(data[,-1])) 147 | data[ncol(data)+1]=filePath[i] 148 | 149 | output=rbind(output, data) 150 | } 151 | return(output) 152 | } 153 | 154 | 155 | readAdditionalGT = function(filePath){ 156 | 157 | output=list() 158 | rawFiles=list() 159 | for(i in 1:length(filePath)){ 160 | if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 161 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 162 | } 163 | 164 | 165 | for(file in rawFiles){ 166 | search_term = substring(as.character(file[1,1]), 22) 167 | start = grep('Top regions', file[,1]) + 2 168 | end = start + which(file[start:nrow(file),1]=="")[1] - 2 169 | tmp = file[start:end,] 170 | tmp$Type = 'Top regions' 171 | tmp$Keyword = search_term 172 | output[[length(output)+1]] = tmp 173 | 174 | start = grep('Top cities', file[,1]) + 2 175 | end = start + which(file[start:nrow(file),1]=="")[1] - 2 176 | tmp = file[start:end,] 177 | tmp$Type = 'Top cities' 178 | tmp$Keyword = search_term 179 | output[[length(output)+1]] = tmp 180 | 181 | start = grep('Top searches', file[,1]) + 2 182 | end = start + which(file[start:nrow(file),1]=="")[1] - 2 183 | tmp = file[start:end,] 184 | tmp$Type = 'Top searches' 185 | tmp$Keyword = search_term 186 | output[[length(output)+1]] = tmp 187 | 188 | start = grep('Rising searches', file[,1]) + 2 189 | end = start + which(file[start:nrow(file),1]=="")[1] - 2 190 | tmp = file[start:end,] 191 | tmp$Type = 'Rising searches' 192 | tmp$Keyword = search_term 193 | output[[length(output)+1]] = tmp 194 | } 195 | output = do.call(rbind, output) 196 | return(output) 197 | } 198 | -------------------------------------------------------------------------------- /Daily data example.R: -------------------------------------------------------------------------------- 1 | 2 | library(Rmisc) 3 | library(ggplot2) 4 | library(dplyr) 5 | 6 | # The Google Trends formatting functions ----------------------------------- 7 | 8 | #This script automates the downloading of Google Trends data. 9 | #It works best with Firefox in combination with the Tab Mix Plus add-on that is used to automate tab closing. 10 | #Ask Firefox not to prompt for new downloads and this script should run automatically. 11 | #Google Trends restricts the number of downloads to roughly 400 at a time. 
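# Approach used in this script: Google only serves daily SVI in short windows, each
# rescaled to 0-100 on its own, so we pull quarterly 3-month daily windows plus one
# weekly series spanning the whole period, then chain the windows onto the weekly
# scale via the weekly/daily ratio (the adjustment_factor computed further down).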
12 | 13 | URL_GT=function(keyword="", country=NA, region=NA, year=NA, month=1, length=3){ 14 | 15 | start="http://www.google.com/trends/trendsReport?hl=en-US&q=" 16 | end="&cmpt=q&content=1&export=1" 17 | geo="" 18 | date="" 19 | 20 | #Geographic restrictions 21 | if(!is.na(country)) { 22 | geo="&geo=" 23 | geo=paste(geo, country, sep="") 24 | if(!is.na(region)) geo=paste(geo, "-", region, sep="") 25 | } 26 | 27 | queries=keyword[1] 28 | if(length(keyword)>1) { 29 | for(i in 2:length(keyword)){ 30 | queries=paste(queries, "%2C ", keyword[i], sep="") 31 | } 32 | } 33 | 34 | #Dates 35 | if(!is.na(year)){ 36 | date="&date=" 37 | date=paste(date, month, "%2F", year, "%20", length, "m", sep="") 38 | } 39 | 40 | URL=paste(start, queries, geo, date, end, sep="") 41 | URL <- gsub(" ", "%20", URL) 42 | return(URL) 43 | } 44 | 45 | downloadGT=function(URL, downloadDir){ 46 | 47 | #Determine if download has been completed by comparing the number of files in the download directory to the starting number 48 | startingFiles=list.files(downloadDir) 49 | browseURL(URL) 50 | endingFiles=list.files(downloadDir) 51 | 52 | while(length(setdiff(endingFiles,startingFiles))==0) { 53 | Sys.sleep(3) 54 | endingFiles=list.files(downloadDir) 55 | } 56 | filePath=setdiff(endingFiles,startingFiles) 57 | return(filePath) 58 | } 59 | 60 | 61 | readGT=function(filePath){ 62 | rawFiles=list() 63 | 64 | for(i in 1:length(filePath)){ 65 | if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 66 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 67 | } 68 | 69 | output=data.frame() 70 | name=vector() 71 | 72 | for(i in 1:length(rawFiles)){ 73 | data=rawFiles[[i]] 74 | name=as.character(t(data[5,-1])) 75 | 76 | #Select the time series 77 | start=which(data[,1]=="")[1]+3 78 | stop=which(data[,1]=="")[2]-2 79 | 80 | #Skip to next if file is empty 81 | if(ncol(data)<2) next 82 | if(is.na(which(data[,1]=="")[2]-2)) next 83 | 84 | data=data[start:stop,] 85 | data[,1]=as.character(data[,1]) 86 | 87 | #Convert all columns except date column into numeric 88 | for(j in 2:ncol(data)) data[,j]=as.numeric(as.character(data[,j])) 89 | 90 | #FORMAT DATE 91 | len=nchar(data[1,1]) 92 | 93 | #Monthly data 94 | if(len==7) { 95 | data[,1]=as.Date(paste(data[,1], "-1", sep=""), "%Y-%m-%d") 96 | data[,1]=sapply(data[,1], seq, length=2, by="1 month")[2,]-1 97 | data[,1]=as.Date(data[,1], "%Y-%m-%d", origin="1970-01-01") 98 | } 99 | 100 | #Weekly data 101 | if(len==23){ 102 | data[,1]=sapply(data[,1], substr, start=14, stop=30) 103 | data[,1]=as.Date(data[,1], "%Y-%m-%d") 104 | } 105 | 106 | #Daily data 107 | if(len==10) data[,1]=as.Date(data[,1], "%Y-%m-%d") 108 | 109 | #Structure into panel data format 110 | panelData=data[1:2] 111 | panelData[3]=name[1] 112 | names(panelData)=c("Date", "SVI", "Keyword") 113 | if(ncol(data)>2) { 114 | 115 | for(j in 3:ncol(data)) { 116 | appendData=data[c(1,j)] 117 | appendData[3]=name[j-1] 118 | names(appendData)=c("Date", "SVI", "Keyword") 119 | panelData=rbind(panelData, appendData) 120 | } 121 | } 122 | 123 | #Add file name 124 | panelData[ncol(panelData)+1]=filePath[i] 125 | 126 | #Name the path column 127 | names(panelData)[4]="Path" 128 | 129 | #Merge several files into one 130 | if(i==1) output=panelData 131 | if(i>1) output=rbind(output, panelData) 132 | } 133 | return(output) 134 | } 135 | 136 | readGeoGT=function(filePath){ 137 | output=data.frame() 138 | rawFiles=list() 139 | for(i in 1:length(filePath)){ 140 | 
if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 141 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 142 | } 143 | 144 | for(i in 1:length(rawFiles)){ 145 | data=rawFiles[[i]] 146 | start=which(data[,1]=="")[3]+3 147 | stop=which(data[,1]=="")[4]-1 148 | names=data[start-1,] 149 | 150 | for(j in 1:ncol(names)) names(data)[j]=as.character(names[1,j]) 151 | data=data[start:stop,] 152 | data[,1]=as.character(data[,1]) 153 | data[,-1]=as.numeric(as.character(data[,-1])) 154 | data[ncol(data)+1]=filePath[i] 155 | 156 | output=rbind(output, data) 157 | } 158 | return(output) 159 | } 160 | 161 | 162 | # Downloading the data ---------------------------------------------------- 163 | 164 | downloadDir = "C:/Users/erik.johansson/Downloads" #set this to your browser's download directory; downloadGT below depends on it 165 | search_terms = c("bull market", "bear market", "recession") 166 | 167 | years = c(2005,2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016) 168 | months = c(1,4,7,10) 169 | res.daily=list() 170 | counter=1 171 | for(year in years){ 172 | for(month in months){ 173 | url=URL_GT(search_terms, year=year, month=month) 174 | GT_dir = downloadGT(url, downloadDir) 175 | GT_dir = paste(downloadDir, GT_dir, sep='/') 176 | res.daily[[counter]] = readGT(GT_dir) 177 | counter=counter+1 178 | } 179 | } 180 | 181 | df.daily <- do.call("rbind", res.daily) 182 | 183 | url = URL_GT(search_terms) 184 | GT_dir = downloadGT(url, downloadDir) 185 | GT_dir = paste(downloadDir, GT_dir, sep='/') 186 | df.weekly = readGT(GT_dir) 187 | 188 | 189 | # Formatting the data ------------------------------------------------------ 190 | 191 | 192 | df.merged = merge(df.daily, df.weekly, by=c('Date', 'Keyword'), all.x=T) 193 | df.merged$adjustment_factor = df.merged$SVI.y /df.merged$SVI.x 194 | 195 | for(i in search_terms){ 196 | r=which(df.merged$Keyword==i) 197 | for(j in 2:length(r)){ 198 | if(!is.finite(df.merged$adjustment_factor[r][j])){ 199 | df.merged$adjustment_factor[r][j] = df.merged$adjustment_factor[r][j-1] 200 | } 201 | } 202 | } 203 | df.merged$daily = df.merged$adjustment_factor * df.merged$SVI.x 204 | df.merged$weekly = df.merged$SVI.y 205 | for(i in search_terms){ 206 | r=which(df.merged$Keyword==i) 207 | for(j in 2:length(r)){ 208 | if(is.na(df.merged$weekly[r][j])){ 209 | df.merged$weekly[r][j] = df.merged$weekly[r][j-1] 210 | } 211 | } 212 | } 213 | 214 | 215 | # Plotting the data ------------------------------------------------------- 216 | 217 | df.merged$daily[which(is.infinite(df.merged$daily))] = NA 218 | 219 | p1 = df.merged %>% 220 | ggplot(aes(Date, daily, color=Keyword))+geom_line() 221 | 222 | p2 = df.merged %>% 223 | ggplot(aes(Date, weekly, color=Keyword))+geom_line() 224 | 225 | multiplot(p1,p2) 226 | 227 | 228 | # Saving the data --------------------------------------------------------- 229 | 230 | 231 | write.csv(df.merged,'df.merged.csv') 232 | -------------------------------------------------------------------------------- /Working example: -------------------------------------------------------------------------------- 1 | # This script automates the downloading of Google Trends data. 2 | # It works best with Firefox in combination with the Tab Mix Plus add-on that is used to automate tab closing. 3 | # Ask Firefox not to prompt for new downloads and this script should run automatically. 4 | # Google Trends restricts the number of downloads to roughly 400 at a time. 
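# Method in brief: one month of daily data is downloaded at a time (each month rescaled
# 0-100 on its own), plus the full-period weekly series; reindexGT then computes
# daily/weekly adjustment factors, forward-filling gaps, and the final series GT divides
# the daily data by those factors to put every month on the weekly scale.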
5 | # This is a fully working script for downloading daily Google Trends data for the keyword "FTSE 100" since 2004 6 | # The only thing you need to do is change downloadDir to the download directory of your default browser. 7 | # The final results will be located in the data frame "summary" 8 | # Note that this has only been tested on a Windows machine. It should work on a Mac as well. 9 | 10 | downloadDir="C:/Users/erik.johansson/Downloads" 11 | setwd(downloadDir) 12 | search_word="FTSE 100" 13 | 14 | 15 | URL_GT=function(keyword="", country=NA, region=NA, year=NA, month=1, length=3){ 16 | 17 | start="http://www.google.com/trends/trendsReport?hl=en-US&q=" 18 | end="&cmpt=q&content=1&export=1" 19 | geo="" 20 | date="" 21 | 22 | #Geographic restrictions 23 | if(!is.na(country)) { 24 | geo="&geo=" 25 | geo=paste(geo, country, sep="") 26 | if(!is.na(region)) geo=paste(geo, "-", region, sep="") 27 | } 28 | 29 | queries=keyword[1] 30 | if(length(keyword)>1) { 31 | for(i in 2:length(keyword)){ 32 | queries=paste(queries, "%2C ", keyword[i], sep="") 33 | } 34 | } 35 | 36 | #Dates 37 | if(!is.na(year)){ 38 | date="&date=" 39 | date=paste(date, month, "%2F", year, "%20", length, "m", sep="") 40 | } 41 | 42 | URL=gsub(" ", "%20", paste(start, queries, geo, date, end, sep="")) #encode spaces, as in the other URL_GT versions in this repo 43 | return(URL) 44 | } 45 | 46 | downloadGT=function(URL, downloadDir){ 47 | 48 | #Determine if download has been completed by comparing the number of files in the download directory to the starting number 49 | startingFiles=list.files(downloadDir) 50 | browseURL(URL) 51 | endingFiles=list.files(downloadDir) 52 | 53 | while(length(setdiff(endingFiles,startingFiles))==0) { 54 | Sys.sleep(3) 55 | endingFiles=list.files(downloadDir) 56 | } 57 | filePath=setdiff(endingFiles,startingFiles) 58 | return(filePath) 59 | } 60 | 61 | 62 | readGT=function(filePath){ 63 | rawFiles=list() 64 | 65 | for(i in 1:length(filePath)){ 66 | if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 67 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 68 | } 69 | 70 | output=data.frame() 71 | name=vector() 72 | 73 | for(i in 1:length(rawFiles)){ 74 | data=rawFiles[[i]] 75 | raw_name=as.character(data[1,1]) 76 | raw_name=substr(raw_name, 22, nchar(raw_name)) 77 | 78 | #Create a vector called name containing the search terms 79 | if(grepl(";", raw_name)) { 80 | separators=gregexpr(";", raw_name)[[1]] 81 | separators=c(separators, nchar(raw_name)+1) 82 | 83 | name[1]=substr(raw_name, 1, separators[1]-1) 84 | for(j in 2:length(separators)) { 85 | name[j]=substr(raw_name, separators[j-1]+2, separators[j]-1) 86 | } 87 | } else {name=raw_name} 88 | 89 | #Select the time series 90 | start=which(data[,1]=="")[1]+3 91 | stop=which(data[,1]=="")[2]-2 92 | 93 | #Skip to next if file is empty 94 | if(ncol(data)<2) next 95 | if(is.na(which(data[,1]=="")[2]-2)) next 96 | 97 | data=data[start:stop,] 98 | data[,1]=as.character(data[,1]) 99 | 100 | #Convert all columns except date column into numeric 101 | for(j in 2:ncol(data)) data[,j]=as.numeric(as.character(data[,j])) 102 | 103 | #FORMAT DATE 104 | len=nchar(data[1,1]) 105 | 106 | #Monthly data 107 | if(len==7) { 108 | data[,1]=as.Date(paste(data[,1], "-1", sep=""), "%Y-%m-%d") 109 | data[,1]=sapply(data[,1], seq, length=2, by="1 month")[2,]-1 110 | data[,1]=as.Date(data[,1], "%Y-%m-%d", origin="1970-01-01") 111 | } 112 | 113 | #Weekly data 114 | if(len==23){ 115 | data[,1]=sapply(data[,1], substr, start=14, stop=30) 116 | data[,1]=as.Date(data[,1], 
"%Y-%m-%d") 117 | } 118 | 119 | #Daily data 120 | if(len==10) data[,1]=as.Date(data[,1], "%Y-%m-%d") 121 | 122 | #Structure into panel data format 123 | panelData=data[1:2] 124 | panelData[3]=name[1] 125 | names(panelData)=c("Date", "SVI", "Keyword") 126 | if(ncol(data)>2) { 127 | 128 | for(j in 3:ncol(data)) { 129 | appendData=data[c(1,j)] 130 | appendData[3]=name[j-1] 131 | names(appendData)=c("Date", "SVI", "Keyword") 132 | panelData=rbind(panelData, appendData) 133 | } 134 | } 135 | 136 | #Add file name 137 | panelData[ncol(panelData)+1]=filePath[i] 138 | 139 | #Add path to filename 140 | names(panelData)[4]="Path" 141 | 142 | #Merge several several files into one 143 | if(i==1) output=panelData 144 | if(i>1) output=rbind(output, panelData) 145 | } 146 | return(output) 147 | } 148 | 149 | 150 | reindexGT=function(GT.daily, GT.weekly){ 151 | GT.daily=GT.daily[,order(colnames(GT.daily))] 152 | GT.weekly=GT.weekly[,order(colnames(GT.weekly))] 153 | 154 | w=match(names(GT.daily), names(GT.weekly)) 155 | w=w[!is.na(w)] 156 | d=match(names(GT.weekly), names(GT.daily)) 157 | d=d[!is.na(d)] 158 | 159 | GT.weekly=GT.weekly[,w] 160 | GT.daily=GT.daily[,d] 161 | 162 | merged=merge(GT.daily, GT.weekly, by="Date", all=T) 163 | GT.daily=merged[2:(1+((length(merged)-1))/2)] 164 | rownames(GT.daily)=merged$Date 165 | GT.weekly=merged[(2+(length(merged)-1)/2):length(merged)] 166 | rownames(GT.weekly)=merged$Date 167 | 168 | reindex=GT.daily/GT.weekly 169 | for(i in 2:length(reindex[,1])) { 170 | for(j in 1:length(reindex[1,])) { 171 | if(is.na(reindex[i,j])) { 172 | reindex[i,j]=reindex[i-1,j] 173 | } 174 | } 175 | } 176 | 177 | for(i in 2:length(GT.daily[,1])) { 178 | for(j in 1:length(GT.daily[1,])) { 179 | if(is.na(GT.daily[i,j])) { 180 | GT.daily[i,j]=GT.daily[i-1,j] 181 | } 182 | } 183 | } 184 | 185 | for(i in 2:length(GT.weekly[,1])) { 186 | for(j in 1:length(GT.weekly[1,])) { 187 | if(is.na(GT.weekly[i,j])) { 188 | GT.weekly[i,j]=GT.weekly[i-1,j] 189 | } 190 | } 191 | } 192 | 193 | output=list(reindex, GT.daily, GT.weekly) 194 | return(output) 195 | } 196 | 197 | 198 | # EXECUTION 199 | 200 | year=c(2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015) 201 | 202 | n=1 203 | trendsDir=vector() 204 | for(i in year){ 205 | for(j in 1:12){ 206 | URL=URL_GT(keyword=seach_word, year=i, month=j, length=1) 207 | trendsDir[n]=downloadGT(URL, downloadDir) 208 | n=n+1 209 | if(i==2015 && j==6) break() 210 | } 211 | } 212 | 213 | daily_GT=readGT(trendsDir) 214 | 215 | URL=URL_GT(keyword=seach_word) 216 | trendsDir_weekly=downloadGT(URL, downloadDir) 217 | weekly_GT=readGT(trendsDir_weekly) 218 | 219 | reindexing=reindexGT(daily_GT[1:2], weekly_GT[1:2]) 220 | 221 | # GT is the final results 222 | GT=reindexing[[2]]/reindexing[[1]] 223 | GT[which(!is.finite(GT[,1])),]=0 224 | 225 | # Create summary table for comparison 226 | summary=as.data.frame(matrix(NA, nrow(GT), 3)) 227 | names(summary)=c("Date", "Reindexed", "Original daily") 228 | summary[,1]=as.Date(rownames(GT)) 229 | summary[,2]=GT[,1] 230 | summary[,3]=reindexing[[2]][,1] 231 | summary=merge(summary, weekly_GT[1:2], all=T, by="Date") 232 | 233 | for(i in 2:nrow(summary)){ 234 | if(is.na(summary[i,4])) summary[i,4]=summary[i-1,4] 235 | } 236 | 237 | library(scales) 238 | 239 | plot(summary$Date, summary[,3], type="l") 240 | plot(summary$Date, summary[,4], type="l") 241 | plot(summary$Date, summary[,2], type="l") 242 | 243 | plot(summary$Date, summary[,2], type="l", col=alpha("green", 0.5), lwd=2) 244 | lines(summary$Date, 
summary[,3], type="l", col=alpha("blue", 0.8), lwd=1) 245 | lines(summary$Date, summary[,4], type="l", col=alpha("red", 0.9), lwd=2) 246 | --------------------------------------------------------------------------------
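A minimal, self-contained sketch of the reindexing idea used throughout this repo (toy numbers and made-up variable names, not real Google Trends output): each daily window arrives rescaled to 0-100 on its own, and a weekly series covering the whole period supplies the common scale.

# Two consecutive daily windows, each independently scaled to 0-100
daily <- data.frame(Date=as.Date("2015-01-01")+0:13,
                    SVI=c(40,50,100,80,60,70,90, 100,80,60,50,40,70,90))

# A weekly series spanning both windows on a single scale
weekly <- data.frame(Date=as.Date(c("2015-01-03", "2015-01-10")), SVI=c(25, 100))

# Match the weekly points onto the daily dates and compute the adjustment factor
merged <- merge(daily, weekly, by="Date", all.x=TRUE, suffixes=c(".daily", ".weekly"))
merged$factor <- merged$SVI.weekly / merged$SVI.daily

# Forward-fill the factor; days before the first weekly observation stay NA,
# mirroring how the scripts above treat the start of the sample
for(i in 2:nrow(merged)) if(is.na(merged$factor[i])) merged$factor[i] <- merged$factor[i-1]

# Daily SVI re-expressed on the weekly scale
merged$rescaled <- merged$SVI.daily * merged$factor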