├── README.md ├── Startup 100 ├── create_gt_url.js ├── Connect to DB ├── Rescale ├── Google trends batch service.R ├── 2. Scoring GT ├── Daily in percent ├── Most googled by state ├── Searches by state ├── 1. Find winner ├── Execution ├── competitor_tiers.R ├── lynx automate script ├── Google Trends functions ├── Daily data example.R └── Working example /README.md: -------------------------------------------------------------------------------- 1 | Google-Trends 2 | ============= 3 | -------------------------------------------------------------------------------- /Startup 100: -------------------------------------------------------------------------------- 1 | library(XML) 2 | library(RODBC) 3 | startup100_URL="http://www.startup100.net" 4 | tables=readHTMLTable(startup100_URL) 5 | table=tables[[1]] 6 | names(table)=c("Change %", "Company", "S100 Index", "Change", "Index rank", "Category", "Profiles", "Y-tunnus") 7 | table=table[c(-1, -5)] 8 | startup100=table 9 | 10 | connector=odbcConnect("ejjohans_boostery", uid="ejjohans_boost", pwd="***") 11 | 12 | sqlDrop(connector, "startup100", errors=FALSE) 13 | sqlSave(connector, startup100) 14 | 15 | close(connector) 16 | -------------------------------------------------------------------------------- /create_gt_url.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | 4 | function URL_GT(keyword, country, region, year, month, length){ 5 | 6 | var start = "http://www.google.com/trends/trendsReport?hl=en-US&q="; 7 | var end = "&cmpt=q&content=1&export=1"; 8 | var geo = ""; 9 | var date = ""; 10 | var URL = ""; 11 | if(typeof month==="undefined") month = 1; //defaults that no longer clobber caller-supplied values 12 | if(typeof length==="undefined") length = 3; 13 | 14 | 15 | //Geographic restrictions 16 | if(typeof country!=="undefined") { 17 | geo="&geo="; 18 | geo=geo + country; 19 | if(region!==undefined) geo=geo + "-" + region; 20 | } 21 | 22 | if(typeof keyword==="string"){ 23 | var queries=keyword; 24 | } 25 | 26 | if(typeof keyword==="object"){ 27 | var queries=keyword[0]; 28 | for(var i=1; i < keyword.length; i++){ 29 | queries=queries + "%2C" + keyword[i]; 30 | } 31 | } 32 | 33 | //Dates 34 | if(typeof year!=="undefined"){ 35 | date="&date="; 36 | date=date + month + "%2F" + year + "%20" + length + "m"; 37 | } 38 | 39 | URL = start + queries + geo + date + end; 40 | URL = URL.replace(/ /g, "%20"); //replace every space, not just the first match 41 | return(URL); 42 | } 43 | -------------------------------------------------------------------------------- /Connect to DB: -------------------------------------------------------------------------------- 1 | #RODBC documentation: http://cran.r-project.org/web/packages/RODBC/RODBC.pdf 2 | #http://cran.r-project.org/web/packages/RODBC/vignettes/RODBC.pdf 3 | #Check the database connection using run>odbcad32 4 | 5 | library(RODBC) 6 | downloadDir="C:/Users/erik.johansson/Downloads" 7 | setwd(downloadDir) 8 | forever="yes" 9 | 10 | while(forever=="yes") { 11 | gt_connector=odbcConnect("ejjohans_googletrends", uid="ejjohans_gt", pwd="***") 12 | 13 | #Check that the table is available in R 14 | sqlTables(gt_connector, tableType="TABLE") 15 | 16 | #Get the keyword from the database 17 | start_keyword=as.character(sqlFetch(gt_connector, "keyword_table")[1,1]) 18 | keyword=start_keyword 19 | #Poll until a new keyword is written to the table 20 | while(start_keyword==keyword) { 21 | keyword=as.character(sqlFetch(gt_connector, "keyword_table")[1,1]) 22 | Sys.sleep(5) 23 | } 24 | 25 | #Create URL 26 | URL=URL_GT(keyword) 27 | filePath=downloadGT(URL, downloadDir) 28 | data=readGT(filePath) 29 | data=data[,c(3,1,2)] 30 | output=data 31 | 32 | sqlDrop(gt_connector, 
"output", errors=FALSE) 33 | sqlDrop(gt_connector, "geodata", errors=FALSE) 34 | 35 | sqlSave(gt_connector, output) 36 | 37 | geodata=readGeoGT(filePath) 38 | names(geodata)=c("Region", "SVI", "Source") 39 | 40 | sqlSave(gt_connector, geodata) 41 | 42 | } 43 | close(gt_connector) 44 | -------------------------------------------------------------------------------- /Rescale: -------------------------------------------------------------------------------- 1 | library(quantmod) 2 | 3 | downloadDir="C:/Users/erik.johansson/Downloads" 4 | setwd(downloadDir) 5 | year=c(2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013) 6 | 7 | GT.ts=list() 8 | n=1 9 | for(i in year){ 10 | for(j in 1:12){ 11 | URL=URL_GT(keyword="FTSE 100", year=i, month=j, length=3) 12 | tempDir=downloadGT(URL, downloadDir) 13 | data=read.csv(tempDir, header=F, blank.lines.skip=F) 14 | start=which(data[,1]=="")[1]+3 15 | stop=which(data[,1]=="")[2]-1 16 | data=data[start:stop,] 17 | data[,1]=as.Date(data[,1], "%Y-%m-%d") 18 | data[,2]=as.numeric(as.character(data[,2])) 19 | rownames(data)=data[,1] 20 | data[3]=NA 21 | 22 | #Calculate % change 23 | for(k in 2:nrow(data)){ 24 | data[k, 3]=data[k,2]/data[k-1,2] 25 | } 26 | GT.ts[[n]]=data 27 | n=n+1 28 | } 29 | } 30 | 31 | output=as.xts(GT.ts[[1]][3]) 32 | 33 | for(i in 2:length(GT.ts)){ 34 | data=as.xts(GT.ts[[i]][3]) 35 | output=merge(output, data) 36 | output$new=NA 37 | 38 | for(j in 1:nrow(output)){ 39 | if(is.finite(mean(output[j,],na.rm=T))) output[j,3]=mean(output[j,],na.rm=T) 40 | } 41 | output=output[,3] 42 | names(output)="d_svi" 43 | } 44 | 45 | #Start value 46 | startvalue=100 47 | series=vector() 48 | series[1]=100 49 | for(i in 2:nrow(output)){ 50 | series[i]=series[i-1]*output[i,] 51 | } 52 | -------------------------------------------------------------------------------- /Google trends batch service.R: -------------------------------------------------------------------------------- 1 | # Here, assuming we only deal with daily, non-comparable SVI 2 | search_terms = c('euromicron', 'TC Unterhaltungstechnik', 'SGL Carbon', 'zooplus', 'TUI', 'Borussia Dortmund', 'EUCA', 'TCU', 'SGL', 'ZO1', 'TUI1', 'BVB') 3 | frequency = 'daily' 4 | comparable = TRUE 5 | country = NA 6 | region = NA 7 | year = NA 8 | 9 | years = c(2004,2005,2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016) 10 | months = c(1,4,7,10) 11 | length = 3 12 | 13 | url=vector() 14 | counter = 1 15 | for(search_term in search_terms){ 16 | for(year in years){ 17 | for(month in months){ 18 | if(year == as.numeric(substr(as.character(Sys.time()), 1, 4)) & month > as.numeric(substr(as.character(Sys.time()), 6, 7))){ 19 | next() # This stops us from creating URLs for dates that don't exist. 20 | } 21 | url[counter]=URL_GT(keyword=search_term, year=year, month=month) 22 | counter = counter + 1 23 | } 24 | } 25 | } 26 | 27 | for(search_term in search_terms){ 28 | url[counter]=URL_GT(search_term) 29 | counter = counter + 1 30 | } 31 | 32 | for(i in 1:length(url)){ 33 | lynx_commands <- lynx_script(url[i]) # Create the lynx script 34 | write.table(lynx_commands, '/root/gt_download', row.names=F, col.names=F, quote=F) # Save the lynx script 35 | system("lynx -cmd_script=/root/gt_download www.google.com") # Execute the lynx script (takes a while, be patient) 36 | } 37 | -------------------------------------------------------------------------------- /2. 
Scoring GT: -------------------------------------------------------------------------------- 1 | scoringGT=function(players, competitionsGToutput){ 2 | #"players" must be a data frame with at least one column 3 | scoreBoard=data.frame() 4 | 5 | if(nrow(competitionsGToutput[-which(competitionsGToutput[,1]=="Winner"),])>0){ #drop the "Winner" summary rows that competitionGT appends 6 | competitionsGToutput=competitionsGToutput[-which(competitionsGToutput[,1]=="Winner"),] 7 | } 8 | 9 | competitionsGToutput[,1]=as.numeric(competitionsGToutput[,1]) 10 | 11 | 12 | for(i in 1:max(competitionsGToutput[1:(nrow(competitionsGToutput)),1])) { 13 | batch=competitionsGToutput[which(competitionsGToutput[,1]==i),] 14 | highScore=max(batch[,3]) 15 | roundWinner=batch[which(batch[,3]==highScore),] 16 | 17 | batch[,4]=batch[,3]/roundWinner[,3] 18 | colnames(batch)[4]="Multiplier" 19 | 20 | scoreBoard=rbind(scoreBoard, batch) 21 | colnames(scoreBoard)=c("Round", "Player", "Score", "Multiplier") 22 | } 23 | 24 | M=as.data.frame(matrix(NA, nrow(players), nrow(players))) 25 | iterationMax=sum(is.na(M)) 26 | colnames(M)=players[,1] 27 | rownames(M)=players[,1] 28 | 29 | for(j in 1:max(scoreBoard[,1])){ 30 | batch=scoreBoard[which(scoreBoard[,1]==j),] 31 | 32 | for(i in 1:nrow(batch)){ 33 | place=which(tolower(rownames(M))==batch[i,2]) 34 | roundWinner=batch[which(batch[,4]==1),2] 35 | M[place, which(tolower(colnames(M))==roundWinner)]=batch[i,4] 36 | } 37 | } 38 | 39 | for(y in 1:nrow(players)){ #fill in reciprocals: B's score against A is 1/(A's score against B) 40 | for(x in 1:nrow(players)){ 41 | if(is.na(M[y,x])) M[y,x]=1/M[x,y] 42 | M[y,y]=1 43 | } 44 | } 45 | 46 | while((iterationMax-sum(is.na(M)))>0){ #propagate scores transitively (M[y,i]=M[y,x]/M[i,x]) until a full pass fills nothing new 47 | iterationMax=sum(is.na(M)) 48 | for(i in 1:nrow(players)){ 49 | for(x in 1:nrow(players)){ 50 | for(y in 1:nrow(players)){ 51 | if(is.na(M[y,i])) M[y,i]=M[y,x]/M[i,x] 52 | } 53 | } 54 | } 55 | } 56 | 57 | return(M) 58 | } 59 | -------------------------------------------------------------------------------- /Daily in percent: -------------------------------------------------------------------------------- 1 | #Create download paths 2 | 3 | year=c(2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014) 4 | output=vector() 5 | downloadDir="C:/Users/erik.johansson/Downloads" 6 | setwd(downloadDir) 7 | for(i in year){ 8 | for(j in 1:12){ 9 | URL=URL_GT("ftse 100", year=i, month=j, length=2) 10 | output=append(output, URL) 11 | } 12 | } 13 | 14 | #Create a table to store the output 15 | URL=output 16 | gt_results=data.frame(as.Date("10.1.2004", "%d.%m.%Y"), NA, NA, NA, 1) 17 | colnames(gt_results)=c("Date", "SVI", "Company", "Path", "Percentage") 18 | 19 | for(i in 1:length(URL)){ 20 | #Download file 21 | gt_path=downloadGT(URL[i], downloadDir) 22 | 23 | #Format csv 24 | gt_data=readGT(gt_path) 25 | 26 | #Increment all by one to make percentage calculation possible 27 | gt_data[,2]=gt_data[,2]+1 28 | 29 | gt_data[which(is.na(gt_data[,2])),2]=1 30 | gt_data[5]=NA 31 | names(gt_data)[5]="Percentage" 32 | 33 | #Calculate percentage change 34 | for(j in 2:nrow(gt_data)){ 35 | gt_data$Percentage[j]=gt_data$SVI[j]/gt_data$SVI[j-1] 36 | } 37 | 38 | #Find first instance of date overlap in the new file 39 | date_match=which(gt_data$Date==gt_results[nrow(gt_results),1]) 40 | 41 | #To ensure that we haven't skipped a date (since the data might be on a weekly level in some cases) we do the same check for the results data 42 | date_match_results=which(gt_results$Date==gt_results[nrow(gt_results),1]) 43 | 44 | if(length(date_match)>0) { 45 | gt_data_subset=gt_data[(date_match+1):nrow(gt_data),] 46 | gt_results=gt_results[1:date_match_results,] 47 | } else 
{gt_data_subset=gt_data} 48 | colnames(gt_data_subset)=c("Date", "SVI", "Company", "Path", "Percentage") 49 | gt_results=rbind(gt_results, gt_data_subset) 50 | } 51 | -------------------------------------------------------------------------------- /Most googled by state: -------------------------------------------------------------------------------- 1 | queries="" 2 | downloadDir="C:/Users/erik.johansson/Downloads" 3 | country="US" 4 | paths=data.frame() 5 | states=c('AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY') 6 | states2=read.csv("C:/Users/erik.johansson/Dropbox/Google Trends/States/states.csv", sep=";") 7 | GT.raw=list() 8 | 9 | 10 | for(i in 1:length(states)){ 11 | URL=paste("http://www.google.com/trends/trendsReport?hl=en-US", "&geo=", country, "-", states[i], "&q=", queries, "&cmpt=q&content=1&export=1", sep="") 12 | 13 | #Get the file path for the csv 14 | startingFiles=list.files(downloadDir) 15 | endingFiles=list.files(downloadDir) 16 | browseURL(URL) 17 | while(length(setdiff(endingFiles,startingFiles))==0) { 18 | Sys.sleep(3) 19 | endingFiles=list.files(downloadDir) 20 | } 21 | filePath=setdiff(endingFiles,startingFiles) 22 | paths[i,1]=states[i] 23 | paths[i,2]=filePath 24 | } 25 | 26 | for(i in 1:nrow(paths)){ 27 | filePath=paste(downloadDir, "/", paths[i,2], sep="") 28 | GT.raw[[i]]=read.csv(filePath, header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE) 29 | } 30 | 31 | summary=NA 32 | output=as.data.frame(matrix(NA, nrow=0, ncol=3)) 33 | for(i in 1:length(GT.raw)){ 34 | start=which(GT.raw[[i]][,1]=="")[3]+2 35 | stop=which(GT.raw[[i]][,1]=="")[4]-1 36 | data=GT.raw[[i]][start:stop,, drop=F] 37 | topList=as.numeric(as.character(data[,1])) 38 | summary=data[which(is.na(topList)),,drop=F] 39 | summary[2]=topList[-which(is.na(topList))] 40 | summary[3]=tolower(states2[i,1]) 41 | output=rbind(output,summary) 42 | } 43 | -------------------------------------------------------------------------------- /Searches by state: -------------------------------------------------------------------------------- 1 | celebrities=c('Snooki', 'Paris Hilton', 'Nicole Polizzi', 'Miley Cyrus', 'Lindsay Lohan', 'Kris Jenner', 'Kourtney Kardashian', 'Kim Kardashian', 'Khloe Kardashian', 'Justin Bieber', 'Honey Boo Boo') 2 | downloadDir="C:/Users/erik.johansson/Downloads" 3 | 4 | scoringTable=function(data=data.frame()){ 5 | summary=as.data.frame(matrix(NA, nrow=min(1, (ncol(data)-1)), ncol=5)) 6 | for(i in 1:(ncol(data)-1)) summary[i,2]=data[which(data[,1]=="")[1]+2,i+1] 7 | for(i in 1:(ncol(data)-1)) summary[i,3]=data[which(data[,1]=="")[2]-2,i+1] 8 | summary[,3]=as.numeric(as.character(summary[,3])) 9 | highScore=max(summary[,3]) 10 | roundWinner=summary[which(summary[,3]==highScore),1] 11 | summary[,4]=summary[,3]/highScore 12 | summary[which(summary[,3]==highScore),5]="Round winner" 13 | summary[,1]=data[2,1] 14 | 15 | return(summary) 16 | } 17 | 18 | 19 | mostGoogled=function(query, downloadDir){ 20 | country="US" 21 | states=c('AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY') 22 | 
states=states[4:14] #NB: only states 4-14 are used here; remove this line to cover all 50 states 23 | 24 | #Initialize the score board 25 | scoreBoard=as.data.frame(matrix(NA, 0, 6)) #Main score board 26 | 27 | for(i in 1:length(states)){ 28 | #roundWinner requires initial value 29 | roundWinner=query[1] 30 | for(j in seq(1,length(query), by=4)){ #step by 4: the round winner fills the fifth slot, so stepping by 5 would skip every fifth term 31 | queries=roundWinner 32 | for(k in 1:4) if(!is.na(query[j+k])) queries=paste(queries, "%2C ", query[j+k], sep="") 33 | URL=paste("http://www.google.com/trends/trendsReport?hl=en-US", "&geo=", country, "-", states[i], "&q=", queries, "&cmpt=q&content=1&export=1", sep="") 34 | 35 | #Get the file path for the csv 36 | startingFiles=list.files(downloadDir) 37 | endingFiles=list.files(downloadDir) 38 | browseURL(URL) 39 | while(length(setdiff(endingFiles,startingFiles))==0) { 40 | Sys.sleep(3) 41 | endingFiles=list.files(downloadDir) 42 | } 43 | filePath=setdiff(endingFiles,startingFiles) 44 | 45 | #Read the csv 46 | error_handler=tryCatch( 47 | read.csv(paste(downloadDir, "/", filePath, sep=""), header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE), 48 | error=function(e) e 49 | ) 50 | 51 | if(inherits(error_handler, "error")){ 52 | Sys.sleep(8) 53 | } 54 | 55 | trendsData=read.csv(paste(downloadDir, "/", filePath, sep=""), header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE) 56 | score=scoringTable(trendsData) 57 | score[ncol(score)+1]=filePath 58 | roundWinner=score[which(score[,5]=="Round winner"),2][1] 59 | scoreBoard=rbind(scoreBoard, score) 60 | } 61 | } 62 | return(scoreBoard) 63 | } 64 | 65 | scoreBoard=mostGoogled(celebrities, downloadDir) 66 | -------------------------------------------------------------------------------- /1. Find winner: -------------------------------------------------------------------------------- 1 | 2 | competitionGT=function(players, path, download=TRUE){ 3 | 4 | #Set errortest default value 5 | errortest=NA 6 | 7 | #Initialize file count 8 | k=1 9 | 10 | #Create file references 11 | maxFiles=2000 12 | filenames=vector() 13 | filenames[1]=paste(path, "/report.csv", sep="") 14 | for(i in 1:maxFiles) filenames[i+1]=paste(path, "/report(", i, ").csv", sep="") 15 | 16 | scoreBoard=as.data.frame(matrix(NA, 0, 3)) 17 | colnames(scoreBoard)=c("Round", "Player", "Score") 18 | 19 | 20 | 21 | #Set initial value for round winner 22 | roundWinner=as.character(players[1,1]) 23 | 24 | #####START THE COMPETITION 25 | start=2 26 | 27 | if(download==TRUE) {stop=nrow(players) 28 | } else {stop=length(list.files(path))} 29 | 30 | for(i in seq(start, stop, by=4)){ 31 | 32 | #roundWinner always participates in the competition 33 | query=roundWinner 34 | 35 | #Select batch of four players to go up against the roundWinner. The min function ensures that there are no empty slots 36 | batch=players[i:min((i+3),nrow(players)),1] 37 | 38 | #If the files haven't been downloaded already, create Google Trends query, download and import file 39 | if(download==TRUE){ 40 | 41 | #File selector k 42 | k=length(list.files(path))+1 43 | 44 | for(j in 1:length(batch)) query=paste(query, "%2C ", batch[j], sep="") 45 | URL=paste("http://www.google.com/trends/trendsReport?hl=en-US&q=", query,"&cmpt=q&content=1&export=1", sep="") 46 | browseURL(URL) 47 | 48 | 49 | Sys.sleep(4) 50 | 51 | #Error handler. Try to read the file. If it fails, wait 8 seconds. If it fails again, wait 8 seconds more. 
52 | errortest=tryCatch( 53 | read.csv(filenames[k], header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE), 54 | error=function(e) e 55 | ) 56 | 57 | if(inherits(errortest, "error")){ 58 | Sys.sleep(8) 59 | } 60 | 61 | errortest=tryCatch( 62 | read.csv(filenames[k], header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE), 63 | error=function(e) e 64 | ) 65 | 66 | if(inherits(errortest, "error")){ 67 | Sys.sleep(8) 68 | } 69 | } 70 | 71 | #Make sure that the files can be loaded. If not, skip this iteration 72 | if(!inherits(errortest, "error")){ 73 | players.raw=read.csv(filenames[k], header=FALSE, blank.lines.skip=FALSE, stringsAsFactors=FALSE) 74 | 75 | 76 | #Initialize results 77 | results=data.frame(matrix(NA, nrow=ncol(players.raw)-1, ncol=3)) 78 | results[,1]=k 79 | for(i in 1:(ncol(players.raw)-1)) results[i,2]=players.raw[5,i+1] 80 | for(i in 1:(ncol(players.raw)-1)) results[i,3]=as.numeric(as.character(players.raw[(which(players.raw=="")[2]-2),i+1])) 81 | 82 | highScore=max(results[,3]) 83 | 84 | colnames(results)=c("Round", "Player", "Score") 85 | 86 | roundWinner=results[which(results[,3]==highScore),2] 87 | scoreBoard=rbind(scoreBoard, results) 88 | 89 | k=k+1 #Move file selector one step forward 90 | 91 | 92 | winner=data.frame("Winner", roundWinner, highScore) 93 | colnames(winner)=c("Round", "Player", "Score") 94 | scoreBoard=rbind(scoreBoard, winner) 95 | } 96 | } 97 | return(scoreBoard) 98 | } 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /Execution: -------------------------------------------------------------------------------- 1 | setwd("C:/Users/erik.johansson/Dropbox/Google Trends") 2 | 3 | pathData.capitals="Capitals/GT data" 4 | pathPlayers.capitals="Capitals/European capitals.csv" 5 | 6 | pathData.brands="Top 10 brands/Data" 7 | pathPlayers.brands="Top 10 brands/500 most valuable.csv" 8 | 9 | pathData.slush="Coming to Slush/Data" 10 | pathPlayers.slush="Coming to Slush/companies.csv" 11 | 12 | #pathPlayers=pathPlayers.capitals 13 | #pathData=pathData.capitals 14 | #pathPlayers=pathPlayers.brands 15 | #pathData=pathData.brands 16 | pathPlayers=pathPlayers.slush 17 | pathData=pathData.slush 18 | 19 | players=read.csv(pathPlayers, sep=";") 20 | podiumSize=3 21 | 22 | 23 | #For downloading the files 24 | round=list() 25 | for(i in 1:podiumSize){ 26 | round[[i]]=competitionGT(players, pathData, download=TRUE) 27 | players=players[-which(tolower(players[,1])==round[[i]][nrow(round[[i]]),2]),] 28 | } 29 | 30 | ###Once the files have been downloaded, we can import them again by setting the download option to FALSE 31 | scoreBoard=data.frame(1:10) 32 | for(i in 1:podiumSize) scoreBoard[i,2]=round[[i]][which(round[[i]][,1]=="Winner"),2] 33 | colnames(scoreBoard)=c("Rank", "Capital") 34 | 35 | #Once the data has been downloaded, we can import and order the data using the scoringGT function 36 | data=competitionGT(players, pathData, download=FALSE) 37 | #winner="London" 38 | winner="Facebook" 39 | 40 | scoringMatrix=scoringGT(players, data) 41 | scoreBoard=scoringMatrix[,which(colnames(scoringMatrix)==winner), drop=FALSE] 42 | scoreBoard=scoreBoard[-which(is.na(scoreBoard[,1])),,drop=FALSE] 43 | colnames(scoreBoard)="Score" 44 | scoreBoard=scoreBoard[order(scoreBoard, decreasing=TRUE),, drop=FALSE] 45 | 46 | #Now we have a ranking of the cities. 
Finally, let's download the individual files and create a nice graph of them all combined 47 | library(quantmod) 48 | path="C:/Users/erik.johansson/Dropbox/Google Trends/Capitals/Individual data" 49 | #downloadWeeklyGT(as.character(players[,1])) 50 | GT.raw=importGT(path) 51 | GT.summary=summaryTable(GT.raw) 52 | GT.f=formatGT(GT.raw, GT.summary) 53 | GT.m=mergeGT(GT.f) 54 | rownames(GT.m)=as.Date(GT.m[,1], "%Y-%m-%d") 55 | GT.date=GT.m[,1] 56 | GT.m=GT.m[-1] 57 | GT.m=as.xts(GT.m) 58 | index(GT.m)=GT.date 59 | GT.m=na.approx(GT.m) 60 | 61 | GT.return=as.data.frame(matrix(NA, nrow(GT.m), ncol(GT.m))) 62 | rownames(GT.return)=index(GT.m) 63 | colnames(GT.return)=colnames(GT.m) 64 | 65 | for(i in 1:ncol(GT.m)){ 66 | GT.return[i]=Delt(GT.m[,i]) 67 | } 68 | 69 | GT.final=as.data.frame(matrix(NA, nrow(GT.m), nrow(scoreBoard))) 70 | colnames(GT.final)=rownames(scoreBoard) 71 | rownames(GT.final)=index(GT.m) 72 | 73 | #Athens got lost along the way, so let's adjust for that 74 | GT.return=GT.return[,-which(colnames(GT.return)=="athens")] 75 | GT.return=GT.return[,order(colnames(GT.return))] 76 | rownames(GT.return)=index(GT.m) 77 | rownames(GT.final)=as.Date(index(GT.m), "%Y-%m-%d") 78 | 79 | GT.final[nrow(GT.final),]=scoreBoard[,1]*100 80 | GT.final=GT.final[,order(colnames(GT.final))] 81 | 82 | for(i in 1:(nrow(GT.final)-1)){ #walk back from the anchored last row; stopping at nrow-1 keeps the index from hitting row 0 83 | GT.final[nrow(GT.final)-i,]=as.numeric(GT.final[nrow(GT.final)-i+1,])/(1+as.numeric(GT.return[nrow(GT.return)-i+1,])) 84 | } 85 | 86 | #Then let's plot the winners 87 | batch=rownames(scoreBoard)[1:10] 88 | numbers=1:nrow(GT.final) 89 | plots=list() 90 | for(i in 1:10){ 91 | winner=which(colnames(GT.final)==batch[i]) 92 | plots[[i]]=smooth.spline(as.Date(rownames(GT.final), "%Y-%m-%d"), GT.final[,winner], spar=0.35) 93 | } 94 | 95 | temp=data.frame(plots[[1]]$x) 96 | temp[,2]=plots[[1]]$y 97 | temp[1,2]=0 98 | plot(as.Date(temp[,1]), temp[,2], type="l", col="white", xlab="Date", ylab="Search Volume", main="Most popular capitals") 99 | for(i in 1:10){ 100 | lines(plots[[i]], col=i) 101 | } 102 | text(locator(), labels = batch[1:5]) #interactive: click the plot to place the labels 103 | -------------------------------------------------------------------------------- /competitor_tiers.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(ggplot2) #needed for the ggplot calls below; URL_GT, downloadGT, readGT and write.cb come from this repo's "Google Trends functions" and the general helper functions (see the lynx automate script for how they are sourced) 3 | company <- 'transferwise' 4 | 5 | competitors <- c('moneygram', 6 | 'western union', 7 | 'fairfx', 8 | 'caxton fx', 9 | 'worldfirst', 10 | 'worldremit', 11 | 'currencyfair', 12 | 'transfergo', 13 | 'tawipay', 14 | 'xoom', 15 | 'transfast', 16 | 'remitly', 17 | 'ria money transfer', 18 | 'azimo', 19 | 'moneycorp', 20 | 'ukforex', 21 | 'hifx', 22 | 'post office money', 23 | 'transferwise', 24 | 'revolut') 25 | 26 | #NB: this second vector overwrites the remittance list above; keep whichever set you want to rank 27 | competitors <- c('ZhongAn', 28 | 'oscar health', # NB: distinct from The Oscars 29 | 'wealthfront', 30 | 'qufenqi', 31 | 'funding circle', 32 | 'kreditech', 33 | 'avant', 34 | 'atom bank', 35 | 'klarna', 36 | 'our crowd', 37 | 'lufax', 38 | 'robinhood', 39 | '%2Fm%2F0by16yq', # Square 40 | 'motif investing', 41 | 'xero', 42 | 'stripe', 43 | 'collective health', 44 | 'credit karma', 45 | 'adyen', 46 | 'personal capital', 47 | 'secure key technologies ', 48 | 'betterment', 49 | 'kabbage', 50 | 'lending club', 51 | 'prosper', 52 | 'coinbase', 53 | 'izettle', 54 | 'policybazaar', 55 | 'knip', 56 | 'affirm', 57 | 'circleup', 58 | 'iex ', 59 | 'prospa', 60 | 'etoro', 61 | 'spotcap', 62 | 'jimubox', 63 | 'transferwise', 64 | 'rong360', 65 | '21inc', 66 | 'coverfox', 67 | 'angellist') 68 | 69 | 
competitors=tolower(competitors) 70 | rank_table <- data.frame(competitors=competitors, batch = ceiling(seq(1, length(competitors),1)/4), stringsAsFactors=F) 71 | downloadDir = '/users/erik.johansson/downloads' 72 | res = list() 73 | for(i in 1:max(rank_table$batch)){ 74 | r = which(rank_table$batch == i) 75 | keywords = c(rank_table$competitors[r], company) 76 | url = URL_GT(keywords, country='GB') 77 | GT_dir = downloadGT(url, downloadDir) 78 | GT_dir = paste(downloadDir, GT_dir, sep='/') 79 | res[[i]] = readGT(GT_dir) 80 | } 81 | res.normalised = list() 82 | for(i in 1:length(res)){ 83 | print(i) 84 | res.normalised[[i]] = res[[i]] 85 | r <- which(res[[i]]$Keyword==company) 86 | res.company <- res[[i]][r,] 87 | keywords = unique(res[[i]]$Keyword) 88 | 89 | for(j in 1:length(keywords)){ 90 | print(paste("j", j)) 91 | s = which(res[[i]]$Keyword == keywords[j]) 92 | res.normalised[[i]]$SVI[s] = res[[i]]$SVI[s] / res.company$SVI 93 | } 94 | } 95 | 96 | df <- do.call("rbind", res.normalised) 97 | 98 | df %>% ggplot(aes(Date, SVI,color=Keyword))+geom_line() 99 | 100 | 101 | df.max <- df[which(df$Date==max(df$Date)),] 102 | df.max <-df.max[!duplicated(df.max[-4]),] 103 | rank_table <- merge(rank_table, df.max[c(3,2)], by.x='competitors', by.y='Keyword') %>% unique 104 | rank_table <- rank_table[order(rank_table$SVI, decreasing=T),] 105 | rank_table <- rank_table[is.finite(rank_table$SVI),] 106 | deviation <- rank_table$SVI-mean(rank_table$SVI) 107 | top_tier <- which(deviation > sd(rank_table$SVI)/2) 108 | bottom_tier <- which(deviation < -sqrt(sd(rank_table$SVI))/5) 109 | rank_table$tier <- 'mid_tier' 110 | rank_table$tier[top_tier] <- 'top_tier' 111 | rank_table$tier[bottom_tier] <- 'bottom_tier' 112 | 113 | top_tier_competitors <- rank_table$competitors[which(rank_table$tier == 'top_tier')] 114 | df[which(df$Keyword %in% top_tier_competitors),] %>% filter(Date > '2015-01-01') %>% ggplot(aes(Date, SVI,color=Keyword))+geom_line() 115 | 116 | mid_tier_competitors <- rank_table$competitors[which(rank_table$tier == 'mid_tier')] 117 | df[which(df$Keyword %in% mid_tier_competitors),] %>% filter(Date > '2015-01-01') %>% ggplot(aes(Date, SVI,color=Keyword))+geom_line() 118 | 119 | bottom_tier_competitors <- rank_table$competitors[which(rank_table$tier == 'bottom_tier')] 120 | df[which(df$Keyword %in% bottom_tier_competitors),] %>% filter(Date > '2015-01-01') %>% ggplot(aes(Date, SVI,color=Keyword))+geom_line() 121 | 122 | df[-which(df$Keyword %in% bottom_tier_competitors),] %>% filter(Date > '2014-01-01') %>% ggplot(aes(Date, SVI,color=Keyword))+geom_line() 123 | 124 | write.cb(rank_table) 125 | -------------------------------------------------------------------------------- /lynx automate script: -------------------------------------------------------------------------------- 1 | 2 | # Remove old files from directory 3 | system('rm /root/downloads/*.*') 4 | library(data.table) 5 | library(RCurl) 6 | library(RMySQL) 7 | library(dplyr) 8 | library(reshape2) 9 | functions <- getURL("https://raw.githubusercontent.com/321k/R-Helper-Functions/master/general%20helper%20functions") 10 | eval(parse(text=functions)) 11 | functions <- getURL("https://raw.githubusercontent.com/321k/Google-Trends/master/Google%20Trends%20functions") 12 | eval(parse(text=functions)) 13 | 14 | 15 | 16 | swap <- function(x, winner, loser){ 17 | x_copy <- x 18 | w <- which(x==winner) 19 | l <- which(x==loser) 20 | x[w] <- x_copy[l] 21 | x[l] <- x_copy[w] 22 | return(x) 23 | } 24 | 25 | 26 | # Function that creates the instructions 
for lynx to fetch data 27 | lynx_script <- function(url){ 28 | file_name <- paste('google_trends_download_', Sys.time(), '.csv.gz', sep='') 29 | path <- paste('/root/downloads/', file_name, sep='') 30 | script <- list() 31 | script[[1]] <- c('A' 32 | ,'Down Arrow' 33 | ,'Down Arrow' 34 | ,'Down Arrow' 35 | ,'Down Arrow' 36 | ,'Down Arrow' 37 | ,'Down Arrow' 38 | ,'Down Arrow' 39 | ,'Down Arrow' 40 | ,'Down Arrow' 41 | ,'Down Arrow' 42 | ,'^J' 43 | ,'A' 44 | ,'a' 45 | ,'n' 46 | ,'t' 47 | ,'o' 48 | ,'n' 49 | ,'.' 50 | ,'m' 51 | ,'a' 52 | ,'i' 53 | ,'n' 54 | ,'h' 55 | ,'o' 56 | ,'f' 57 | ,'@' 58 | ,'g' 59 | ,'m' 60 | ,'a' 61 | ,'i' 62 | ,'l' 63 | ,'.' 64 | ,'c' 65 | ,'o' 66 | ,'m' 67 | ,'' 68 | ,'o' 69 | ,'o' 70 | ,'d' 71 | ,'i' 72 | ,'x' 73 | ,'a' 74 | ,'c' 75 | ,'h' 76 | ,'^J' 77 | ,'^J' 78 | ,'A' 79 | ,'A' 80 | ,'A' 81 | ,'A' 82 | ,'A' 83 | ,'g') 84 | 85 | script[[2]] <- substring(url, 1:nchar(url), 1:nchar(url)) 86 | script[[3]] <- c( 87 | '^J', 88 | 'A', 89 | 'D', 90 | 'Down Arrow', 91 | 'Down Arrow', 92 | 'Down Arrow', 93 | '^J', 94 | '', 95 | '', 96 | '', 97 | '', 98 | '', 99 | '', 100 | '', 101 | '', 102 | '', 103 | '', 104 | '', 105 | '', 106 | '' 107 | ) 108 | 109 | script[[4]] <- substring(path, 1:nchar(path), 1:nchar(path)) 110 | script[[5]] <- c( 111 | '^J', 112 | 'q', 113 | 'y' 114 | ) 115 | 116 | res <- vector() 117 | for(i in 1:length(script)){ 118 | res <- c(res, script[[i]]) 119 | } 120 | for(i in 1:length(res)){ 121 | res[i] <- paste('key', res[i]) 122 | } 123 | res <- paste(res,collapse="\n") 124 | return(res) 125 | } 126 | 127 | 128 | 129 | # Connect to db and get list of competitors to fetch data for 130 | con = dbConnect(MySQL(), user='erik', password='johansson', dbname='gt') 131 | 132 | query = 'select * from gt.keywords;' 133 | competitors <- dbGetQuery(con, query) 134 | competitors <- competitors$competitors 135 | 136 | # Create pairwise combinations of all competitors 137 | pairwise <- combn(competitors, 2) 138 | 139 | # Create download url for all combinations 140 | url=vector() 141 | for(i in 1:ncol(pairwise)){ 142 | url[i] <- URL_GT(pairwise[,i]) 143 | } 144 | 145 | # Download the files 146 | for(i in 1:length(url)){ 147 | lynx_commands <- lynx_script(url[i]) 148 | write.table(lynx_commands, '/root/gt_download', row.names=F, col.names=F, quote=F) 149 | system("lynx -cmd_script=/root/gt_download www.google.com") 150 | } 151 | 152 | # List path to files just downloaded 153 | file_path <- list.files('/root/downloads/') 154 | 155 | # Read files 156 | gt <- readGT(paste('/root/downloads/', file_path, sep='')) 157 | 158 | # Calculate the ratio between the different combinations 159 | paths <- unique(gt$Path) 160 | res <- list() 161 | for(i in 1:length(paths)){ 162 | r <- which(gt$Path==paths[i]) 163 | if(length(unique(gt$Keyword[r]))==1) next() 164 | res[[i]] <- gt[r,] %>% 165 | select(Date, Keyword, SVI) %>% 166 | dcast(Date~Keyword) %>% 167 | mutate(numerator_name=names(.)[3] 168 | , denominator_name=names(.)[2] 169 | , ratio=.[,3]/.[,2]) %>% 170 | setnames(names(.)[2:3], c('numerator', 'denominator')) 171 | } 172 | results <- data.frame(Reduce(rbind, res)) 173 | 174 | dbWriteTable(con, 'competitors', results, overwrite=T) 175 | 176 | max_date=max(results$Date) 177 | x <- results %>% filter(Date==max_date) %>% select(numerator_name, denominator_name, ratio) 178 | tmp <- x %>% select(denominator_name, numerator_name, ratio) %>% mutate(ratio=1/ratio) %>% setnames(names(.), names(x)) 179 | x <- rbind(x, tmp) 180 | 181 | # Ranking algorithm 182 | rank <- competitors 
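# How the ranking pass below works (explanatory comments only, nothing here is executed):
# the rank vector is walked five times in a bubble-sort style; each adjacent pair is
# compared via its pairwise SVI ratio in x, and the pair is swapped whenever the ratio is
# below one, i.e. the trailing competitor is searched more. If no direct ratio exists for
# a pair, the competitor that appears in more comparisons keeps its position.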
183 | for(j in 1:5){ 184 | for(i in 1:(length(rank)-1)){ 185 | participants <- rank[i:(i+1)] 186 | r <- which(x$numerator_name==participants[1] & x$denominator_name==participants[2]) 187 | if(length(r)==0){ 188 | player_ahead = which(x$numerator_name==participants[1]) 189 | player_behind = which(x$numerator_name==participants[2]) 190 | if(length(player_ahead) < length(player_behind)){ 191 | rank <- swap(rank, participants[1], participants[2]) 192 | next() 193 | } else {next()} 194 | } 195 | if(x$ratio[r]<1){ 196 | rank <- swap(rank, participants[1], participants[2]) 197 | print(paste("swapping", participants[1], participants[2])) 198 | print(rank) 199 | } 200 | } 201 | } 202 | 203 | # Create leaderboard 204 | leaderboard=as.data.frame(rank) 205 | leaderboard$pairwise_ratio <- NA 206 | 207 | for(i in 1:(nrow(leaderboard)-1)){ 208 | numerator = as.character(leaderboard$rank[i]) 209 | denominator = as.character(leaderboard$rank[i+1]) 210 | leaderboard$pairwise_ratio[i] <- x %>% filter(numerator_name==numerator, denominator_name==denominator) %>% select(ratio) %>% as.numeric 211 | } 212 | 213 | leaderboard$absolute_ratio <- NA 214 | start <- which(is.na(leaderboard$pairwise_ratio))[1] 215 | leaderboard$absolute_ratio[start]=1 216 | for(i in start:2){ #chain the pairwise ratios upward from the anchor row into absolute ratios 217 | leaderboard$absolute_ratio[i-1] <- leaderboard$absolute_ratio[i] * leaderboard$pairwise_ratio[i-1] 218 | } 219 | leaderboard$absolute_ratio[which(is.na(leaderboard$absolute_ratio))] <- 0 220 | dbWriteTable(con, 'leaderboard', leaderboard) 221 | -------------------------------------------------------------------------------- /Google Trends functions: -------------------------------------------------------------------------------- 1 | #This script automates the downloading of Google Trends data. 2 | #It works best with Firefox in combination with the Tab Mix Plus add-on that is used to automate tab closing. 3 | #Ask Firefox not to prompt for new downloads and this script should run automatically. 4 | #Google Trends restricts the number of downloads to roughly 400 at a time. 
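#Example usage once the functions below have been sourced (the keyword, country and
#download path here are illustrative assumptions, not values fixed by this file):
# url <- URL_GT("FTSE 100", country="GB", year=2012, month=1, length=3)
# filePath <- downloadGT(url, "C:/Users/erik.johansson/Downloads")
# gt <- readGT(paste("C:/Users/erik.johansson/Downloads", filePath, sep="/"))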
5 | 6 | URL_GT=function(keyword="", country=NA, region=NA, year=NA, month=1, length=3){ 7 | 8 | start="http://www.google.com/trends/trendsReport?hl=en-US&q=" 9 | end="&cmpt=q&content=1&export=1" 10 | geo="" 11 | date="" 12 | 13 | #Geographic restrictions 14 | if(!is.na(country)) { 15 | geo="&geo=" 16 | geo=paste(geo, country, sep="") 17 | if(!is.na(region)) geo=paste(geo, "-", region, sep="") 18 | } 19 | 20 | queries=keyword[1] 21 | if(length(keyword)>1) { 22 | for(i in 2:length(keyword)){ 23 | queries=paste(queries, "%2C ", keyword[i], sep="") 24 | } 25 | } 26 | 27 | #Dates 28 | if(!is.na(year)){ 29 | date="&date=" 30 | date=paste(date, month, "%2F", year, "%20", length, "m", sep="") 31 | } 32 | 33 | URL=paste(start, queries, geo, date, end, sep="") 34 | URL <- gsub(" ", "%20", URL) 35 | return(URL) 36 | } 37 | 38 | downloadGT=function(URL, downloadDir){ 39 | 40 | #Determine if download has been completed by comparing the number of files in the download directory to the starting number 41 | startingFiles=list.files(downloadDir) 42 | browseURL(URL) 43 | endingFiles=list.files(downloadDir) 44 | 45 | while(length(setdiff(endingFiles,startingFiles))==0) { 46 | Sys.sleep(3) 47 | endingFiles=list.files(downloadDir) 48 | } 49 | filePath=setdiff(endingFiles,startingFiles) 50 | return(filePath) 51 | } 52 | 53 | 54 | readGT=function(filePath){ 55 | rawFiles=list() 56 | 57 | for(i in 1:length(filePath)){ 58 | if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 59 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 60 | } 61 | 62 | output=data.frame() 63 | name=vector() 64 | 65 | for(i in 1:length(rawFiles)){ 66 | data=rawFiles[[i]] 67 | name=as.character(t(data[5,-1])) 68 | 69 | #Select the time series 70 | start=which(data[,1]=="")[1]+3 71 | stop=which(data[,1]=="")[2]-2 72 | 73 | #Skip to next if file is empty 74 | if(ncol(data)<2) next 75 | if(is.na(which(data[,1]=="")[2]-2)) next 76 | 77 | data=data[start:stop,] 78 | data[,1]=as.character(data[,1]) 79 | 80 | #Convert all columns except date column into numeric 81 | for(j in 2:ncol(data)) data[,j]=as.numeric(as.character(data[,j])) 82 | 83 | #FORMAT DATE 84 | len=nchar(data[1,1]) 85 | 86 | #Monthly data 87 | if(len==7) { 88 | data[,1]=as.Date(paste(data[,1], "-1", sep=""), "%Y-%m-%d") 89 | data[,1]=sapply(data[,1], seq, length=2, by="1 month")[2,]-1 90 | data[,1]=as.Date(data[,1], "%Y-%m-%d", origin="1970-01-01") 91 | } 92 | 93 | #Weekly data 94 | if(len==23){ 95 | data[,1]=sapply(data[,1], substr, start=14, stop=30) 96 | data[,1]=as.Date(data[,1], "%Y-%m-%d") 97 | } 98 | 99 | #Daily data 100 | if(len==10) data[,1]=as.Date(data[,1], "%Y-%m-%d") 101 | 102 | #Structure into panel data format 103 | panelData=data[1:2] 104 | panelData[3]=name[1] 105 | names(panelData)=c("Date", "SVI", "Keyword") 106 | if(ncol(data)>2) { 107 | 108 | for(j in 3:ncol(data)) { 109 | appendData=data[c(1,j)] 110 | appendData[3]=name[j-1] 111 | names(appendData)=c("Date", "SVI", "Keyword") 112 | panelData=rbind(panelData, appendData) 113 | } 114 | } 115 | 116 | #Add file name 117 | panelData[ncol(panelData)+1]=filePath[i] 118 | 119 | #Name the path column 120 | names(panelData)[4]="Path" 121 | 122 | #Merge several files into one 123 | if(i==1) output=panelData 124 | if(i>1) output=rbind(output, panelData) 125 | } 126 | return(output) 127 | } 128 | 129 | readGeoGT=function(filePath){ 130 | output=data.frame() 131 | rawFiles=list() 132 | for(i in 1:length(filePath)){ 133 | 
if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 134 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 135 | } 136 | 137 | for(i in 1:length(rawFiles)){ 138 | data=rawFiles[[i]] 139 | start=which(data[,1]=="")[3]+3 140 | stop=which(data[,1]=="")[4]-1 141 | names=data[start-1,] 142 | 143 | for(j in 1:ncol(names)) names(data)[j]=as.character(names[1,j]) 144 | data=data[start:stop,] 145 | data[,1]=as.character(data[,1]) 146 | data[,-1]=as.numeric(as.character(data[,-1])) 147 | data[ncol(data)+1]=filePath[i] 148 | 149 | output=rbind(output, data) 150 | } 151 | return(output) 152 | } 153 | 154 | 155 | readAdditionalGT = function(filePath){ 156 | 157 | output=list() 158 | rawFiles=list() 159 | for(i in 1:length(filePath)){ 160 | if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 161 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 162 | } 163 | 164 | 165 | for(file in rawFiles){ 166 | search_term = substring(as.character(file[1,1]), 22) 167 | start = grep('Top regions', file[,1]) + 2 168 | end = start + which(file[start:nrow(file),1]=="")[1] - 2 169 | tmp = file[start:end,] 170 | tmp$Type = 'Top regions' 171 | tmp$Keyword = search_term 172 | output[[length(output)+1]] = tmp 173 | 174 | start = grep('Top cities', file[,1]) + 2 175 | end = start + which(file[start:nrow(file),1]=="")[1] - 2 176 | tmp = file[start:end,] 177 | tmp$Type = 'Top cities' 178 | tmp$Keyword = search_term 179 | output[[length(output)+1]] = tmp 180 | 181 | start = grep('Top searches', file[,1]) + 2 182 | end = start + which(file[start:nrow(file),1]=="")[1] - 2 183 | tmp = file[start:end,] 184 | tmp$Type = 'Top searches' 185 | tmp$Keyword = search_term 186 | output[[length(output)+1]] = tmp 187 | 188 | start = grep('Rising searches', file[,1]) + 2 189 | end = start + which(file[start:nrow(file),1]=="")[1] - 2 190 | tmp = file[start:end,] 191 | tmp$Type = 'Rising searches' 192 | tmp$Keyword = search_term 193 | output[[length(output)+1]] = tmp 194 | } 195 | output = do.call(rbind, output) 196 | return(output) 197 | } 198 | -------------------------------------------------------------------------------- /Daily data example.R: -------------------------------------------------------------------------------- 1 | 2 | library(Rmisc) 3 | library(ggplot2) 4 | library(dplyr) 5 | 6 | # The Google Trends formatting functions ----------------------------------- 7 | 8 | #This script automates the downloading of Google Trends data. 9 | #It works best with Firefox in combination with the Tab Mix Plus add-on that is used to automate tab closing. 10 | #Ask Firefox not to prompt for new downloads and this script should run automatically. 11 | #Google Trends restricts the number of downloads to roughly 400 at a time. 
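# Approach used in this script: Google only serves daily SVI in short windows, each
# rescaled to 0-100 on its own, so we pull quarterly 3-month daily windows plus one
# weekly series spanning the whole period, then chain the windows onto the weekly
# scale via the weekly/daily ratio (the adjustment_factor computed further down).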
12 | 13 | URL_GT=function(keyword="", country=NA, region=NA, year=NA, month=1, length=3){ 14 | 15 | start="http://www.google.com/trends/trendsReport?hl=en-US&q=" 16 | end="&cmpt=q&content=1&export=1" 17 | geo="" 18 | date="" 19 | 20 | #Geographic restrictions 21 | if(!is.na(country)) { 22 | geo="&geo=" 23 | geo=paste(geo, country, sep="") 24 | if(!is.na(region)) geo=paste(geo, "-", region, sep="") 25 | } 26 | 27 | queries=keyword[1] 28 | if(length(keyword)>1) { 29 | for(i in 2:length(keyword)){ 30 | queries=paste(queries, "%2C ", keyword[i], sep="") 31 | } 32 | } 33 | 34 | #Dates 35 | if(!is.na(year)){ 36 | date="&date=" 37 | date=paste(date, month, "%2F", year, "%20", length, "m", sep="") 38 | } 39 | 40 | URL=paste(start, queries, geo, date, end, sep="") 41 | URL <- gsub(" ", "%20", URL) 42 | return(URL) 43 | } 44 | 45 | downloadGT=function(URL, downloadDir){ 46 | 47 | #Determine if download has been completed by comparing the number of files in the download directory to the starting number 48 | startingFiles=list.files(downloadDir) 49 | browseURL(URL) 50 | endingFiles=list.files(downloadDir) 51 | 52 | while(length(setdiff(endingFiles,startingFiles))==0) { 53 | Sys.sleep(3) 54 | endingFiles=list.files(downloadDir) 55 | } 56 | filePath=setdiff(endingFiles,startingFiles) 57 | return(filePath) 58 | } 59 | 60 | 61 | readGT=function(filePath){ 62 | rawFiles=list() 63 | 64 | for(i in 1:length(filePath)){ 65 | if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 66 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 67 | } 68 | 69 | output=data.frame() 70 | name=vector() 71 | 72 | for(i in 1:length(rawFiles)){ 73 | data=rawFiles[[i]] 74 | name=as.character(t(data[5,-1])) 75 | 76 | #Select the time series 77 | start=which(data[,1]=="")[1]+3 78 | stop=which(data[,1]=="")[2]-2 79 | 80 | #Skip to next if file is empty 81 | if(ncol(data)<2) next 82 | if(is.na(which(data[,1]=="")[2]-2)) next 83 | 84 | data=data[start:stop,] 85 | data[,1]=as.character(data[,1]) 86 | 87 | #Convert all columns except date column into numeric 88 | for(j in 2:ncol(data)) data[,j]=as.numeric(as.character(data[,j])) 89 | 90 | #FORMAT DATE 91 | len=nchar(data[1,1]) 92 | 93 | #Monthly data 94 | if(len==7) { 95 | data[,1]=as.Date(paste(data[,1], "-1", sep=""), "%Y-%m-%d") 96 | data[,1]=sapply(data[,1], seq, length=2, by="1 month")[2,]-1 97 | data[,1]=as.Date(data[,1], "%Y-%m-%d", origin="1970-01-01") 98 | } 99 | 100 | #Weekly data 101 | if(len==23){ 102 | data[,1]=sapply(data[,1], substr, start=14, stop=30) 103 | data[,1]=as.Date(data[,1], "%Y-%m-%d") 104 | } 105 | 106 | #Daily data 107 | if(len==10) data[,1]=as.Date(data[,1], "%Y-%m-%d") 108 | 109 | #Structure into panel data format 110 | panelData=data[1:2] 111 | panelData[3]=name[1] 112 | names(panelData)=c("Date", "SVI", "Keyword") 113 | if(ncol(data)>2) { 114 | 115 | for(j in 3:ncol(data)) { 116 | appendData=data[c(1,j)] 117 | appendData[3]=name[j-1] 118 | names(appendData)=c("Date", "SVI", "Keyword") 119 | panelData=rbind(panelData, appendData) 120 | } 121 | } 122 | 123 | #Add file name 124 | panelData[ncol(panelData)+1]=filePath[i] 125 | 126 | #Name the path column 127 | names(panelData)[4]="Path" 128 | 129 | #Merge several files into one 130 | if(i==1) output=panelData 131 | if(i>1) output=rbind(output, panelData) 132 | } 133 | return(output) 134 | } 135 | 136 | readGeoGT=function(filePath){ 137 | output=data.frame() 138 | rawFiles=list() 139 | for(i in 1:length(filePath)){ 140 | 
if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 141 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 142 | } 143 | 144 | for(i in 1:length(rawFiles)){ 145 | data=rawFiles[[i]] 146 | start=which(data[,1]=="")[3]+3 147 | stop=which(data[,1]=="")[4]-1 148 | names=data[start-1,] 149 | 150 | for(j in 1:ncol(names)) names(data)[j]=as.character(names[1,j]) 151 | data=data[start:stop,] 152 | data[,1]=as.character(data[,1]) 153 | data[,-1]=as.numeric(as.character(data[,-1])) 154 | data[ncol(data)+1]=filePath[i] 155 | 156 | output=rbind(output, data) 157 | } 158 | return(output) 159 | } 160 | 161 | 162 | # Downloading the data ---------------------------------------------------- 163 | 164 | downloadDir = "C:/Users/erik.johansson/Downloads" #set this to your browser's download directory; downloadGT below depends on it 165 | search_terms = c("bull market", "bear market", "recession") 166 | 167 | years = c(2005,2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016) 168 | months = c(1,4,7,10) 169 | res.daily=list() 170 | counter=1 171 | for(year in years){ 172 | for(month in months){ 173 | url=URL_GT(search_terms, year=year, month=month) 174 | GT_dir = downloadGT(url, downloadDir) 175 | GT_dir = paste(downloadDir, GT_dir, sep='/') 176 | res.daily[[counter]] = readGT(GT_dir) 177 | counter=counter+1 178 | } 179 | } 180 | 181 | df.daily <- do.call("rbind", res.daily) 182 | 183 | url = URL_GT(search_terms) 184 | GT_dir = downloadGT(url, downloadDir) 185 | GT_dir = paste(downloadDir, GT_dir, sep='/') 186 | df.weekly = readGT(GT_dir) 187 | 188 | 189 | # Formatting the data ------------------------------------------------------ 190 | 191 | 192 | df.merged = merge(df.daily, df.weekly, by=c('Date', 'Keyword'), all.x=T) 193 | df.merged$adjustment_factor = df.merged$SVI.y /df.merged$SVI.x 194 | 195 | for(i in search_terms){ 196 | r=which(df.merged$Keyword==i) 197 | for(j in 2:length(r)){ 198 | if(!is.finite(df.merged$adjustment_factor[r][j])){ 199 | df.merged$adjustment_factor[r][j] = df.merged$adjustment_factor[r][j-1] 200 | } 201 | } 202 | } 203 | df.merged$daily = df.merged$adjustment_factor * df.merged$SVI.x 204 | df.merged$weekly = df.merged$SVI.y 205 | for(i in search_terms){ 206 | r=which(df.merged$Keyword==i) 207 | for(j in 2:length(r)){ 208 | if(is.na(df.merged$weekly[r][j])){ 209 | df.merged$weekly[r][j] = df.merged$weekly[r][j-1] 210 | } 211 | } 212 | } 213 | 214 | 215 | # Plotting the data ------------------------------------------------------- 216 | 217 | df.merged$daily[which(is.infinite(df.merged$daily))] = NA 218 | 219 | p1 = df.merged %>% 220 | ggplot(aes(Date, daily, color=Keyword))+geom_line() 221 | 222 | p2 = df.merged %>% 223 | ggplot(aes(Date, weekly, color=Keyword))+geom_line() 224 | 225 | multiplot(p1,p2) 226 | 227 | 228 | # Saving the data --------------------------------------------------------- 229 | 230 | 231 | write.csv(df.merged,'df.merged.csv') 232 | -------------------------------------------------------------------------------- /Working example: -------------------------------------------------------------------------------- 1 | # This script automates the downloading of Google Trends data. 2 | # It works best with Firefox in combination with the Tab Mix Plus add-on that is used to automate tab closing. 3 | # Ask Firefox not to prompt for new downloads and this script should run automatically. 4 | # Google Trends restricts the number of downloads to roughly 400 at a time. 
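# Method in brief: one month of daily data is downloaded at a time (each month rescaled
# 0-100 on its own), plus the full-period weekly series; reindexGT then computes
# daily/weekly adjustment factors, forward-filling gaps, and the final series GT divides
# the daily data by those factors to put every month on the weekly scale.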
5 | # This is a fully working script for downloading daily Google Trends data for the keyword "FTSE 100" since 2004 6 | # The only thing you need to do is change downloadDir to the download directory of your default browser. 7 | # The final results will be located in the data frame "summary" 8 | # Note that this has only been tested on a Windows machine. It should work on a Mac as well. 9 | 10 | downloadDir="C:/Users/erik.johansson/Downloads" 11 | setwd(downloadDir) 12 | search_word="FTSE 100" 13 | 14 | 15 | URL_GT=function(keyword="", country=NA, region=NA, year=NA, month=1, length=3){ 16 | 17 | start="http://www.google.com/trends/trendsReport?hl=en-US&q=" 18 | end="&cmpt=q&content=1&export=1" 19 | geo="" 20 | date="" 21 | 22 | #Geographic restrictions 23 | if(!is.na(country)) { 24 | geo="&geo=" 25 | geo=paste(geo, country, sep="") 26 | if(!is.na(region)) geo=paste(geo, "-", region, sep="") 27 | } 28 | 29 | queries=keyword[1] 30 | if(length(keyword)>1) { 31 | for(i in 2:length(keyword)){ 32 | queries=paste(queries, "%2C ", keyword[i], sep="") 33 | } 34 | } 35 | 36 | #Dates 37 | if(!is.na(year)){ 38 | date="&date=" 39 | date=paste(date, month, "%2F", year, "%20", length, "m", sep="") 40 | } 41 | 42 | URL=gsub(" ", "%20", paste(start, queries, geo, date, end, sep="")) #encode spaces, as in the other URL_GT versions in this repo 43 | return(URL) 44 | } 45 | 46 | downloadGT=function(URL, downloadDir){ 47 | 48 | #Determine if download has been completed by comparing the number of files in the download directory to the starting number 49 | startingFiles=list.files(downloadDir) 50 | browseURL(URL) 51 | endingFiles=list.files(downloadDir) 52 | 53 | while(length(setdiff(endingFiles,startingFiles))==0) { 54 | Sys.sleep(3) 55 | endingFiles=list.files(downloadDir) 56 | } 57 | filePath=setdiff(endingFiles,startingFiles) 58 | return(filePath) 59 | } 60 | 61 | 62 | readGT=function(filePath){ 63 | rawFiles=list() 64 | 65 | for(i in 1:length(filePath)){ 66 | if(length(filePath)==1) rawFiles[[1]]=read.csv(filePath, header=F, blank.lines.skip=F) 67 | if(length(filePath)>1) rawFiles[[i]]=read.csv(filePath[i], header=F, blank.lines.skip=F) 68 | } 69 | 70 | output=data.frame() 71 | name=vector() 72 | 73 | for(i in 1:length(rawFiles)){ 74 | data=rawFiles[[i]] 75 | raw_name=as.character(data[1,1]) 76 | raw_name=substr(raw_name, 22, nchar(raw_name)) 77 | 78 | #Create a vector called name containing the search terms 79 | if(grepl(";", raw_name)) { 80 | separators=gregexpr(";", raw_name)[[1]] 81 | separators=c(separators, nchar(raw_name)+1) 82 | 83 | name[1]=substr(raw_name, 1, separators[1]-1) 84 | for(j in 2:length(separators)) { 85 | name[j]=substr(raw_name, separators[j-1]+2, separators[j]-1) 86 | } 87 | } else {name=raw_name} 88 | 89 | #Select the time series 90 | start=which(data[,1]=="")[1]+3 91 | stop=which(data[,1]=="")[2]-2 92 | 93 | #Skip to next if file is empty 94 | if(ncol(data)<2) next 95 | if(is.na(which(data[,1]=="")[2]-2)) next 96 | 97 | data=data[start:stop,] 98 | data[,1]=as.character(data[,1]) 99 | 100 | #Convert all columns except date column into numeric 101 | for(j in 2:ncol(data)) data[,j]=as.numeric(as.character(data[,j])) 102 | 103 | #FORMAT DATE 104 | len=nchar(data[1,1]) 105 | 106 | #Monthly data 107 | if(len==7) { 108 | data[,1]=as.Date(paste(data[,1], "-1", sep=""), "%Y-%m-%d") 109 | data[,1]=sapply(data[,1], seq, length=2, by="1 month")[2,]-1 110 | data[,1]=as.Date(data[,1], "%Y-%m-%d", origin="1970-01-01") 111 | } 112 | 113 | #Weekly data 114 | if(len==23){ 115 | data[,1]=sapply(data[,1], substr, start=14, stop=30) 116 | data[,1]=as.Date(data[,1], 
"%Y-%m-%d") 117 | } 118 | 119 | #Daily data 120 | if(len==10) data[,1]=as.Date(data[,1], "%Y-%m-%d") 121 | 122 | #Structure into panel data format 123 | panelData=data[1:2] 124 | panelData[3]=name[1] 125 | names(panelData)=c("Date", "SVI", "Keyword") 126 | if(ncol(data)>2) { 127 | 128 | for(j in 3:ncol(data)) { 129 | appendData=data[c(1,j)] 130 | appendData[3]=name[j-1] 131 | names(appendData)=c("Date", "SVI", "Keyword") 132 | panelData=rbind(panelData, appendData) 133 | } 134 | } 135 | 136 | #Add file name 137 | panelData[ncol(panelData)+1]=filePath[i] 138 | 139 | #Add path to filename 140 | names(panelData)[4]="Path" 141 | 142 | #Merge several several files into one 143 | if(i==1) output=panelData 144 | if(i>1) output=rbind(output, panelData) 145 | } 146 | return(output) 147 | } 148 | 149 | 150 | reindexGT=function(GT.daily, GT.weekly){ 151 | GT.daily=GT.daily[,order(colnames(GT.daily))] 152 | GT.weekly=GT.weekly[,order(colnames(GT.weekly))] 153 | 154 | w=match(names(GT.daily), names(GT.weekly)) 155 | w=w[!is.na(w)] 156 | d=match(names(GT.weekly), names(GT.daily)) 157 | d=d[!is.na(d)] 158 | 159 | GT.weekly=GT.weekly[,w] 160 | GT.daily=GT.daily[,d] 161 | 162 | merged=merge(GT.daily, GT.weekly, by="Date", all=T) 163 | GT.daily=merged[2:(1+((length(merged)-1))/2)] 164 | rownames(GT.daily)=merged$Date 165 | GT.weekly=merged[(2+(length(merged)-1)/2):length(merged)] 166 | rownames(GT.weekly)=merged$Date 167 | 168 | reindex=GT.daily/GT.weekly 169 | for(i in 2:length(reindex[,1])) { 170 | for(j in 1:length(reindex[1,])) { 171 | if(is.na(reindex[i,j])) { 172 | reindex[i,j]=reindex[i-1,j] 173 | } 174 | } 175 | } 176 | 177 | for(i in 2:length(GT.daily[,1])) { 178 | for(j in 1:length(GT.daily[1,])) { 179 | if(is.na(GT.daily[i,j])) { 180 | GT.daily[i,j]=GT.daily[i-1,j] 181 | } 182 | } 183 | } 184 | 185 | for(i in 2:length(GT.weekly[,1])) { 186 | for(j in 1:length(GT.weekly[1,])) { 187 | if(is.na(GT.weekly[i,j])) { 188 | GT.weekly[i,j]=GT.weekly[i-1,j] 189 | } 190 | } 191 | } 192 | 193 | output=list(reindex, GT.daily, GT.weekly) 194 | return(output) 195 | } 196 | 197 | 198 | # EXECUTION 199 | 200 | year=c(2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015) 201 | 202 | n=1 203 | trendsDir=vector() 204 | for(i in year){ 205 | for(j in 1:12){ 206 | URL=URL_GT(keyword=seach_word, year=i, month=j, length=1) 207 | trendsDir[n]=downloadGT(URL, downloadDir) 208 | n=n+1 209 | if(i==2015 && j==6) break() 210 | } 211 | } 212 | 213 | daily_GT=readGT(trendsDir) 214 | 215 | URL=URL_GT(keyword=seach_word) 216 | trendsDir_weekly=downloadGT(URL, downloadDir) 217 | weekly_GT=readGT(trendsDir_weekly) 218 | 219 | reindexing=reindexGT(daily_GT[1:2], weekly_GT[1:2]) 220 | 221 | # GT is the final results 222 | GT=reindexing[[2]]/reindexing[[1]] 223 | GT[which(!is.finite(GT[,1])),]=0 224 | 225 | # Create summary table for comparison 226 | summary=as.data.frame(matrix(NA, nrow(GT), 3)) 227 | names(summary)=c("Date", "Reindexed", "Original daily") 228 | summary[,1]=as.Date(rownames(GT)) 229 | summary[,2]=GT[,1] 230 | summary[,3]=reindexing[[2]][,1] 231 | summary=merge(summary, weekly_GT[1:2], all=T, by="Date") 232 | 233 | for(i in 2:nrow(summary)){ 234 | if(is.na(summary[i,4])) summary[i,4]=summary[i-1,4] 235 | } 236 | 237 | library(scales) 238 | 239 | plot(summary$Date, summary[,3], type="l") 240 | plot(summary$Date, summary[,4], type="l") 241 | plot(summary$Date, summary[,2], type="l") 242 | 243 | plot(summary$Date, summary[,2], type="l", col=alpha("green", 0.5), lwd=2) 244 | lines(summary$Date, 
summary[,3], type="l", col=alpha("blue", 0.8), lwd=1) 245 | lines(summary$Date, summary[,4], type="l", col=alpha("red", 0.9), lwd=2) 246 | --------------------------------------------------------------------------------
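A minimal, self-contained sketch of the reindexing idea used throughout this repo (toy numbers and made-up variable names, not real Google Trends output): each daily window arrives rescaled to 0-100 on its own, and a weekly series covering the whole period supplies the common scale.

# Two consecutive daily windows, each independently scaled to 0-100
daily <- data.frame(Date=as.Date("2015-01-01")+0:13,
                    SVI=c(40,50,100,80,60,70,90, 100,80,60,50,40,70,90))

# A weekly series spanning both windows on a single scale
weekly <- data.frame(Date=as.Date(c("2015-01-03", "2015-01-10")), SVI=c(25, 100))

# Match the weekly points onto the daily dates and compute the adjustment factor
merged <- merge(daily, weekly, by="Date", all.x=TRUE, suffixes=c(".daily", ".weekly"))
merged$factor <- merged$SVI.weekly / merged$SVI.daily

# Forward-fill the factor; days before the first weekly observation stay NA,
# mirroring how the scripts above treat the start of the sample
for(i in 2:nrow(merged)) if(is.na(merged$factor[i])) merged$factor[i] <- merged$factor[i-1]

# Daily SVI re-expressed on the weekly scale
merged$rescaled <- merged$SVI.daily * merged$factor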