├── learning
│   ├── CheckLibraryPath.R
│   ├── TwitterMapUse.R
│   ├── EnvExample.R
│   ├── Plot.R
│   ├── Kmeans.R
│   ├── GoogleVis-Motion.R
│   ├── fileUpload.R
│   ├── GoogleVis-Geo_Table.R
│   ├── RjsonIO.R
│   ├── geoCodeXML.R
│   ├── MasterBlaster.R
│   ├── ListLoadedPads.R
│   ├── PlayingWithEnv.R
│   ├── DQQE-Utility.R
│   ├── ggplot2.R
│   ├── ggplot.R
│   ├── loadPADS.R
│   ├── JSON.R
│   ├── Clinic.R
│   ├── WikiRead.R
│   ├── NYTWebService.R
│   ├── SP500MarketInflection.R
│   ├── VCorpus.R
│   ├── DataParser.R
│   ├── MiscR.R
│   ├── TransformDataSet.R
│   ├── geoCode.R
│   ├── TransformFailedBanks.R
│   ├── SQLQueryDataFrame.R
│   ├── DGQueryEngine.R
│   ├── DGQEWebSockets.R
│   ├── DKOMSentimentAnalysis.R
│   ├── TwiterSentimentAnalysis.R
│   ├── Olympics.R
│   ├── birbal.R
│   ├── TransformMasterBlasterData.R
│   └── TwitterMap.R
├── GenerateMaplightPADS.R
├── README.md
├── FetchData
│   ├── FetchFailedBanksData.R
│   ├── FetchBeerRatings.R
│   ├── FetchRussell2000.R
│   ├── FetchIPLData.R
│   ├── FetchImmigrationData.R
│   └── GenerateFlightStats.R
├── GeneratePADS.R
├── GenerateFredPADS.R
├── GenerateWDI.R
├── MongoFns.R
├── r_solr_integration.R
├── GenerateWDIPADS.R
├── PresedentialElectionsResults.R
├── SentimentAnalysis.R
├── GenerateCrimePADS.R
├── CreatePADS.R
├── ClassifyData.R
├── flights_map_reduce.R
└── Util.R
/learning/CheckLibraryPath.R:
--------------------------------------------------------------------------------
1 | #Check library path
2 | pkg <- installed.packages()
3 | pkg[which(pkg[, 1] == "base"), 2]
--------------------------------------------------------------------------------
/learning/TwitterMapUse.R:
--------------------------------------------------------------------------------
1 | source("http://biostat.jhsph.edu/~jleek/code/twitterMap.R")
2 | twitterMap("spoonan", userLocation="Palo Alto", plotType="both")
--------------------------------------------------------------------------------
/GenerateMaplightPADS.R:
--------------------------------------------------------------------------------
1 | #
2 | # Generate maplight
3 | #
4 |
5 | require(data.table)
6 |
7 | assign("folder.path", "./pads/raw-data/maplight/", envir=.GlobalEnv)
8 | #assign("data.file", "data.csv.gz", envir=.GlobalEnv)
9 | #ml.data.f <- data.table(read.csv(paste(folder.path, data.file, sep="")))
10 |
11 | assign("data.cache", "ml-data.Rdata", envir=.GlobalEnv)
12 | ml.data <- data.table(read.csv(paste(folder.path, data.cache, sep="")))
13 |
--------------------------------------------------------------------------------
/learning/EnvExample.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | e <- new.env()
8 | e$foo <- 42
9 | attach(e, name='datagram')
10 | rm(list=ls()) # Remove all in global env
11 | foo # Still there!
12 | #...and to detach it:
13 | #detach('datagram')
--------------------------------------------------------------------------------
/learning/Plot.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | #setContentType ("image/png")
8 | temp <- tempfile ()
9 | y = rnorm (100)
10 | png (temp, type="cairo")
11 | plot (1:100, y, t='l')
12 | dev.off ()
13 | sendBin (readBin (temp, 'raw', n=file.info(temp)$size))
14 | unlink (temp)
--------------------------------------------------------------------------------
/learning/Kmeans.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | newiris <- iris
8 | newiris$Species <- NULL
9 | kc <- kmeans(newiris, 3)
10 | table(iris$Species, kc$cluster)
11 | plot(newiris[c("Sepal.Length", "Sepal.Width")], col = kc$cluster)
12 | points(kc$centers[,c("Sepal.Length", "Sepal.Width")], col = 1:3, pch = 8, cex=2)
--------------------------------------------------------------------------------
/learning/GoogleVis-Motion.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library("googleVis")
8 |
9 | data(Fruits)
10 | #typeof(Fruits)
11 | M <- gvisMotionChart(Fruits, idvar="Fruit", timevar="Year")
12 | plot(M)
13 |
14 | #Chart Output
15 | #print(M, 'chart')
16 | #cat(M$html$header)
17 | #cat(M$html$chart)
18 | #cat(M$html$footer)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | R
2 | =
3 | In this repository, the datadolph.in team is sharing all of the R codebase it developed to analyze large quantities of data.
4 | The datadolph.in team has benefited tremendously from fellow R bloggers and other open-source communities and is proud to contribute
5 | all of its codebase to the community.
6 |
7 | The codebase includes ETL and interaction scripts for:
8 | - R-Solr Integration
9 | - R-Mongo Interaction
10 | - R-MySQL Interaction
11 | - Fetching, cleansing and transforming data
12 | - Classification (identify column types)
13 | - Default chart generation (based on simple heuristics and matching a dimension with a measure)
14 |
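15 | A minimal illustrative sketch of the column-type idea (the repository's real logic presumably lives in ClassifyData.R; the helper below is hypothetical and not part of it):
16 |
17 | ```R
18 | # Hypothetical helper: guess whether each column is a "measure" (numeric) or a "dimension"
19 | guessColumnTypes <- function(df) {
20 |   sapply(df, function(col) if (is.numeric(col)) "measure" else "dimension")
21 | }
22 | guessColumnTypes(iris)
23 | # Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species
24 | #    "measure"    "measure"    "measure"    "measure"  "dimension"
25 | ```
26 |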
--------------------------------------------------------------------------------
/learning/fileUpload.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | #Set content type for output
8 | setContentType("text/html")
9 | post <- str(POST)
10 | print(post)
11 | cat(post)
12 | cat("
")
13 |
14 | #Files
15 | FileStr <- str(FILES)
16 | cat(FileStr)
17 | cat("
")
18 |
19 | #Or you can use the paste function
20 | cat(paste(FILES,names(FILES)))
21 | cat("
")
22 |
23 | destination <- file.path('/Users/homemac/Toga/tmp',FILES$FirstFile$name)
24 | file.copy(FILES$FirstFile$tmp_name,destination,overwrite=TRUE)
--------------------------------------------------------------------------------
/learning/GoogleVis-Geo_Table.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library("googleVis")
8 | #Geo
9 | G <- gvisGeoChart(Exports, "Country", "Profit", options=list(width=200, height=100))
10 | #Table
11 | T <- gvisTable(Exports, options=list(width=200, height=270))
12 | #Motion
13 | M <- gvisMotionChart(Fruits, "Fruit", "Year", options=list(width=400, height=370))
14 | #Merge Charts
15 | GT <- gvisMerge(G,T, horizontal=FALSE)
16 | #Merge Horizontal
17 | GTM <- gvisMerge(GT, M, horizontal=TRUE, tableOptions="bgcolor=\"#CCCCCC\" cellspacing=10")
18 | plot(GTM)
--------------------------------------------------------------------------------
/learning/RjsonIO.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | setwd("/Users/homemac/Toga/Alto")
8 | library("RJSONIO")
9 | l <- fromJSON('[{"winner":"68694999", "votes":[ {"ts":"Thu Mar 25 03:13:01 UTC 2010", "user":{"name":"Lamur","user_id":"68694999"}}, {"ts":"Thu Mar 25 03:13:08 UTC 2010", "user":{"name":"Lamur","user_id":"68694999"}}], "lastVote":{"timestamp":1269486788526,"user":{"name":"Lamur","user_id":"68694999"}},"startPrice":0}]')
10 | m <- lapply(l[[1]]$votes, function(x) c(x$user$name, x$user$user_id, x$ts))
11 | ma <- matrix(unlist(m), ncol=3, byrow=TRUE)
12 | p <- data.frame(ma)
--------------------------------------------------------------------------------
/learning/geoCodeXML.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | getDocNodeVal <- function(doc, path)
8 | {
9 | sapply(getNodeSet(doc, path), function(el) xmlValue(el))
10 | }
11 |
12 |
13 | getGeoCode <- function(str)
14 | {
15 | library(XML)
16 | str <- gsub(' ','%20',str)  # URL-encode spaces in the address
17 | url <- paste('http://maps.google.com/maps/api/geocode/xml?sensor=false&address=', str, sep="")
18 | response <- xmlTreeParse(url, useInternal=TRUE)
19 |
20 | lat <- getDocNodeVal(response, "/GeocodeResponse/result/geometry/location/lat")
21 | lng <- getDocNodeVal(response, "/GeocodeResponse/result/geometry/location/lng")
22 | return (c(lat,lng))
23 | }
24 |
25 | t <- getGeoCode("Bangalore, India")
--------------------------------------------------------------------------------
/learning/MasterBlaster.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library("RJSONIO")
8 | rm (list = ls())
9 |
10 | # Read SachinTestRecords.csv
11 |
12 | mb <- read.csv("datasets/SachinTestRecords.csv", as.is=TRUE, header=TRUE, stringsAsFactors=FALSE, strip.white=TRUE)
13 | #mb <- read.csv("datasets/SachinTestRecords.csv", header=TRUE,strip.white=TRUE)
14 | nRows <- nrow(mb)
15 | nCols <- ncol(mb)
16 | cNames <- colnames(mb)
17 | cClass <- sapply(mb, class)
18 | iCols <- list(list(name=cNames[1], class=class(mb[,1]), sample=head(mb[,1])))  # flat list of per-column descriptors
19 | for(i in 2:5) {
20 | iCols[[i]] <- list(name=cNames[i], class=class(mb[,i]), sample=head(mb[,i]))
21 | }
22 | metaData <- list(nRows=nRows, nCols=nCols, iCols)
23 | jsonString <- toJSON(metaData)
--------------------------------------------------------------------------------
/learning/ListLoadedPads.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | setContentType("text/html")
8 | #Step 2 - load environment
9 | pads.loaded <- ls(pattern='pad[0-9]', envir=.GlobalEnv)
10 | #pads.loaded <- ls()
11 | #cat(is.environment(.rAenv))
12 | cat("This is a test script. It checks for pre-loaded pads.
Following pads are pre-loaded in this environment:
");
13 | cat(paste("No. of Pads: ", length(pads.loaded), "
", sep=""))
14 | getDetails <- function(x)
15 | {
16 | d <- get(x)
17 | return(paste("PadName:", x, ", Records:", nrow(d), ", Columns:", length(d), ", ColNames:", paste(colnames(d), collapse="; "), ", Col Classes:", paste(lapply(d, class), collapse="||"), sep=""))
18 | }
19 | v <- vapply(pads.loaded, getDetails, "Pad Was Not Found.")
20 | cat(paste(v, collapse="
"))
--------------------------------------------------------------------------------
/learning/PlayingWithEnv.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 | rm (list = ls("PADSEnv"))
7 | padsEnv <- new.env()
8 | padsFile <- "/Users/homemac/pads.json"
9 |
10 | getPADS <- function()
11 | {
12 | library("RJSONIO")
13 | con <- file(padsFile)
14 | data.json <- fromJSON(paste(readLines(con), collapse=""))
15 | close(con)
16 | #Convert the list to a matrix and then to a data frame
17 | m <- matrix(unlist(data.json), ncol=2, byrow=TRUE)
18 | pads <- data.frame(m, stringsAsFactors=FALSE)
19 | colnames(pads) <- c("id", "location")
20 | return (pads)
21 | }
22 |
23 | padsEnv$pads <- getPADS()
24 | #Load all the PADS
25 | sysID <- NULL
26 | for(i in 1:nrow(padsEnv$pads)) {
27 | sysID[i] <- load(padsEnv$pads$location[i], padsEnv)
28 | }
29 | padsEnv$pads$sysID <- sysID
30 | attach(padsEnv, name='PADSEnv')
31 | # All Done .rAenv
32 | rm(list=ls())
33 |
--------------------------------------------------------------------------------
/learning/DQQE-Utility.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | getResult <- function(dt, expr, gby) {
8 | return(dt[,eval(expr), by=eval(gby)])
9 | }
10 | dt <- data.table(iris)
11 | v1 <- "Sepal.Length"
12 | v2 <- "Species"
13 | e <- parse(text = paste("sum(", v1, ", na.rm = TRUE)"))
14 | b <- parse(text = v2)
15 | #rDT2 <- dt[, eval(e), by = eval(b)]
16 | dtR <- getResult(dt = dt, expr = e, gby = b)
17 |
18 |
19 | i <- which(colnames(padDT)==m)
20 | getData(padDT, sum(i, na.rm=TRUE), d)
21 | #getData(padDT, parse(text=paste("sum(", m, ",na.rm=TRUE)", sep="")), d) lapply(padDT, class)
22 |
23 |
24 | getResult <- function(dt, expr, gby) {
25 | e <- substitute(expr)
26 | b <- substitute(gby)
27 | return(dt[,eval(e),by=b])
28 | }
29 | v1 <- "Sepal.Length"
30 | v2 <- "Species"
31 |
32 |
33 | getResult(dt, sum(get(v1), na.rm=TRUE), v2)
34 | #e <- substitute(expr)
35 | #b <- substitute(gby)
36 |
--------------------------------------------------------------------------------
/FetchData/FetchFailedBanksData.R:
--------------------------------------------------------------------------------
1 | library(XML)
2 | library("chron")
3 | library(plyr)
4 | library(stringr)
5 |
6 | source("UtilPADS.R")
7 |
8 | run <- function() {
9 | #col names
10 | col.names <- c("bank_name","city","st","closing_date","acquiring_institution","assets_in_mil_usd")
11 | #wiki url
12 | url <- "http://en.wikipedia.org/wiki/List_of_bank_failures_in_the_United_States_(2008%E2%80%93present)"
13 | # read wiki page
14 | page <- htmlParse(url, encoding="UTF-8")
15 | # read all tables
16 | all.tables <- try(readHTMLTable(page, colClasses=c('numeric', 'character', 'character', 'character', 'mixed.dates',
17 | 'character', 'num.with.commas'), stringsAsFactors = FALSE,
18 | trim=TRUE, elFun = removeSpecialChars))
19 |
20 | #bind all the tables together
21 | s.d <- do.call(rbind, all.tables)
22 | # fix row names
23 | rownames(s.d) <- 1:nrow(s.d)
24 | # remove first column
25 | s.d <- s.d[-c(1)]
26 | # assign column names
27 | colnames(s.d) <- col.names
28 | saveData(s.d, "./pads/raw-data/failed-banks/failed_banks_assets_fdic.csv")
29 | }
30 |
31 | run()
--------------------------------------------------------------------------------
/learning/ggplot2.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | pres <- read.csv("http://www.stanford.edu/~messing/primaryres.csv", as.is=T)
8 |
9 | # sort data in order of percent of vote:
10 | pres <- pres[order(pres$Percentage, decreasing=T), ]
11 |
12 | # only show top 15 candidates:
13 | pres <- pres[1:15,]
14 |
15 | # create a percentage variable
16 | pres$Percentage <- pres$Percentage*100
17 |
18 | # reorder the Candidate factor by percentage for plotting purposes:
19 | pres$Candidate <- reorder(pres$Candidate, pres$Percentage)
20 |
21 | # To install ggplot2, run the following line after deleting the #
22 | #install.packages("ggplot2")
23 |
24 | library(ggplot2)
25 | ggplot(pres, aes(x = Percentage, y = factor(Candidate) )) +
26 | geom_point() +
27 | theme_bw() + opts(axis.title.x = theme_text(size = 12, vjust = .25))+
28 | xlab("Percent of Vote") + ylab("Candidate") +
29 | opts(title = expression("New Hampshire Primary 2012"))
--------------------------------------------------------------------------------
/learning/ggplot.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library (ggplot2); library (reshape2); library (plyr)  # reshape2 for melt(), plyr for ddply()
8 |
9 | v1 <- c(1,2,3,3,4)
10 | v2 <- c(4,3,1,1,9)
11 | v3 <- c(3,5,7,2,9)
12 | gender <- c("m","f","m","f","f")
13 |
14 | d.data <- data.frame (v1, v2, v3, gender)
15 |
16 | molten <- melt(d.data)
17 |
18 | Average <- ddply(molten, c("gender", "variable"), function(z){
19 | c(Mean = mean(z$value))}
20 | )
21 |
22 | ggplot (data=Average, aes(x = variable, y = Mean, fill = gender)) +
23 | geom_bar(stat = "identity", position = position_dodge()) +
24 | coord_flip() +
25 | geom_text (position = position_dodge(0.5),
26 | aes(label=round(Mean, 1)), vjust=0.5, hjust=4,colour="white", size=7)
27 |
28 | ggplot (data=Average, aes(x = variable, y = Mean)) +
29 | geom_bar(stat = "identity") +
30 | geom_text(aes(label=round(Mean, 1)), vjust=0.5,
31 | hjust=4,colour="white", size=7) +
32 | facet_wrap(~gender)
--------------------------------------------------------------------------------
/GeneratePADS.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | # Generate PADS - new way - read a csv file that has the list of pads
8 | #
9 | source("CreatePADS.R")
10 |
11 | #
12 | # generate Pads
13 | #
14 | generatePads <- function() {
15 | #initialize system
16 | initializeSystem()
17 | assign("folder.path", "./pads/raw-data/internet_stats/", envir=.GlobalEnv)
18 | assign("pads.file", "pads_meta.csv", envir=.GlobalEnv)
19 | assign("dataset", "Internet-Stats", envir=.GlobalEnv)
20 | pads <- readFile(paste(folder.path, pads.file, sep=""))
21 | for(i in 1:nrow(pads)){
22 | #prepare pad meta data
23 | series <- list()
24 | series["source"] <- pads$source[i]
25 | series["category"] <- pads$category[i]
26 | series["subcategory"] <- pads$subcategory[i]
27 | series["tags"] <- pads$tags[i]
28 | series["desc"] <- pads$desc[i]
29 | series["title"] <- pads$title[i]
30 | series["pagetag"] <- "internet"
31 | series.data <- trimData(readFile(paste(folder.path, pads$file[i], sep="")))
32 | series.data[2:ncol(series.data)] <- sapply(series.data[2:ncol(series.data)], removeMetaFromNumeric)
33 | padify(series, series.data)
34 | }
35 | #clean up
36 | cleaupSystem()
37 | updateCatPadCount()
38 | }
39 |
40 | generatePads()
41 |
--------------------------------------------------------------------------------
/learning/loadPADS.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | #setContentType("text/html")
8 | setwd("/Users/homemac/Toga")
9 | #read pre-analyzed data-sets file
10 | #padsFile <- "pads/pads.json"
11 | #getPADS <- function()
12 | #{
13 | # library("RJSONIO")
14 | # con <- file(padsFile)
15 | # data.json <- fromJSON(paste(readLines(con), collapse=""))
16 | # close(con)
17 | #Convert the list to a matrix and then to a data frame
18 | # m <- matrix(unlist(data.json), ncol=10, byrow=TRUE)
19 | # pads <- data.frame(m, stringsAsFactors=FALSE)
20 | #colnames(pads) <- c("id", "location")
21 | # return (pads)
22 | #}
23 | #SYS <- new.env(hash = TRUE, size = NA)
24 | #Read pads configuration
25 | #pads <- getPADS()
26 | pads <- data.frame(read.csv("Alto/pads/pads.csv", header=TRUE, sep=",",stringsAsFactors=FALSE ))
27 | for(i in 1:nrow(pads)) {
28 | cat(pads$cache[i])
29 | cat("
")
30 | #Load all the PADS
31 | #cat(pads$cache[i])
32 | #cat("
")
33 | cat(try(load(pads$cache[i], envir=.GlobalEnv), FALSE))
34 | #try(load(pads$cache[i], envir=SYS), FALSE)
35 | #unlink(pads$cache[i])
36 | #cat("
")
37 |
38 | }
39 | cat(ls(pattern="pad[0-9]", envir=.GlobalEnv))
40 | # All Done - All these variables get loaded in .GlobalEnv, run "search" to see.
41 |
42 |
--------------------------------------------------------------------------------
/learning/JSON.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library("rjson")
8 | x <- list( alpha = 1:5, beta = "Bravo", gamma = list(a=1:3, b=NULL), delta = c(TRUE, FALSE) )
9 | json <- toJSON( x )
10 | fromJSON( json )
11 | #named vectors are treated as JSON objects (lists)
12 | toJSON(islands[1:4])
13 | #data.frames must be converted into a list before converting into JSON
14 | plot(cars, pch=2)
15 | json_cars <- toJSON(as.list(cars))
16 | points( data.frame( fromJSON( json_cars ) ), col="red", pch=3 )
17 | #special R types are encoded as strings
18 | testString <- c(1,2,3,4,NA,NaN,Inf,8,9);
19 | toJSON(testString);
20 |
21 | sample_json <- ' { "breakfast" : [ "milk", "fruit loops", "juice" ], "lunch" : [ "left over sushi" ] } '
22 | parser <- newJSONParser()
23 | parser$addData( sample_json )
24 | food <- parser$getObject()
25 | print( food )
26 |
27 | #This is equivalent to using fromJSON( sample_json )
28 | #However, sample_json can be split into several parts:
29 | ### EXAMPLE 2:
30 | part_1 <- '{ "breakfast" : [ "milk", "fruit loops", "juice" ], '
31 | part_2 <- '"lunch" : [ "left over sushi" ]'
32 | part_3 <- '} [1,2,3,4,5]' #close off the first object, and create a 2nd JSON object, which is an array
33 | parser <- newJSONParser()
34 | parser$addData( part_1 )
35 | parser$getObject() #returns NULL - since part_1 isn't complete
36 | parser$addData( part_2 )
37 | parser$getObject() #returns NULL - since part_2 still isn't complete
38 | parser$addData( part_3 )
39 | parser$getObject() #returns the first food object
40 | parser$getObject() #returns the second array
--------------------------------------------------------------------------------
/learning/Clinic.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | ## generate data for medical example
8 | clinical.trial <-
9 | data.frame(patient = 1:100,
10 | age = rnorm(100, mean = 60, sd = 6),
11 | year.enroll = sample(paste("19", 85:99, sep = ""),
12 | 100, replace = TRUE),
13 | treatment = gl(2, 50,
14 | labels = c("Treatment", "Control")),
15 | center = sample(paste("Center", LETTERS[1:5]), 100, replace = TRUE))
16 | ## set some ages to NA (missing)
17 | is.na(clinical.trial$age) <- sample(1:100, 20)
18 | summary(clinical.trial)
19 |
20 | ## a simple example of a table call
21 | table(clinical.trial$center)
22 |
23 | ## a logical vector is created and passed into table
24 | table(clinical.trial$age < 60)
25 | #FALSE TRUE
26 | # 41 39
27 | ## the useNA argument shows the missing values, too
28 | table(clinical.trial$age < 60, useNA = "always")
29 | #FALSE  TRUE  <NA>
30 | #   41    39    20
31 |
32 | ## the table of missing age by center
33 | table(clinical.trial$center, is.na(clinical.trial$age))
34 | # FALSE TRUE
35 | # Center A 16 6
36 | # Center B 8 2
37 | # Center C 23 5
38 | # Center D 20 3
39 | # Center E 13 4
40 | ## centers with most missing ages listed in order
41 | ## highest to lowest
42 | sort(table(clinical.trial$center, is.na(clinical.trial$age))[, 2],
43 | decreasing = TRUE)
44 | #Center A Center C Center E Center D Center B
45 | # 6 5 4 3 2
46 |
47 |
48 | c1 <- cut(clinical.trial$age, breaks = seq(30, 80, by = 10))
49 | table(c1)
--------------------------------------------------------------------------------
/learning/WikiRead.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(XML)
8 | library(RColorBrewer)
9 | library(plyr)
10 | library(quantmod)
11 | library(Heatplus)
12 |
13 | # get the list of symbols
14 | l <- readHTMLTable('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[[2]]
15 | l <- as.vector(l$Ticker)
16 | l <- l[c(-59, -71, -80, -124, -141, -147, -275, -283, -292, -299, -309, -316, -360, -378, -381, -406, -439, -470, -471)]
17 |
18 | getMonthlyReturns <- function(sym) {
19 | y <- to.monthly(getSymbols(sym, auto.assign=FALSE, from='2007-01-01'))
20 | as.vector(ClCl(y)*100)
21 | }
22 |
23 | d <- unlist(llply(l, getMonthlyReturns, .progress="text"))
24 | # bounds at -10% and +10% for visual clarity
25 | d[d < -10] <- -10
26 | d[d > 10] <- 10
27 |
28 | heatmap_2(t(matrix(d, ncol=481)), col=brewer.pal(9, 'PuBu'), Rowv=NA, Colv=NA, do.dendro=c(FALSE,FALSE), scale='none', legend=2, main="S&P 500 since 2007 (monthly returns)")
29 |
30 |
31 |
32 | # http://blog.datapunks.com/2011/10/sp-500-components-heatmap-in-r/
33 |
34 | getWikiFC <- function(d) {
35 | l <- try(readHTMLTable("http://en.wikipedia.org/wiki/Forbes%27_list_of_the_most_valuable_football_clubs",
36 | header=FALSE, skip.rows=1, which=d$tableNo, stringsAsFactors = FALSE))
37 | l$year = d$year
38 | ifelse(d$year > 2010, return(with(l, data.frame(year, club=V2, country=V3, value=V4, revenue=V6))),
39 | return(with(l, data.frame(year, club=V2, country=V3, value=V4, revenue=V7))))
40 | }
41 | d <- data.frame(year=2012:2007, tableNo=2:7)
42 | mostValuedSoccerClubs <- ddply(d, .(year), getWikiFC)
43 |
44 |
45 |
46 |
47 | l <- readHTMLTable("http://en.wikipedia.org/wiki/Forbes%27_list_of_the_most_valuable_football_clubs")[[7]]
--------------------------------------------------------------------------------
/FetchData/FetchBeerRatings.R:
--------------------------------------------------------------------------------
1 | #
2 | # Get beer data
3 | #
4 | # Author: Jitender Aswani, Co-Founder @datadolph.in
5 | # Date: 3/15/2013
6 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
7 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
8 | # All rights reserved.
9 | require(data.table)
10 | startup <- function() {
11 | #initialize system
12 | initializeSystem(0)
13 |
14 | assign("folder.path", "./pads/raw-data/crunchbase/", envir=.GlobalEnv)
15 | assign("comps.file", "cb_companies_june_2013.csv", envir=.GlobalEnv)
16 | assign("comps.rounds.file", "cb_companies_rounds.csv", envir=.GlobalEnv)
17 | assign("investors.file", "cb_investors.csv", envir=.GlobalEnv)
18 |
19 | assign("dataset", "crunchbase", envir=.GlobalEnv)
20 | #assign("verbose", TRUE, envir=.GlobalEnv)
21 |
22 | loadCompaniesStats()
23 | loadCompanyRounds()
24 | loadInvestosStats()
25 |
26 | #prepare pad meta data
27 | series <- list()
28 | series["source"] <- "CrunchBase"
29 | series["category"] <- "Financial Sector"
30 | series["subcategory"] <- "Investment"
31 | series["tags"] <- tolower(paste(series$source, "VC, venture capital, startups, US startups, investments, angel, series-a, series-b, series-c, funding, seed, biotech, ecommerce, enterprise, software, mobile, web", sep=","))
32 | series["pagetag"] <- "crunchbase"
33 | series["desc"] <- "Built using data from CrunchBase extracted on June 6, 2013."
34 | assign("series", series, envir=.GlobalEnv)
35 | }
36 |
37 | assign("folder.path", "./pads/raw-data/beer-data/", envir=.GlobalEnv)
38 | assign("data.file", "beer_reviews.csv.gz", envir=.GlobalEnv)
39 |
40 | beer.data <- data.table(read.csv(paste(folder.path, data.file, sep="")))
41 | setkey(beer.data, beer_beerid)
42 | #get beers with 500 or more reviews
43 | b.d.500 <- beer.data[, list(no_of_reviews=nrow(.SD)), by=beer_beerid][no_of_reviews >= 500]
44 | setkey(b.d.500, beer_beerid)
45 | #merge to get all records of beers which have 500 or more reviews
46 | b.d.500 <- merge(b.d.500, beer.data, all=F)
--------------------------------------------------------------------------------
/FetchData/FetchRussell2000.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(XML)
8 | library(plyr)
9 | library(quantmod)
10 | library(data.table)
11 | library("zoo")
12 |
13 | folder.path = "./raw-data/fin-markets/"
14 | russell <- new.env()
15 | # get the list of symbols from the csv file
16 | ru2000 <- read.csv(paste(folder.path, "russell-2000-components.csv", sep=""), stringsAsFactors=F)
17 | colnames(ru2000) <- c("name", "ticker")
18 | #ru2000$name <- gsub(ru2000$ticker, "", ru2000$name)
19 |
20 | getDailyReturns <- function(sym) {
21 | y <- tryCatch(getSymbols(sym, auto.assign=FALSE, from="2007-01-01"), error = identity)
22 |
23 | if(inherits(y, "error"))
24 | cat("Symbol '", sym, "' not downloadable!\n", sep = "")
25 | else {
26 | y <- tryCatch(cbind(y, OpCl(y)*100), error = identity)
27 | if(inherits(y, "error"))
28 | cat("Symbol '", sym, "' not downloadable!\n", sep = "")
29 | else {
30 | write.csv(data.frame( date=index(y), coredata(y) ),row.names=FALSE, file=paste(sym, ".csv", sep=""))
31 | cat("Sucessfully saved the stock data to %s",sym)
32 | }
33 | }
34 | }
35 |
36 | ru2000.DR <- llply(ru2000$ticker[1109:nrow(ru2000)], getDailyReturns, .progress="text")
37 |
38 |
39 | #transpose d to have tickers as columns and date as rows
40 | ru2000.DR.T <- t(ru2000.DR)
41 | colnames(ru2000.DR.T) <- head(ru2000$ticker)
42 | AACC <- data.frame(seq(as.Date("2007-01-01"), Sys.Date(), by="day"), ru2000.DR.T$AACC)
43 | #SP100.MR <- data.table(SP100.MR)
44 | plot(SP100.MR$AAP, ylab="Apple", main="Monthly Stock Returns")
45 | read.zoo(SP100.MR)
46 | #d <- unlist(llply(l, getMonthlyReturns, .progress="text"))
47 | # bounds at -10% and +10% for visual clarity
48 | d[d < -10] <- -10
49 | d[d > 10] <- 10
50 |
51 | heatmap_2(t(matrix(d, ncol=100)), col=brewer.pal(9, 'PuBu'),
52 | Rowv=NA, Colv=NA, do.dendro=c(FALSE,FALSE), scale='none', legend=2,
53 | main="S&P 100 since 2007 (monthly returns)")
--------------------------------------------------------------------------------
/learning/NYTWebService.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | setwd("/Users/homemac/Toga/Alto")
8 | library(RJSONIO)
9 |
10 | ### set parameters ###
11 | api <- "YOUR_KEY" # API key goes here!!
12 | q <- "health+care+reform" # Query string, use + instead of space
13 | records <- 500 # total number of records to return, note limitations above
14 | # calculate parameter for offset
15 | os <- 0:(records/10-1)
16 |
17 | # read first set of data in
18 | uri <- paste ("http://api.nytimes.com/svc/search/v1/article?format=json&query=", q, "&offset=", os[1], "&fields=date&api-key=", api, sep="")
19 | raw.data <- readLines(uri, warn="F") # get them
20 | res <- fromJSON(raw.data) # tokenize
21 |
22 | dat <- unlist(res$results) # convert the dates to a vector
23 | # read in the rest via loop
24 | for (i in 2:length(os)) {
25 | # concatenate URL for each offset
26 | uri <- paste ("http://api.nytimes.com/svc/search/v1/article?format=json&query=", q, "&offset=", os[i], "&fields=date&api-key=", api, sep="")
27 | raw.data <- readLines(uri, warn="F")
28 | res <- fromJSON(raw.data)
29 | dat <- append(dat, unlist(res$results)) # append
30 | }
31 |
32 | # aggregate counts for dates and coerce into a data frame
33 | cts <- as.data.frame(table(dat))
34 | # establish date range
35 | dat.conv <- strptime(dat, format="%Y%m%d") # need to convert dat into POSIX format for this
36 | daterange <- c(min(dat.conv), max(dat.conv))
37 | dat.all <- seq(daterange[1], daterange[2], by="day")
38 | # all possible days
39 | # compare dates from counts dataframe with the whole data range
40 | # assign 0 where there is no count, otherwise take count
41 | # (take out PSD at the end to make it comparable)
42 | dat.all <- strptime(dat.all, format="%Y-%m-%d")
43 |
44 | # can't seem to compare POSIX objects with %in%, so coerce them to character for this:
45 | freqs <- ifelse(as.character(dat.all) %in% as.character(strptime(cts$dat, format="%Y%m%d")), cts$Freq, 0)
46 |
47 | plot (freqs, type="l", xaxt="n", main=paste("Search term(s):",q), ylab="# of articles", xlab="date")
48 | axis(1, 1:length(freqs), dat.all)
49 | lines(lowess(freqs, f=.2), col = 2)
--------------------------------------------------------------------------------
/learning/SP500MarketInflection.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(XML)
8 | library(plyr)
9 | library(quantmod)
10 | #library(data.table)
11 | #library("zoo")
12 |
13 | # get the list of symbols from Wiki
14 | sp500 <- readHTMLTable('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
15 | which=2, header=FALSE, skip.rows=1)
16 | sp500.tickers <- as.vector(sp500$V1)
17 | #Remove LOW
18 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="LOW")]
19 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="PSX")]
20 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="XYL")]
21 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="TRIP")]
22 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="WPX")]
23 |
24 | sp500.tickers[which(sp500.tickers=="BFB")]="BF-B"
25 |
26 | isFiftyTwoWeekLow <- function(sym) {
27 | #to <- "2012/6/01"
28 | #to <- "2012/4/02"
29 | to <- "2011/10/04"
30 | from <- last(seq(as.Date(to), by="-1 week", length.out=52))
31 | #last(seq(Sys.Date(), by="-1 week", length.out=52))
32 | print(sym)
33 | y <- getSymbols(sym, auto.assign=FALSE, from=from, to=to)
34 | low <- ifelse(Cl(last(y))[[1]] < Cl(seriesLo(last(y, '52 weeks')))[[1]], 1, 0)
35 | return (low)
36 | }
37 | #Percent of stocks trading at 52 weeks low
38 | sp500.low <- ldply(sp500.tickers, isFiftyTwoWeekLow, .progress="text")
39 | per.stocks.trading.low.Jun1 <- (sum(sp500.low$V1)/length(sp500.low$V1))*100
40 |
41 | #Get SP500 Index data
42 | getSymbols("^gspc")
43 | Cl(last(GSPC))[[1]] #Get Last Close
44 | Cl(seriesLo(last(GSPC, '52 weeks')))[[1]] #Get Low from last 52weeks
45 |
46 | #April 2nd - SP500 closed at 1419, multi-year high - What Percent of stocks trading at 52 weeks low
47 | sp500.low.Apr2 <- ldply(sp500.tickers, isFiftyTwoWeekLow, .progress="text")
48 | per.stocks.trading.low.Apr2 <- (sum(sp500.low.Apr2$V1)/length(sp500.low.Apr2$V1))*100
49 |
50 |
51 | #October 4th - SP500 closed at 1123.95 - What Percent of stocks were trading at 52 weeks low
52 | sp500.low.Oct4 <- ldply(sp500.tickers, isFiftyTwoWeekLow, .progress="text")
53 | per.stocks.trading.low.Oct4 <- (sum(sp500.low.Oct4$V1)/length(sp500.low.Oct4$V1))*100
54 |
55 |
--------------------------------------------------------------------------------
/learning/VCorpus.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(tm)
8 | library(twitteR)
9 |
10 | tweets <- searchTwitter("#DKOM", n=1500 )
11 | df <- do.call("rbind", lapply(tweets, as.data.frame))
12 |
13 | #tweets1 <- searchTwitter("#Netflix", n=1500 )
14 | #df1 <- do.call("rbind", lapply(tweets, as.data.frame))
15 | #dim(df)
16 |
17 | # build a corpus, which is a collection of text documents
18 | # VectorSource specifies that the source is character vectors.
19 | myCorpus <- Corpus(VectorSource(df$text))
20 |
21 | #After that, the corpus needs a couple of transformations, including changing letters to lower case, removing punctuations/numbers and removing stop words. The general English stop-word list is tailored by adding "available" and "via" and removing "r".
22 | myCorpus <- tm_map(myCorpus, tolower)
23 | # remove punctuation
24 | myCorpus <- tm_map(myCorpus, removePunctuation)
25 | # remove numbers
26 | myCorpus <- tm_map(myCorpus, removeNumbers)
27 | # remove stopwords
28 | # keep "r" by removing it from stopwords
29 | myStopwords <- c(stopwords('english'), "available", "via")
30 | idx <- which(myStopwords == "r")
31 | myStopwords <- myStopwords[-idx]
32 | myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
33 |
34 | #Stemming Words
35 | dictCorpus <- myCorpus
36 | # stem words in a text document with the snowball stemmers,
37 | # which requires packages Snowball, RWeka, rJava, RWekajars
38 | myCorpus <- tm_map(myCorpus, stemDocument)
39 | # inspect the first three ``documents"
40 | #inspect(myCorpus[1:3])
41 |
42 | # stem completion
43 | myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
44 |
45 | #inspect(myCorpus[1:3])
46 |
47 | #Building a Document-Term Matrix
48 | myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
49 | inspect(myDtm[266:270,31:40])
50 |
51 | #Frequent Terms and Associations
52 | findFreqTerms(myDtm, lowfreq=10)
53 |
54 | # which words are associated with "r"?
55 | #findAssocs(myDtm, 'r', 0.30)
56 |
57 | # which words are associated with "mining"?
58 | # Here "miners" is used instead of "mining",
59 | # because the latter is stemmed and then completed to "miners". :-(
60 | #findAssocs(myDtm, 'miners', 0.30)
61 |
62 |
63 | library(wordcloud)
64 | m <- as.matrix(myDtm)
65 | # calculate the frequency of words
66 | v <- sort(rowSums(m), decreasing=TRUE)
67 | myNames <- names(v)
68 | #k <- which(names(v)=="miners")
69 | #myNames[k] <- "mining"
70 | d <- data.frame(word=myNames, freq=v)
71 | wordcloud(d$word, d$freq, min.freq=3)
72 |
--------------------------------------------------------------------------------
/learning/DataParser.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(RJSONIO)
8 | library(stringr)
9 |
10 | #Set content type for output
11 | setContentType("text/html")
12 | #Query Engine - dfp datafilepath
13 | dfp <- NULL
14 | udir <- NULL
15 |
16 | #Convert to a ROW JSON
17 | getRowWiseJson <- function (jsonDT) {
18 | row.json <- apply(jsonDT, 1, toJSON)
19 | json.st <- paste('[', paste(row.json, collapse=', '), ']')
20 | return (json.st)
21 | }
22 |
23 |
24 | # Read SachinTestRecords.csv
25 | readDataSet <- function(file.name) {
26 | return(
27 | read.csv(file.name, na.strings="-", as.is=TRUE, header=TRUE,
28 | stringsAsFactors=FALSE, strip.white=TRUE)
29 | )
30 | }
31 | readRequest <- function() {
32 | #GetString <- str(GET)
33 | #cat(GetString)
34 | dfp <<- GET$dfp
35 | udir <<- GET$udir
36 | #cat(d, "," ,m, "," ,f, "," ,qds, sep=" ")
37 | #cat("
")
38 | }
39 | writeResponse <- function(df, datafile, dsID) {
40 | sJSON = paste('{"upad":"', datafile,'","dsID":"', dsID,'","colTypes":',toJSON(unlist(lapply(df, class), use.names=FALSE)),', "rows":', getRowWiseJson(head(df,50)), '}', sep="")
41 | #Write response back
42 | cat(sJSON)
43 | }
44 | #Step 1 - Read Request, query string
45 | readRequest()
46 |
47 | #Step 2 - load environment
48 | df <- readDataSet(dfp)
49 | #Remove white space in col names
50 | colnames(df) <- str_replace(colnames(df), ' ', '')
51 |
52 | #Get Unique ID
53 | dsID <- paste("upad", substr(as.character(unclass(Sys.time())), 12,16), sep="")
54 | out.file.csv <- paste(udir,"analyzed/", dsID, ".csv", sep="")
55 | #out.file.JSON <- paste("pads/meta/",dsID, ".JSON", sep="")
56 | out.file.R.dataframes <- paste(udir, dsID, ".RData", sep="")
57 | write.csv(df, out.file.csv)
58 |
59 | # Write JSON
60 | #jsonSt <- paste('{
61 | # "name":"', file.name, '",
62 | # "id":"', dsID, '",', sep="");
63 | #dlist <- paste()
64 | # "dList": {
65 | # "Player":', toJSON(levels(factor(ipl$Player))), '
66 | # },
67 | # "mList": {
68 | # "Mat":"",
69 | # "Sixes":""
70 | # },
71 | # "dListType": {
72 | # },
73 | # "source":"CricInfo"
74 | # }', sep="")
75 | #file.out <- file(out.file.JSON, 'wt')
76 | #cat(jsonSt, file=file.out, fill = TRUE)
77 | #close(file.out)
78 | #Save transformed dataset as data.frame for later reading
79 | assign(dsID, df)
80 | names <- c(eval(dsID))
81 | save(list=names, file=out.file.R.dataframes)
82 |
83 | #Step 4 - Write Response
84 | writeResponse(df, out.file.R.dataframes, dsID)
85 |
--------------------------------------------------------------------------------
/learning/MiscR.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | responseDF <- switch(fnType,
8 | sum = getSum(df, d, m),
9 | count = getCount(df, d, m),
10 | 'default'
11 | )
12 | if(!is.null(g) && g == "t")
13 | responseDF <- geoCode(responseDF)
14 | return(responseDF)
15 |
16 | # Example of the Google-Maps row format being targeted, e.g.:
17 | # { "name": "Australia",
18 | #   "center": new google.maps.LatLng(41.878113, -87.629798),
19 | #   "data": 2842518
20 | # },
21 |
22 | JSON <- with(head(mbDS), paste('{"d":"',mbDS$Country,
23 | '", "center": new google.maps.LatLng(',
24 | mbDS$Country.Lat, ',', mbDS$Country.Lng,
25 | '), "data":', mbDS$Test.Runs,
26 | '},', sep=""))
27 |
28 | m <- "Country"
29 | lat <- paste(m,".Lat", sep="")
30 | sumDS <- ddply(mbDS, .(Country), summarise, Test.Runs=sum(Test.Runs, na.rm=TRUE))
31 | JSON <- with(sumDS, paste('{"d":"',sumDS[m],
32 | '", "center": new google.maps.LatLng(',
33 | mbDS[lat][sumDS[m]==mbDS[m]], ',', mbDS$Country.Lng[mbDS$Country=="India"],
34 | '), "data":', sumDS$Test.Runs,
35 | '},', sep=""))
36 |
37 |
38 | lookupLat <- function(m, i="India")
39 | {
40 | lat <- paste(m,".Lat",sep="")
41 | return(mbDS[lat][mbDS[m]==i][1])
42 | }
43 | lookupLat("Country", "India")
44 |
45 |
46 | with(sumDS, data.frame(Country, Test.Runs, laply(Country, function(x){getGeoCode(x, x)})))
47 |
48 | ddply(mbDS, "Opposition", function(x) c(Lat=unique(x$Opposition.Lat, Lng=unique(x$Opposition.Lng))))
49 |
50 | with(sumDS, data.frame(Country, Test.Runs, laply(Country, function(x){lookupGeoCode(x)})))
51 |
52 |
53 | with(responseDF, paste('{"d":"',responseDF[m],
54 | '", "center": new google.maps.LatLng(',
55 | df[latName][df[m]==responseDF[m]][1], ',', df[lngName][df[m]==responseDF[m]][1],
56 | '), "data":', mbDS$Test.Runs,
57 | '},', sep=""))
58 |
59 |
60 | sumDS <- ddply(mbDS, .(Opposition), summarise, Test.Runs=sum(Test.Runs, na.rm=TRUE))
61 |
62 | d <- "Opposition"
63 | latName <- paste(d, ".Lat", sep="")
64 | lngName <- paste(d, ".Lng", sep="")
65 | m <- "Test.Runs"
66 | responseDF <- sumDS
67 | df <- mbDS
68 | with(responseDF, paste('{"d":"',responseDF[d],
69 | '", "center": new google.maps.LatLng(',
70 | df[latName][df[d]==responseDF[d]][1],
71 | ',', df[lngName][df[d]==responseDF[d]][1],
72 | '), "data":', responseDF[m],
73 | '},', sep=""))
74 |
--------------------------------------------------------------------------------
/learning/TransformDataSet.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga/Alto")
9 | library("chron")
10 | library("plyr")
11 | library("RJSONIO")
12 | library("data.table")
13 | library("ggplot2")
14 | in.file.name <- "datasets/BollywoodCinema-1940-2008.csv"
15 | out.file.name <- "datasets/masterblaster/TransformedSachinTestRecords.csv"
16 | out.file.JSON <- "datasets/masterblaster/SachinTestRecords.JSON"
17 | out.file.R.dataframes <- "datasets/masterblaster/MB.RData"
18 | dt <- data.table(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE,
19 | stringsAsFactors=FALSE, strip.white=TRUE))
20 |
21 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December")
22 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
23 |
24 |
25 | # Line plot
26 | ggplot(totmidc, aes(variable, value)) + geom_line() + xlab("") + ylab("")
27 |
28 | # Bar plot
29 | # Note that the parameter stat="identity" passed to geom_bar()
30 | ggplot(totmidc, aes(x=variable, y=value)) + geom_bar(stat="identity") + xlab("") + ylab("")
31 |
32 |
33 | d <- dt[,list(Deols=sum(grepl("deol|Dharmendra",Cast, ignore.case=TRUE)),
34 | Kapoors=sum(grepl("kapoor",Cast, ignore.case=TRUE)),
35 | Khans=sum(grepl("khan",Cast, ignore.case=TRUE)),
36 | Bachchans=sum(grepl("bachchan",Cast, ignore.case=TRUE))),
37 | by=year]
38 | d1 <- melt(d, id.var="year")
39 | ggplot(d1)+geom_bar(aes(x=year,y=value,fill=variable),stat='identity')
40 |
41 | ggplot(d, aes(year))+ geom_bar() +facet_wrap(~n)
42 | ggplot(d, aes(x=factor(year), y=c(Deols, Kapoors))) + geom_bar(stat="identity") + xlab("") + ylab("Deols")
43 | n <- c("Deols", "Kapoors")
44 | ggplot(d, aes(x=year)) + geom_bar(stat="identity") + facet_wrap(n)
45 |
46 |
47 |
48 | dat<-data.frame(num=1:3,usage=c(4,2,5),cap=c(10,20,10),diff=c(6,18,5))
49 | dat.melt<-melt(dat,id.var=c('num','cap'))
50 | ggplot(dat.melt)+geom_bar(aes(x=num,y=value,fill=variable),stat='identity')
51 |
52 | #IPL DS
53 | in.file.name <- "pads/data/pad56638.csv" pad89706
54 | dt <- data.table(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE,
55 | stringsAsFactors=FALSE, strip.white=TRUE))
56 | dt[,Sixes, by=Player][order(-Sixes)]
57 |
58 | getCount <- function(dat, expr, gby) {
59 | e <- substitute(expr)
60 | b <- substitute(gby)
61 | print(dat[,eval(e),by=b])
62 | }
63 | getCount(dt, sum(Sixes), Player)
64 |
65 | q <- quote(Sixes)
66 | q1 <- quote(Player)
67 | dt[,eval(q), by=q1][order(-eval(q))]
68 | dt[,q,by=q1]
69 |
70 | #IPL DS
71 | in.file.name <- "pads/data/pad89706.csv"
72 | mbdt <- data.table(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE,
73 | stringsAsFactors=FALSE, strip.white=TRUE))
74 | mbdt[,list(sum(as.integer(Test_Runs), na.rm=TRUE)), by=Year][order(Year)]
75 |
--------------------------------------------------------------------------------
/learning/geoCode.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library("plyr")
8 |
9 | mb.df <- "MB.RData"
10 | load(mb.df)
11 |
12 | getGeoCode <- function(str)
13 | {
14 | print(str)
15 | library("RJSONIO")
16 | str <- gsub(' ','%20',str)
17 | connectStr <- paste('http://maps.google.com/maps/api/geocode/json?sensor=false&address=',str, sep="")
18 | print(connectStr)
19 | #data.json <- fromJSON(paste(readLines(url("http://maps.google.com/maps/api/geocode/json?sensor=false&address=South%20Africa")), collapse=""))
20 | con <- url(connectStr); data.json <- fromJSON(paste(readLines(con), collapse=""))
21 | close(con)
22 | data.json <- unlist(data.json)
23 | #print(data.json["results.geometry.location.lat"])
24 | #return (lat = data.json["results.geometry.location.lat"])
25 | return (c(lat=data.json["results.geometry.location.lat"], lng=data.json["results.geometry.location.lng"]))
26 | }
27 | printVal <- function(val){
28 | print(val)
29 | return ("YO")
30 | }
31 | shortDS <- with(head(mbDS, 20), data.frame(Opposition, Ground, Toss))
32 | shortDS <- mutate(shortDS, Opposition, Ground, Toss, lat=printVal(Opposition))
33 |
34 | cn <- levels(mbDS$Opposition)
35 | llply(cn, function(x) {getGeoCode(x)})
36 | ds <- transform(mbDS, lat=getGeoCode(Opposition))
37 | d <- data.frame(Country=cn)
38 | d$lng <- apply(d, 1, function(x) {getGeoCode(x)})
39 |
40 |
41 | printVal <- function(val){
42 | print(val)
43 | return ("YO")
44 | }
45 | ddply(shortDS, "Opposition", function(x){ getGeoCode(x$Opposition[1])})
46 |
47 |
48 |
49 | #************* Working Version
50 | df <- NULL
51 | #code <- with(head(mbDS, 4), getGeoCode(Opposition))
52 | #m <- ddply(mbDS, "Opposition", function(x){c("lat", "lag")})
53 | #t <- getGeoCode("Bangalore, India")
54 | cn <- levels(mbDS$Opposition)
55 | for(i in 1:length(cn)) {
56 | t <- getGeoCode(cn[i])
57 | r <- c(cn[i], t)
58 | df <- rbind(df, r)
59 | }
60 | #************* Working Version
61 |
62 | getGeoCode <- function(str)
63 | {
64 | library("RJSONIO")
65 | str <- gsub(' ','%20',str)
66 | connectStr <- paste('http://maps.google.com/maps/api/geocode/json?sensor=false&address=',str, sep="")
67 | #print(connectStr)
68 | con <- url(connectStr)
69 | data.json <- fromJSON(paste(readLines(con), collapse=""))
70 | close(con)
71 | data.json <- unlist(data.json)
72 | #print(data.json["results.geometry.location.lat"])
73 | #return (lat = data.json["results.geometry.location.lat"])
74 | return (c(lat=data.json["results.geometry.location.lat"], lng=data.json["results.geometry.location.lng"]))
75 | }
76 |
77 | shortDS <- with(head(mbDS, 20), data.frame(Opposition, Ground.Country, Toss))
78 | #v <- shortDS$Ground.Country
79 | #g <- laply(v, function(x){getGeoCode(x)})
80 | shortDS <- with(shortDS, data.frame(Opposition, Ground.Country, Toss,
81 | laply(Ground.Country, function(val){getGeoCode(val)})))
82 | closeAllConnections()
83 |
84 |
--------------------------------------------------------------------------------
/GenerateFredPADS.R:
--------------------------------------------------------------------------------
1 | # Generate FRED PADS
2 | # Author: Jitender Aswani, Co-Founder @datadolph.in
3 | # Date: 3/15/2013
4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
6 | # All rights reserved.
7 | #
8 |
9 | #Generate PADS from FRED data
10 | setwd("/Users/homemac/R")
11 | source("CreatePADS.R")
12 |
13 | padifyFRED <- function(){
14 | folder.path = "./raw-data/FRED/"
15 | file.name = "FRED_series.csv"
16 | fred <- read.csv(paste(folder.path, file.name, sep=""), stringsAsFactors=F)
17 | #pads <- data.frame(pad_id=character(), title=character(), desc=character(), records=numeric(), columns=numeric(),
18 | # category_id=character(), subcategory_id=character(), analyzed=numeric(), stories_created=numeric(),
19 | # stories_published=numeric(), "source"=character(), source_data_file=character(),
20 | # created_on=character(), last_updated=character(), tags=character())
21 | pads <- getSystemFREDPads()
22 | solr.index.docs <- list()
23 | series <- list()
24 | series["source"] <- "FRED"
25 | #for(i in 1:nrow(fred)){
26 | for(i in 1:2){
27 | #series["title"] <- ifelse(nchar(fred$Title[i]) > 58,
28 | # paste(substr(fred$Title[i], 1, 58), "...", sep=""), fred$Title[i])
29 | series["title"] <- fred$Title[i]
30 | series["desc"] <- paste(fred$Title[i], " (Units:", fred$Units[i],
31 | ", Frequency:", fred$Frequency[i], ", Seasonal Adjustmet: ",
32 | fred$Seasonal.Adjustment[i], ")", sep="")
33 | series["category"] <- fred$category_name[i]
34 | series["subcategory"] <- fred$subcategory_name[i]
35 | series["category_id"]<- fred$category_id[i]
36 | series["subcategory_id"]<- fred$subcategory_id[i]
37 |
38 | series.data <- read.csv(paste(folder.path, "data/", gsub(" ", "", gsub("\\\\", "/", fred$File[i])), sep=""),stringsAsFactors=F)
39 | print(paste("Starting file #", i, " name: " , fred$File[i], sep=""))
40 | #print(series.data)
41 | #print(series)
42 | #Create pad and get pad meta data
43 | pmd <- createPAD(series.data, series$title,
44 | series$desc, series$category, series$subcategory,
45 | series$source, paste(series$name, series$country, sep="."),
46 | tolower(paste(series$category, series$subcategory, series$source, sep=",")))
47 | #l <- list(pmd$id, pmd$title, pmd$desc, pmd$records, pmd$columns, series["category_id"],
48 | # series["subcategory_id"], pmd$analyzed, pmd$stories_created, pmd$stories_published,
49 | # pmd$src, pmd$src_file, "", "", pmd$tags)
50 |
51 | pads <- rbind(pads,
52 | data.frame(pad_id=pmd$id[1], title=pmd$title[1], desc=pmd$desc[1],
53 | records=pmd$records[1], columns=pmd$columns[1],
54 | category_id=series$category_id, subcategory_id=series$subcategory_id,
55 | analyzed=pmd$analyzed[1], stories_created=pmd$stories_created[1],
56 | stories_published=pmd$stories_published[1],
57 | "source"=pmd$src[1], source_data_file=pmd$src_file[1], created_on="", last_updated="",
58 | tags=pmd$tags[1]))
59 | #
60 | #generate solr doc
61 | #
62 | doc <- list("type"="pad", "id" = pmd$id, "title" = pmd$title, "desc"=pmd$desc, "category"=pmd$category,
63 | "subcategory"=pmd$subcategory, "tags"=pmd$tags, "author"="system")
64 | solr.index.docs[[length(solr.index.docs)+1]] <- doc
65 | }
66 | # now save solr doc
67 | generateSolrDocFromAllDocs(solr.index.docs)
68 |
69 | # Save pads to a csv file
70 | saveData(pads, "./pads/system_pads.csv")
71 |
72 | #now save pads to db
73 | #scon <- getDBCon()
74 | #dbRemoveTable(scon, "system_pads")
75 | #dbWriteTable(scon, name="system_pads", value=pads, row.names = F, overwrite = T)
76 | #disconnectDB(scon)
77 |
78 | }
79 |
80 | padifyFRED()
--------------------------------------------------------------------------------
/learning/TransformFailedBanks.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga/Alto")
9 | library("chron")
10 | library("plyr")
11 | library("RJSONIO")
12 | #Get Unique ID
13 | dsID <- paste("pad", substr(as.character(unclass(Sys.time())), 12,16), sep="")
14 | out.file.csv <- paste("pads/data/",dsID, ".csv", sep="")
15 | out.file.JSON <- paste("pads/meta/",dsID, ".JSON", sep="")
16 | out.file.R.dataframes <- paste("pads/cache/", dsID, ".RData", sep="")
17 | in.file.name <- "datasets/failedbanks/failedbanksdata.csv"
18 |
19 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December")
20 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
21 |
22 | # Read *.csv
23 | readDataSet <- function(file.name) {
24 | return(
25 | read.csv(file.name, na.strings="-", as.is=TRUE, header=TRUE,
26 | stringsAsFactors=FALSE, strip.white=TRUE)
27 | )
28 | }
29 |
30 | # Transform raw dataset
31 | transformDataSet <- function(df) {
32 | tdf <- mutate(df,
33 | Date=as.Date(Date, "%m/%d/%Y"),
34 | Day=factor(weekdays(Date), levels=lDays, ordered=TRUE),
35 | Month=factor(months(Date), levels=lMonths, ordered=TRUE),
36 | Year=years(Date),
37 | Bank.Name=Bank.Name,
38 | City=as.factor(City),
39 | State=as.factor(State),
40 | Assets=as.numeric(gsub('[\\$,]', '', Assets)),
41 | Deposits=as.numeric(gsub('[\\$,]', '', Deposits)),
42 | Branches=as.numeric(Branches),
43 | FDIC.Cost=as.numeric(gsub('[\\$,]', '', FDIC.Cost)),
44 | Lat=as.numeric(Lat),
45 | Lng=as.numeric(Lng)
46 | )
47 | return(with(tdf, data.frame(Date, Day, Month, Year,
48 | Bank.Name, City, State, Assets, Deposits, Branches, FDIC.Cost,
49 | Lat, Lng
50 | ))
51 | )
52 | }
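# --- Added illustration, not part of the original script ---
# transformDataSet() above strips "$" and "," from the currency columns before
# coercing them to numeric; a tiny self-contained sketch with made-up values:
as.numeric(gsub('[\\$,]', '', c("$1,234", "$567,890.12")))  # 1234.00 567890.12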
53 |
54 | writeTransformedDS <- function(df, file.name) {
55 | write.csv(df, file.name)
56 | }
57 |
58 | writeJSON <- function(df, file.JSON, id) {
59 |
60 | jsonSt <- paste('{
61 | "datasetName": "FailedBanks",
62 |     "dsID": "', id, '",
63 | "dList": {
64 | "Year":', toJSON(levels(df$Year)), ',
65 | "Month":', toJSON(levels(df$Month)), ',
66 | "Day":', toJSON(levels(df$Day)), ',
67 | "Bank.Name":', toJSON(df$Bank.Name), ',
68 | "City":', toJSON(levels(df$City)), ',
69 | "State":', toJSON(levels(df$State)),
70 | '},
71 | "mList": {
72 | "Assets":"",
73 | "Deposits":"",
74 | "Branches":"",
75 | "FDIC.Cost":""
76 | }
77 | }')
78 | file.out <- file(file.JSON, 'wt')
79 | cat(jsonSt, file=file.out, fill = TRUE)
80 | close(file.out)
81 | }
82 |
83 | # Read Raw File and Transform it
84 | fbDS <- transformDataSet(readDataSet(in.file.name))
85 |
86 | #Write Transformed data set to a CSV file
87 | writeTransformedDS(fbDS, out.file.csv)
88 |
89 | #Generate JSON with m and d
90 | writeJSON(fbDS, out.file.JSON, dsID)
91 |
92 | #Save transformed dataset as data.frame for later reading
93 | #save(fbDS, file=out.file.R.dataframes)
94 | #Save transformed dataset as data.frame for later reading
95 | assign(dsID, fbDS)
96 | names <- c(eval(dsID))
97 | save(list=names, file=out.file.R.dataframes)
98 | pads <- rbind(pads, data.table(id=dsID, title="Failed Banks Analysis in US (2008-2011)",
99 | subtitle="This data set explores US Banks that perished during the 2008 downturn.",
100 | records=nrow(fbDS), analyzed=0, stories=0, json=out.file.JSON, data=out.file.csv,
101 | cache=out.file.R.dataframes, source="Federal Reserve")
102 | )
--------------------------------------------------------------------------------
/learning/SQLQueryDataFrame.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga/Alto")
9 |
10 | #load pre-saved dataframe
11 | out.file.R.dataframes <- "datasets/masterblaster/mb.RData"
12 | load(out.file.R.dataframes)
13 |
14 | library(plyr)
15 |
16 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
17 | ## and aggregate the columns listed in "on"; on can also be a single field
18 | getSum <- function(df, by, on) {
19 | #print(sum(df[,on], na.rm=TRUE))
20 | return(ddply(df, by, colwise(function(col) {sum(col, na.rm=TRUE)}, on)))
21 | }
22 | ## Try it out
23 | crpopy <- getSum(mbDS, .(Opposition, Year), .(Test.Century, Test.Runs))
24 |
25 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
26 | ## and aggregate the columns listed in "on"; on can also be a single field
27 | getCount <- function(df, by, on) {
28 | return(ddply(df, by, colwise(function(col) {length(unique(col))}, on)))
29 | }
30 | ## Try it out
31 | sx <- getCount(mbDS, "Year", c("TestNo", "Ground"))
32 |
33 | #For counting a column that holds 0/1 values :: very similar to getSum - the answer should be the same
34 | getBinaryCount <- function(df, by, on) {
35 | return(ddply(df, by, colwise(function(x) {length(which(x==1))}, on)))
36 | #length(which(mbDS[,"Test.Century"] ==1))
37 | }
38 | ## Try it out
39 | cpo <- getBinaryCount(mbDS, "Opposition", c("Ing1.Century", "Ing2.Century", "Test.Century"))
40 |
41 | ###################### Samples ####################
42 |
43 | #Runs by Opposition
44 | #rpo <- ddply(mbDS, "Opposition", function(x) c(TotalRuns=sum(x$Test_Runs, na.rm=TRUE)))
45 | #rpo <- ddply(mbDS, .(Opposition), summarise, TotalRuns=sum(Test_Runs, na.rm=TRUE))
46 | rpo <- getSum(mbDS, "Opposition", "Test.Runs")
47 |
48 | #Runs by Opposition by Inning
49 | rpopi <- getSum(mbDS, "Opposition", c("Ing1.Runs", "Ing2.Runs", "Test.Runs"))
50 |
51 | #Runs by Year
52 | #rpy <- ddply(mbDS, .(Year), summarise, TotalRuns = sum(Test.Runs, na.rm=TRUE))
53 | rpy <- getSum(mbDS, "Year", "Test.Runs")
54 |
55 | #Tests Per Opposition
56 | tpo <- getCount(mbDS, "Opposition", "TestNo")
57 |
58 | #Tests Per Year
59 | tpy <- getCount(mbDS, "Year", "TestNo")
60 |
61 | #Runs by Opposition by Year - useful for clicking on an opposition and drilling down by year
62 | #rpopy <- ddply(mbDS, .(Year, Opposition), summarise, TotalRuns = sum(Test_Runs, na.rm=TRUE))
63 | rpopy <- getSum(mbDS, c("Opposition", "Year"), "Test.Runs")
64 |
65 | #Tests Per Opposition By Year
66 | tpopy <- getCount(mbDS, c("Opposition", "Year"), "TestNo")
67 |
68 | #Runs and Tests Per Opposition
69 | #rtpo <- getSum1Count2(mbDS, "Opposition", c("Test.Runs", "TestNo")) ### Doesn't work - getSum1Count2 is not defined
70 |
71 | #Total Runs by Opposition by Inning
72 | #rpopi <- ddply(mbDS, .(Opposition), summarise,
73 | # 1stIngRuns = sum(Ing1.Runs, na.rm=TRUE),
74 | # 2ndIngRuns = sum(Ing2.Runs, na.rm=TRUE),
75 | # TotalRuns = sum(Test_Runs, na.rm=TRUE))
76 |
77 | #Total Runs by Year by Opposition - useful for clicking on a year to drill down
78 | #rpypo <- ddply(mbDS, .(Year, Opposition), summarise, TotalRuns = sum(Test_Runs, na.rm=TRUE))
79 |
80 |
81 | #Total Centuries by Opposition
82 | #CPO <- ddply(mbDS, .(Opposition), summarise, TestCenturies = sum(Test.Century==1, na.rm=TRUE))
83 | cpo <- getBinaryCount(mbDS, "Opposition", "Test.Century")
84 | #cpo1 <- getSum(mbDS, "Opposition", "Test.Century")
85 |
86 | cpo <- getBinaryCount(mbDS, "Opposition", c("Ing1.Century", "Ing2.Century", "Test.Century"))
87 |
88 |
89 | #Centuries Per Year
90 | cpy <- getBinaryCount(mbDS, "Year", "Test.Century")
91 |
92 | #Centuries Per Opposition By Year
93 | cpopy <- getBinaryCount(mbDS, c("Opposition", "Year"), "Test.Century")
94 |
95 | #Merge Tests (T), Runs (R) and Centuries (C) by Year
96 | trcpy <- merge(x=merge(tpy,rpy, by.x="Year", by.y="Year"), cpy, by.x="Year", by.y="Year")
97 | colnames(trcpy) <- c("Year", "Tests", "Runs", "Centuries")
98 |
99 |
100 | # Passing multiple arguments ?colwise example
101 | #sum <- function(x){sum(x, na.rm=TRUE)}
102 | #new <- ddply(tips, c("sex", "smoker"), colwise(function(x) {sum(x, na.rm=TRUE)}, c("tip", "total_bill")))
103 | #new <- ddply(tips, c("sex", "smoker"), colwise(sum, c("tip", "total_bill")))
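# --- Added illustration, not part of the original script ---
# The helpers above only depend on plyr, so they work on any data.frame.
# A self-contained check against the built-in iris data instead of mbDS:
getSum(iris, "Species", c("Sepal.Length", "Petal.Length"))  # per-species column sums
getCount(iris, "Species", "Petal.Width")                    # distinct Petal.Width values per species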
--------------------------------------------------------------------------------
/learning/DGQueryEngine.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(plyr)
8 | library(RJSONIO)
9 | #setwd("/Users/homemac/Toga/Alto")
10 | #Set content type for output
11 | setContentType("text/html")
12 | #Query Engine - qds - query this database, d=dimension, m=measure, f=function
13 | qds <- d <- m <- f <- g <- NULL
14 | #d <- ""
15 | #m <- ""
16 | #f <- ""
17 |
18 | #load pre-analyzed and saved dataframe
19 | loadDS <- function(qds) {
20 | if(exists(qds)) {
21 | #lpads <- get("pads", envir=.GlobalEnv)
22 |     #cat(exists(pads$sysID[pads$id==qds], envir=.GlobalEnv), "\n")
23 |     #cat(qds)
24 |     return(get(qds)) #get a local copy in this environment
25 | } else {
26 |     cat(paste('{"response_code": "error", "error_message": "The pad ', qds, ' does not exist. Check that the start-up R script for RApache ran correctly."}', sep=""))
27 | }
28 | }
29 |
30 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
31 | ## and aggregate the columns listed in "on"; on can also be a single field
32 | getSum <- function(df, by, on) {
33 | #print(sum(df[,on], na.rm=TRUE))
34 | #cat(paste(by, on, sep=", "))
35 | return(ddply(df, by, colwise(function(col) {sum(col, na.rm=TRUE)}, on)))
36 | }
37 |
38 | getMean <- function(df, by, on) {
39 | #print(sum(df[,on], na.rm=TRUE))
40 | #cat(paste(by, on, sep=", "))
41 | return(ddply(df, by, colwise(function(col) {mean(col, na.rm=TRUE)}, on)))
42 | }
43 |
44 | #ddply(mbDS, "Opposition", function(tDF){data.frame(Lat=tDF$Opposition.Lat[1], Lng=tDF$Opposition.Lng[1], Test.Runs=sum(tDF$Test.Runs, na.rm=TRUE))})
45 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
46 | ## and aggregate the columns listed in "on"; on can also be a single field
47 | #getCount <- function(df, by, on) {
48 | # return(ddply(df, by, colwise(function(col) {length(unique(col))}, on)))
49 | #}
50 |
51 | getCount<- function(df, by, on) {
52 | lds <- ddply(df, by, "nrow")
53 | colnames(lds) <- c(by,on)
54 | return(lds)
55 | }
56 |
57 | #For counting a column that holds 0/1 values :: very similar to getSum - the answer should be the same
58 | getBinaryCount <- function(df, by, on) {
59 | return(ddply(df, by, colwise(function(x) {length(which(x==1))}, on)))
60 | #length(which(mbDS[,"Test.Century"] ==1))
61 | }
62 |
63 | readRequest <- function() {
64 | #GetString <- str(GET)
65 | #cat(GetString)
66 | qds <<- GET$qds
67 | d <<- GET$d
68 | m <<- GET$m
69 | f <<- GET$f
70 | g <<- GET$g
71 | #cat(d, "," ,m, "," ,f, "," ,qds, sep=" ")
72 | #cat("
")
73 | }
74 |
75 | processRequest <- function(fnType, df){
76 | return(switch(fnType,
77 | s = getSum(df, d, m),
78 | c = getCount(df, d, m),
79 | m = getMean(df,d,m),
80 | d = getValue(df,d,m),
81 | 'default'
82 | )
83 | )
84 | }
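# --- Added note, not part of the original script ---
# Hypothetical request mapping: a query string such as
#   ?qds=pad123&f=s&d=Opposition&m=Test.Runs
# makes readRequest() fill qds/d/m/f, and processRequest("s", df) then
# evaluates getSum(df, "Opposition", "Test.Runs").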
85 |
86 | writeResponse <- function(responseDF, df) {
87 | sJSON <- paste('{"response_code": "ok","', m, '":', toJSON(responseDF[,m]), sep="") #JSON String is not closed.
88 | if(!is.null(g) && g == "t") {
89 | latName <- paste(d, ".Lat", sep="")
90 | lngName <- paste(d, ".Lng", sep="")
91 | sJSON <- paste(sJSON, ', "mapsJSON": [', sep="")
92 | for(i in 1:nrow(responseDF)) {
93 | key <- responseDF[,d][i]
94 |       #cat(paste("\n", key, ", what?"))
95 | j <- match(key, df[,d])
96 | sJSON <- paste(sJSON, '{"d":"', key,
97 | '", "lat":',
98 | df[latName][j,],
99 | ', "lng":', df[lngName][j,],
100 | ', "data":', responseDF[i,2],
101 | '}', sep="")
102 | if(i!=nrow(responseDF))
103 | sJSON <- paste(sJSON, ",", sep="")
104 | }
105 | sJSON <- paste(sJSON, ']', sep="")
106 | }
107 | sJSON = paste(sJSON, "}", sep="")
108 | #Write response back
109 | cat(sJSON)
110 | #cat(ls())
111 | #rm(list=ls()) #Clean up everything - There has to be a better way than this.
112 | #cat(ls())
113 | }
114 |
115 | #Step 1 - Read Request, query string
116 | readRequest()
117 |
118 | #Step 2 - load environment
119 | df <- loadDS(qds)
120 |
121 | if(!is.null(df)) {
122 | #Step 3 - Process Request
123 | responseDF <- processRequest(f, df)
124 | #Step 3.a - Apply Filter
125 |
126 | #Step 4 - Write Response
127 | writeResponse(responseDF, df)
128 | }
--------------------------------------------------------------------------------
/learning/DGQEWebSockets.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 | library(plyr)
7 | library(RJSONIO)
8 | library(data.table)
9 | #Set content type for output
10 | setContentType("text/html")
11 | #Query Engine - qds - query this database, d=dimension, m=measure, f=function
12 | qds <- d <- m <- f <- g <- fv <- NULL
13 |
14 | #Convert to a ROW JSON
15 | getRowWiseJson <- function (jsonDT) {
16 | row.json <- apply(jsonDT, 1, toJSON)
17 | json.st <- paste('[', paste(row.json, collapse=', '), ']')
18 | return (json.st)
19 | }
20 |
21 | #load pre-analyzed and saved dataframe
22 | loadDS <- function(qds) {
23 | if(exists(qds, envir=.GlobalEnv)) {
24 | #lpads <- get("pads", envir=.GlobalEnv)
25 |     #cat(exists(pads$sysID[pads$id==qds], envir=.GlobalEnv), "\n")
26 |     return(get(qds, envir=.GlobalEnv)) #get a local copy in this environment
27 | } else {
28 | #print("PAD didn't exist. Check to see if you start up R script in RApache ran correctly.")
29 | cat(paste('{"response_code": "error", "error_message": "The pad ', qds, ' does not exist. Check to see if you start up R script in RApache ran correctly."}', sep=""))
30 | }
31 | }
32 |
33 | getData <- function(dat, expr, gby) {
34 | e <- substitute(expr)
35 | b <- substitute(gby)
36 | #cat("
", gby)
37 | #cat(b)
38 | #cat(dat[,eval(e), by=gby])
39 | return(dat[,eval(e),by=b])
40 | }
41 | #getData(padDT, sum(eval(m), na.rm=TRUE), d)
42 |
43 | readRequest <- function() {
44 | #GetString <- str(GET)
45 | #cat(GetString)
46 | qds <<- GET$qds
47 | d <<- GET$d
48 | m <<- GET$m
49 | f <<- GET$f
50 | g <<- GET$g
51 | fv <<- GET$fv
52 | #cat(d, "," ,m, "," ,f, "," ,qds, sep=" ")
53 | #cat("
")
54 | }
55 | #Process Request
56 | processRequest <- function(fnType, padDT, mea, dim, fvalue){
57 | rs <- NULL
58 | if(is.null(fvalue)) {
59 | switch(fnType,
60 |       d={rs <- padDT[,list(m=get(mea)),by=dim]}, #default; by accepts a character column name
61 | s= { rs <- padDT[, list(m=sum(get(mea), na.rm=TRUE)), by=dim]}, #sum
62 | c= {rs <- padDT[, list(m=length(na.omit(get(mea)))), by=dim]}, #count
63 | m= {rs <- padDT[,list(m=mean(get(mea), na.rm=TRUE)),by=eval(dim)]}, #Average, eval for dim also works
64 | 'default'
65 | )
66 | } else {
67 | switch(fnType,
68 |       d={rs <- padDT[,list(m=get(mea)),by=dim][get(dim)==fvalue]}, #default; by accepts a character column name
69 | s= { rs <- padDT[, list(m=sum(get(mea), na.rm=TRUE)), by=dim][get(dim)==fvalue]}, #sum
70 | c= {rs <- padDT[, list(m=length(na.omit(get(mea)))), by=dim][get(dim)==fvalue]}, #count
71 | m= {rs <- padDT[,list(m=mean(get(mea), na.rm=TRUE)),by=eval(dim)][get(dim)==fvalue]}, #Average, eval for dim also works
72 | 'default'
73 | )
74 | }
75 | return (rs)
76 | }
77 | #test this out
78 | #dt <- data.table(iris)
79 | #m <- "Sepal.Length"
80 | #d <- "Species"
81 | #fv <- NULL
82 | #processRequest("s", padDT=dt, mea=m, dim=d, fvalue=fv)
83 |
84 | writeResponse <- function(responseDF, padDT) {
85 | if(is.null(d)) {
86 | responseDF$d = ""
87 | } else #Change the name to d
88 | setnames(responseDF, 1, "d")
89 | #sJSON <- paste('{"response_code": "ok","', m, '":', toJSON(responseDF[,m]), sep="") #JSON String is not closed.
90 | sJSON <- paste('{"response_code": "ok","chartData":', getRowWiseJson(responseDF), sep="") #JSON String is not closed.
91 | if(!is.null(g) && g == "t") {
92 | latName <- paste(d, ".Lat", sep="")
93 | lngName <- paste(d, ".Lng", sep="")
94 | sJSON <- paste(sJSON, ', "mapsJSON": [', sep="")
95 | for(i in 1:nrow(responseDF)) {
96 | key <- responseDF[,d][i]
97 |       #cat(paste("\n", key, ", what?"))
98 | j <- match(key, padDT[,d])
99 | sJSON <- paste(sJSON, '{"d":"', key,
100 | '", "lat":',
101 | padDT[latName][j,],
102 | ', "lng":', padDT[lngName][j,],
103 | ', "data":', responseDF[i,2],
104 | '}', sep="")
105 | if(i!=nrow(responseDF))
106 | sJSON <- paste(sJSON, ",", sep="")
107 | }
108 | sJSON <- paste(sJSON, ']', sep="")
109 | }
110 | sJSON = paste(sJSON, "}", sep="")
111 | #Write response back
112 | cat(sJSON)
113 | #cat(ls())
114 | rm(list=ls()) #Clean up everything - There has to be a better way than this.
115 | #cat(ls())
116 | }
117 |
118 | #Step 1 - Read Request, query string
119 | readRequest()
120 |
121 | #Step 2 - load environment
122 | padDT <- data.table(loadDS(qds))
123 |
124 | #Step 3 - Process Request
125 | responseDF <- processRequest(f, padDT, mea=m, dim=d, fvalue=fv)
126 |
127 | #Step 4 - Write Response
128 | writeResponse(responseDF, padDT)
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/learning/DKOMSentimentAnalysis.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga/Alto")
9 | # Revised Sentiment Analysis using Hu & Liu's library of 6,800 negative and positive words
10 | library("twitteR")
11 | library("plyr")
12 | library("stringr")
13 | library("doBy")
14 |
15 | #Populate the list of sentiment words from Hu and Liu (http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html)
16 | huliu.pwords <- scan('opinion-lexicon/positive-words.txt', what='character', comment.char=';')
17 | huliu.nwords <- scan('opinion-lexicon/negative-words.txt', what='character', comment.char=';')
18 |
19 | # Add some words
20 | huliu.nwords <- c(huliu.nwords,'wtf','wait','waiting','epicfail', 'crash', 'bug', 'bugy', 'bugs', 'slow', 'lie')
21 | #Remove some words
22 | huliu.nwords <- huliu.nwords[!huliu.nwords=='sap']
23 | huliu.nwords <- huliu.nwords[!huliu.nwords=='cloud']
24 | #which('sap' %in% huliu.nwords)
25 |
26 | cList <- list(twitterTag=c("#DKOM"))
27 |
28 | getTweets <- function(tag) {
29 | # Get 1500 tweets - an individual is only allowed to get 1500 tweets
30 | return(searchTwitter(tag, n=1500))
31 | #since=as.character(Sys.Date()-3 - If you want to restrict this to just last few days tweets
32 | }
33 |
34 |
35 | getSentimentScore <- function(tweets)
36 | {
37 | scores <- laply(tweets, function(singleTweet) {
38 | # clean up tweets with R's regex-driven global substitute, gsub()
39 | singleTweet <- gsub('[[:punct:]]', '', singleTweet)
40 | singleTweet <- gsub('[[:cntrl:]]', '', singleTweet)
41 | singleTweet <- gsub('\\d+', '', singleTweet)
42 |     #Convert to lower case for comparison, split the tweet into single words and flatten the list
43 | tweetWords <- unlist(str_split(tolower(singleTweet), '\\s+'))
44 | # compare our words to the dictionaries of positive & negative terms
45 | # match() returns the position of the matched term or NA, apply is.na to convert to boolean
46 | pos.matches <- !is.na(match(tweetWords, huliu.pwords))
47 | neg.matches <- !is.na(match(tweetWords, huliu.nwords))
48 | # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
49 | score <- sum(pos.matches) - sum(neg.matches)
50 | return(score)
51 | })
52 | return(data.frame(SentimentScore=scores, Tweet=tweets))
53 | }
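# --- Added illustration, not part of the original script ---
# The scorer counts positive minus negative lexicon hits per tweet. Commented out
# because it needs the opinion-lexicon files loaded above; the strings are made up:
# getSentimentScore(c("great keynote, love the demos", "app is slow and buggy"))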
54 |
55 | #Step #1
56 |
57 | sentimentScoreDF <- data.frame()
58 | sentimentScoreDF <- ldply(cList$twitterTag, function(twitterTag) {
59 | tweets <- getTweets(twitterTag)
60 | tweets.text <- laply(tweets,function(t)t$getText())
61 | sentimentScore <- getSentimentScore(tweets.text)
62 | sentimentScore$TwitterTag <- twitterTag
63 | rbind(sentimentScoreDF, sentimentScore)
64 | })
65 |
66 | # Get rid of tweets that have zero score and separate +ve from -ve tweets
67 | sentimentScoreDF$posTweets <- as.numeric(sentimentScoreDF$SentimentScore >=1)
68 | sentimentScoreDF$negTweets <- as.numeric(sentimentScoreDF$SentimentScore <=-1)
69 |
70 | #Summarize findings
71 | summaryDF <- ddply(sentimentScoreDF,"TwitterTag", summarise,
72 | TotalTweetsFetched=length(SentimentScore),
73 | PositiveTweets=sum(posTweets), NegativeTweets=sum(negTweets),
74 | AverageScore=round(mean(SentimentScore),3))
75 |
76 | summaryDF$TotalTweets <- summaryDF$PositiveTweets + summaryDF$NegativeTweets
77 |
78 | #Get Sentiment Score
79 | summaryDF$Sentiment <- round(summaryDF$PositiveTweets/summaryDF$TotalTweets, 2)
80 |
81 | #Order
82 | orderBy(~-Sentiment,summaryDF)
83 |
84 | write.csv(summaryDF, "DKOM-Results.csv")
85 |
86 |
87 |
88 |
89 | #summary <- ddply(sentimentScoreDF, "TwitterTag", function(df) { MeanScore=round(mean(df$SentimentScore),3)})
90 | summary <- ddply(sentimentScoreDF, "TwitterTag", summarise,
91 | MeanScore=round(mean(SentimentScore),3), TweetCount=length(SentimentScore))
92 |
93 | period <- seq(as.Date("2010-01-01"), by="month", length.out=24)
94 | tweets <- llply(period, function(since){
95 | until <- seq(since, by="month", length.out=2)[2]
96 | print(since)
97 | print(until)
98 | tt <- searchTwitter("#SALESforce", n=1500, since=since, until=until )
99 | print(length(tt))
100 | return(length(tt))
101 | })
102 |
103 | getAuthorized <- function() {
104 | cred <- OAuthFactory$new(consumerKey="g8sky1SyttNWPsgZOGAA",
105 | consumerSecret="zuJvHtN63yLSYh9Bk2Qc0kd5kGIV9mtzb7r6XPVqq8",
106 | requestURL="https://api.twitter.com/oauth/request_token",
107 | accessURL="https://api.twitter.com/oauth/access_token",
108 | authURL="https://api.twitter.com/oauth/authorize")
109 | cred$handshake()
110 | registerTwitterOAuth(cred)
111 | #3739000
112 | save(cred, file="TwitterAanalyticsForAll.Cred")
113 | }
114 |
115 | loadCred <- function(){
116 |   load("TwitterAanalyticsForAll.Cred")  # load() restores the saved 'cred' object; it does not return it
117 | cred$handshake()
118 | registerTwitterOAuth(cred)
119 | }
120 |
--------------------------------------------------------------------------------
/learning/TwiterSentimentAnalysis.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga/Alto")
9 | # Revised Sentiment Analysis using Hu & Liu's library of 6,800 negative and positive words
10 | library("twitteR")
11 | library("plyr")
12 | library("stringr")
13 | library("doBy")
14 |
15 | #Populate the list of sentiment words from Hu and Liu (http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html)
16 | huliu.pwords <- scan('opinion-lexicon/positive-words.txt', what='character', comment.char=';')
17 | huliu.nwords <- scan('opinion-lexicon/negative-words.txt', what='character', comment.char=';')
18 |
19 | # Add some words
20 | huliu.nwords <- c(huliu.nwords,'wtf','wait','waiting','epicfail', 'crash', 'bug', 'bugy', 'bugs', 'slow', 'lie')
21 | #Remove some words
22 | huliu.nwords <- huliu.nwords[!huliu.nwords=='sap']
23 | huliu.nwords <- huliu.nwords[!huliu.nwords=='cloud']
24 | #which('sap' %in% huliu.nwords)
25 |
26 | cList <- list(twitterTag=c("@Netflix"))
27 |
28 | getTweets <- function(tag) {
29 | # Get 1500 tweets - an individual is only allowed to get 1500 tweets
30 | return(searchTwitter(tag, n=1500))
31 | #since=as.character(Sys.Date()-3 - If you want to restrict this to just last few days tweets
32 | }
33 |
34 |
35 | getSentimentScore <- function(tweets)
36 | {
37 | scores <- laply(tweets, function(singleTweet) {
38 | # clean up tweets with R's regex-driven global substitute, gsub()
39 | singleTweet <- gsub('[[:punct:]]', '', singleTweet)
40 | singleTweet <- gsub('[[:cntrl:]]', '', singleTweet)
41 | singleTweet <- gsub('\\d+', '', singleTweet)
42 |     #Convert to lower case for comparison, split the tweet into single words and flatten the list
43 | tweetWords <- unlist(str_split(tolower(singleTweet), '\\s+'))
44 | # compare our words to the dictionaries of positive & negative terms
45 | # match() returns the position of the matched term or NA, apply is.na to convert to boolean
46 | pos.matches <- !is.na(match(tweetWords, huliu.pwords))
47 | neg.matches <- !is.na(match(tweetWords, huliu.nwords))
48 | # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
49 | score <- sum(pos.matches) - sum(neg.matches)
50 | return(score)
51 | })
52 | return(data.frame(SentimentScore=scores, Tweet=tweets))
53 | }
54 |
55 | #Step #1
56 |
57 | sentimentScoreDF <- data.frame()
58 | sentimentScoreDF <- ldply(cList$twitterTag, function(twitterTag) {
59 | tweets <- getTweets(twitterTag)
60 | tweets.text <- laply(tweets,function(t)t$getText())
61 | sentimentScore <- getSentimentScore(tweets.text)
62 | sentimentScore$TwitterTag <- twitterTag
63 | rbind(sentimentScoreDF, sentimentScore)
64 | })
65 |
66 | # Get rid of tweets that have zero score and separate +ve from -ve tweets
67 | sentimentScoreDF$posTweets <- as.numeric(sentimentScoreDF$SentimentScore >=1)
68 | sentimentScoreDF$negTweets <- as.numeric(sentimentScoreDF$SentimentScore <=-1)
69 |
70 | #Summarize findings
71 | summaryDF <- ddply(sentimentScoreDF,"TwitterTag", summarise,
72 | TotalTweetsFetched=length(SentimentScore),
73 | PositiveTweets=sum(posTweets), NegativeTweets=sum(negTweets),
74 | AverageScore=round(mean(SentimentScore),3))
75 |
76 | summaryDF$TotalTweets <- summaryDF$PositiveTweets + summaryDF$NegativeTweets
77 |
78 | #Get Sentiment Score
79 | summaryDF$Sentiment <- round(summaryDF$PositiveTweets/summaryDF$TotalTweets, 2)
80 |
81 | #Order
82 | orderBy(~-Sentiment,summaryDF)
83 |
84 | write.csv(summaryDF, "Netflix-Results.csv")
85 |
86 |
87 |
88 |
89 | #summary <- ddply(sentimentScoreDF, "TwitterTag", function(df) { MeanScore=round(mean(df$SentimentScore),3)})
90 | summary <- ddply(sentimentScoreDF, "TwitterTag", summarise,
91 | MeanScore=round(mean(SentimentScore),3), TweetCount=length(SentimentScore))
92 |
93 | period <- seq(as.Date("2010-01-01"), by="month", length.out=24)
94 | tweets <- llply(period, function(since){
95 | until <- seq(since, by="month", length.out=2)[2]
96 | print(since)
97 | print(until)
98 | tt <- searchTwitter("#SALESforce", n=1500, since=since, until=until )
99 | print(length(tt))
100 | return(length(tt))
101 | })
102 |
103 | getAuthorized <- function() {
104 | cred <- OAuthFactory$new(consumerKey="g8sky1SyttNWPsgZOGAA",
105 | consumerSecret="zuJvHtN63yLSYh9Bk2Qc0kd5kGIV9mtzb7r6XPVqq8",
106 | requestURL="https://api.twitter.com/oauth/request_token",
107 | accessURL="https://api.twitter.com/oauth/access_token",
108 | authURL="https://api.twitter.com/oauth/authorize")
109 | cred$handshake()
110 | registerTwitterOAuth(cred)
111 | #3739000
112 | save(cred, file="TwitterAanalyticsForAll.Cred")
113 | }
114 |
115 | loadCred <- function(){
116 |   load("TwitterAanalyticsForAll.Cred")  # load() restores the saved 'cred' object; it does not return it
117 | cred$handshake()
118 | registerTwitterOAuth(cred)
119 | }
120 |
--------------------------------------------------------------------------------
/learning/Olympics.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga")
9 | library(XML)
10 | library("chron")
11 | library(plyr)
12 | library(stringr)
13 | library("RJSONIO")
14 |
15 | #############################
16 | # Olympic Tables
17 | #library(XML)
18 | #theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
19 | #tables <- readHTMLTable(theurl)
20 | #n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
21 | #the picked table is the longest one on the page
22 | #braz <- tables[[which.max(n.rows)]]
23 | #print(braz)
24 | ############################
25 | fixit <- function(x) { # Fix rowspan issue where a single rank is assigned to multiple rows
26 | if(nchar(x[1]) > 3) { # other than rank
27 | for(i in length(x):2) { # Shift items by 1
28 | x[i] = x[i-1]
29 | }
30 | x[1] = -1 #Assign rank column to -1
31 | }
32 | return(x)
33 | }
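# --- Added illustration, not part of the original script ---
# fixit() shifts a row's values one slot to the right (the last value is dropped)
# and writes -1 into the missing rank cell; a made-up vector shows the effect:
fixit(c("United States (USA)", "10", "5", "3", "18"))  # "-1" "United States (USA)" "10" "5" "3"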
34 | #tl <- rbind(n, ddply(medals, .(V1), temp))
35 |
36 | readWikiOT <- function(d) { #Read Wiki olympic medal tables
37 | whichTable <- 1
38 | print(d$year)
39 | if(!(d$year %in% c(1916, 1940, 1944))) { #Code breaks here due to wiki issues
40 | #whichTable <- 3
41 | if(d$year %in% c(1928, 1948, 1972, 1980,1996))
42 | whichTable <- 2
43 | if(d$year %in% c(1960, 2000, 2004, 2008, 2012))
44 | whichTable <- 3
45 | print(whichTable)
46 | medals <- NULL
47 | if(d$year == 1992) { # Remove subscript and superscript text
48 | overview <- htmlParse(d$url,encoding="UTF-8")
49 | temp<-getNodeSet(overview, "/*//sup")
50 | removeNodes(temp)
51 | medals <- try(readHTMLTable(overview, header=FALSE, skip.rows=1, which=whichTable, stringsAsFactors = FALSE))
52 | } else
53 | medals <- try(readHTMLTable(d$url, header=FALSE, skip.rows=1, which=whichTable, stringsAsFactors = FALSE))
54 |
55 | medals <- medals[-nrow(medals),]
56 | if(d$year < 1992)
57 | medals <- ddply(medals, .(V1), fixit)
58 | medals <- medals[, -c(1)]
59 | #colnames(medals) <- c('Nation', 'Gold', 'Silver', 'Bronze', 'Total')
60 | medals <- tm <- with(medals, data.frame(
61 | #Nation=str_trim(sapply(strsplit(str_trim(medals$V2), " \\("), '[', 1)),
62 | #NationCode=sapply(strsplit(str_trim(sapply(strsplit(str_trim(medals$V2), " \\)"), '[', 2)), "\\)"), '[',1),
63 | Country=substr(medals$V2, 1, nchar(medals$V2)-6),
64 | CountryCode=substr(medals$V2, nchar(medals$V2)-3, nchar(medals$V2)-1),
65 | Gold=as.numeric(medals$V3), Silver=as.numeric(medals$V4), Bronze=as.numeric(medals$V5),
66 | Total=as.numeric(medals$V6)))
67 | #medals$Rank <- rank(medals$V3)
68 | #medals <- medals[!c("Rank")]
69 | return (medals)
70 | }
71 | }
72 |
73 | year <- seq(1896, 2012, by=4)
74 | url <- paste("http://en.wikipedia.org/wiki/", year, "_Summer_Olympics_medal_table", sep="")
75 | om <- data.frame(year=year, url=url, stringsAsFactors = FALSE) #Summer Olympics medal-table pages
76 | omDS <- ddply(om, .(year), readWikiOT)
77 | write.csv(omDS, "SummerOlympic-Medals-1896-2012.csv",row.names=FALSE)
78 |
79 | ### Read Bollywood.csv
80 | in.file.name <- "datasets/BollywoodCinema-1940-2008.csv"
81 | dt <- data.frame(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE,
82 | stringsAsFactors=FALSE, strip.white=TRUE))
83 | #Get Unique ID
84 | dsID <- paste("pad", substr(as.character(unclass(Sys.time())), 12,16), sep="")
85 | out.file.csv <- paste("pads/data/",dsID, ".csv", sep="")
86 | out.file.JSON <- paste("pads/meta/",dsID, ".JSON", sep="")
87 | out.file.R.dataframes <- paste("pads/cache/", dsID, ".RData", sep="")
88 | write.csv(dt, out.file.csv)
89 | jsonSt <- paste('{
90 | "datasetName": "Bollywood Cinema (1940-2008)",
91 | "id":"', dsID, '",
92 | "dList": {
93 | "Year":', toJSON(levels(dt$Year)), ',
94 | "Director":', toJSON(levels(dt$Director)), '
95 | },
96 | "mList": {
97 | "Genre1":"",
98 | "Genre2": ""
99 | },
100 | "dListType": {
101 | },
102 | "source":"Wikipedia"
103 | }', sep="")
104 | file.out <- file(out.file.JSON, 'wt')
105 | cat(jsonSt, file=file.out, fill = TRUE)
106 | close(file.out)
107 | #Save transformed dataset as data.frame for later reading
108 | assign(dsID, dt)
109 | names <- c(eval(dsID))
110 | save(list=names, file=out.file.R.dataframes)
111 | pads <- rbind(pads, data.table(id=dsID, title="Bollywood Cinema (1940-2008)",
112 | subtitle="Bollywood Movies from 1940 including director, cast and genre.",
113 | records=nrow(dt), analyzed=0, stories=0, json=out.file.JSON, data=out.file.csv,
114 | cache=out.file.R.dataframes, source="Wiki")
115 | )
116 | #url <- paste("http://www.votesmart.org/candidate/evaluations/", 1:50 , sep = "")
117 | #res <- llply(url, function(i) readHTMLtable(i))
--------------------------------------------------------------------------------
/GenerateWDI.R:
--------------------------------------------------------------------------------
1 | # Script to read wdi data and create tables in WDI database
2 | # Make sure to convert file WDI_Description.csv to WDI_Description.txt - this file is not needed in the DB
3 | # Take back up if necessary
4 | # Author: Jitender Aswani, Co-Founder @datadolph.in
5 | # Date: 3/15/2013
6 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
7 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
8 | # All rights reserved.
9 |
10 | setwd("/Users/homemac/R")
11 |
12 | require("RMySQL")
13 | require("plyr")
14 |
15 | source("Util.R")
16 | #
17 | #initiate a connection to wdi database
18 | #
19 |
20 | getWDIDBCon <- function(multi.statements=FALSE) {
21 | if(multi.statements)
22 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost",
23 | client.flag=CLIENT_MULTI_STATEMENTS))
24 | else
25 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost"))
26 | }
27 |
28 | #
29 | #disconnect a database
30 | #
31 | disconnectDB <- function(con) {
32 | #Close Connection
33 | dbDisconnect(con)
34 | }
35 |
36 |
37 | #
38 | #write a data.frame as a table in WDI database
39 | # this function will overwrite the existing table
40 | #
41 |
42 | writeTable <- function(table.name ,table.data) {
43 | #List tables and fields in a table:
44 | con <- getWDIDBCon()
45 | on.exit(disconnectDB(con)) # On function exit, close connection
46 | if(dbExistsTable(con, table.name)){
47 | dbRemoveTable(con, table.name)
48 | dbWriteTable(con, name=table.name, value=table.data, row.names = F, overwrite = T)
49 | } else {
50 | dbWriteTable(con, name=table.name, value=table.data, row.names = F, overwrite = T)
51 | }
52 |
53 | }
54 |
55 |
56 | saveAllWDIData <- function() {
57 | folder.path = "./raw-data/wdi"
58 | filenames <- list.files(path=folder.path, pattern="*.csv")
59 | for(i in filenames) {
60 | filePath <- file.path(folder.path,i)
61 | wdi.name <- unlist(strsplit(i, "\\."))[1] #Temp - use the file name as the title
62 | wdi.df <- readFile(filePath)
63 | #clean up
64 | wdi.df <- wdi.df[rowSums(is.na(wdi.df)) != ncol(wdi.df),] # remove empty rows
65 | wdi.df <- wdi.df[rowSums(is.na(wdi.df)) != ncol(wdi.df)-1,] # remove rows with just one cell filled out
66 | wdi.df <- wdi.df[,colSums(is.na(wdi.df)) != nrow(wdi.df)] # remove empty cols
67 |
68 |     #some cells have newline characters, remove them
69 | #char.vars <- setdiff(char.vars, "W") # remove a column
70 | char.vars <- names(wdi.df)[sapply(wdi.df, is.character)] #select character variables
71 |     for(j in seq_along(char.vars)) {   # separate index so the outer file-loop variable is not shadowed
72 |       wdi.df[char.vars[j]] = sapply(wdi.df[char.vars[j]], function(x) gsub("\\n", " ", x))
73 |     }
74 |
75 | #clean up column names
76 | colnames(wdi.df) <- replaceMetaChars(colnames(wdi.df))
77 | writeTable(wdi.name, wdi.df)
78 | }
79 | }
80 |
81 | saveAllWDIData()
82 |
83 |
84 | testFunctionDoNoRun <- function(){
85 | wdi.df <- read.csv("./raw-data/wdi/WDI_Series.csv", stringsAsFactors=F)
86 | wdi.df <- wdi.df[rowSums(is.na(wdi.df)) != ncol(wdi.df),] # remove empty rows
87 | wdi.df <- wdi.df[,colSums(is.na(wdi.df)) != nrow(wdi.df)] # remove empty cols
88 |
89 | char.vars <- names(wdi.df)[sapply(wdi.df, is.character)] #select character variables
90 | for(i in 1:length(char.vars)) {
91 | wdi.df[char.vars[i]] = sapply(wdi.df[char.vars[i]], function(x) gsub("\\n", " ", x))
92 | }
93 | writeTable("wdi_series", wdi.df)
94 |
95 |
96 | df1 <- as.data.frame(cbind(sapply(df,function(x) {
97 | if(is.character(x)) {
98 |       x <- gsub("\\n", " ", x)  # keep the cleaned value (the result was previously discarded)
99 | }
100 | return(x)
101 | })))
102 | }
103 |
104 | generateCatsFromWDI <- function() {
105 | wdi.cats <- read.csv("./raw-data/wdi/WDI_DD_Categories.csv", stringsAsFactors=F)
106 | #colnames(wdi.cats)
107 | #unique(wdi.cats$Category)
108 | #unique(wdi.cats$Subcategory)
109 | wdi.cats$Category[wdi.cats$Category=="Economic Policy & Debt"] = "Economics"
110 | wdi.cats$Category[wdi.cats$Category=="Labor & Social Protection"] = "Social"
111 | wdi.cats$Category[wdi.cats$Category=="Public Sector"] = "Government"
112 | wdi.cats$Category[wdi.cats$Category=="Private Sector & Trade"] = "Private Sector"
113 |
114 | #write out the cat and subcats for entering into story pads data
115 | dd.cats <- unique(wdi.cats[,c("Category", "Subcategory")])
116 | colnames(dd.cats) <- tolower(colnames(dd.cats))
117 | write.csv(dd.cats, "./pads/PadsCategory.csv", row.names=F)
118 | #insertCatSubcat()
119 | con <- getDBCon()
120 | dd.cats <- dbReadTable(con, "pads_categories")
121 | dd.subcats <- dbReadTable(con, "pads_subcategories")
122 | disconnectDB(con)
123 | #Assign category id
124 | wdi.cats$category_id = 0
125 | for(i in 1:nrow(dd.cats)){
126 | wdi.cats$category_id[which(wdi.cats$Category==dd.cats$category_name[i])] = dd.cats$category_id[i]
127 | }
128 | #trim white spaces from front and end
129 | #wdi.cats$Subcategory <- gsub("^\\s|\\s$","", wdi.cats$Subcategory)
130 | #assign subcat_id
131 | wdi.cats$subcategory_id = 0
132 | for(i in 1:nrow(dd.subcats)){
133 | wdi.cats$subcategory_id[which(wdi.cats$Subcategory==dd.subcats$subcategory_name[i])] = dd.subcats$subcategory_id[i]
134 | }
135 | colnames(wdi.cats) = c("SeriesCode", "Topic", "category_name", "subcategory_name", "category_id", "subcategory_id")
136 | writeTable("wdi_categories", wdi.cats)
137 | }
138 |
--------------------------------------------------------------------------------
/learning/birbal.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | #This version will use data.table in place of plyr.
8 | #library(plyr)
9 | library(RJSONIO)
10 | library(data.table)
11 | #setwd("/Users/homemac/Toga/Alto")
12 |
13 | #Set content type for output
14 | setContentType("text/html")
15 | #Query Engine - qds - query this database, d=dimension, m=measure, g=geo, f=function
16 | qds <- d <- m <- f <- g <- NULL
17 | #d <- ""
18 | #m <- ""
19 | #f <- ""
20 |
21 | #load pre-analyzed and saved dataframe
22 | loadDS <- function(qds) {
23 | if(exists("pads", envir=.GlobalEnv)) {
24 | #lpads <- get("pads", envir=.GlobalEnv)
25 |     #cat(exists(pads$sysID[pads$id==qds], envir=.GlobalEnv), "\n")
26 |     return(get(pads$sysID[pads$id==qds], envir=.GlobalEnv)) #get a local copy in this environment
27 | } else {
28 | print("PADS didn't exist. Check to see if you start up R script in RApache ran correctly.")
29 | }
30 | }
31 |
32 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
33 | ## and aggregate the columns listed in "on"; on can also be a single field
34 |
35 | getResults <- function(dt, byVar, onVar) {
36 | e <- substitute(onVar)
37 |   print(e)  # cat() cannot display a language object
38 | return(dt[,eval(e),by=byVar])
39 | }
40 |
41 |
42 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
43 | ## and aggregate the columns listed in "on"; on can also be a single field
44 | #getCount <- function(df, by, on) {
45 | # return(ddply(df, by, colwise(function(col) {length(unique(col))}, on)))
46 | #}
47 |
48 | getCount<- function(dt, by, on) {
49 |   lds <- dt[, .N, by=by]  # count rows per group (data.table replacement for ddply(..., "nrow"))
50 | colnames(lds) <- c(by,on)
51 | return(lds)
52 | }
53 |
54 | #For counting a column that holds 0/1 values :: very similar to getSum - the answer should be the same
55 | getBinaryCount <- function(df, by, on) {
56 | return(ddply(df, by, colwise(function(x) {length(which(x==1))}, on)))
57 | #length(which(mbDS[,"Test.Century"] ==1))
58 | }
59 |
60 | readRequest <- function() {
61 | #GetString <- str(GET)
62 | #cat(GetString)
63 | qds <<- GET$qds
64 | d <<- GET$d
65 | m <<- GET$m
66 | f <<- GET$f
67 | g <<- GET$g
68 | cat(d, "," ,m, "," ,f, "," ,qds, sep=" ")
69 | #cat("
")
70 | }
71 |
72 | processRequest <- function(fnType, dt){
73 | return(switch(fnType,
74 |                 sum = dt[, sum(get(m), na.rm=TRUE), by=d],
75 |                 count = getCount(dt, d, m),
76 | 'default'
77 | )
78 | )
79 | }
80 |
81 |
82 | library(data.table)
83 |
84 | mbDT <- data.table(mbDS)
85 |
86 | myfunction = function(dt, expr) {
87 | e = substitute(expr)
88 | dt[,eval(e),by=Species]
89 | }
90 |
91 | d <- quote(list(Opposition, Toss))
92 |
93 | setkey(mbDT,Opposition)
94 | #m <- quote(Test.Runs)
95 | #mbDT[Opposition=="England" ,eval(m)]
96 | #m <- quote(sum(Test.Runs))
97 | #mbDT[Opposition=="England" ,eval(m)]
98 |
99 | #m <- quote(list(T=sum(Test.Runs)))
100 | #mbDT[Opposition=="England" ,eval(m)]
101 |
102 | #m <- quote(list(Test.Runs=sum(Test.Runs)))
103 | #mbDT[Opposition=="England" ,eval(m)]
104 |
105 | #m <- quote(list(Test.Runs=sum(Test.Runs)))
106 | #mbDT[,eval(m), by=d]
107 |
108 |
109 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
110 | ## and aggregate the columns listed in "on"; on can also be a single field
111 | getSum <- function(dt, onVar, byVar) {
112 | e <- substitute(onVar)
113 | j <- eval(e)
114 | print(j)
115 | dt[,eval(j),by=byVar]
116 | }
117 |
118 | m <- "Test.Runs"
119 | d<- "Opposition"
120 | #expr <- substitute(list(m=sum(m, na.rm=TRUE)))
121 | expr <- paste("list(",m,"=sum(",m,",na.rm=TRUE))", sep="")
122 | getSum(mbDT, paste("sum(",m,")", sep=""), d)
123 |
124 |
125 | writeResponse <- function(responseDF, df) {
126 | sJSON <- paste('{"', m, '":', toJSON(responseDF[,m]), sep="") #JSON String is not closed.
127 |
128 | if(!is.null(g) && g == "t") {
129 | latName <- paste(m, ".Lat", sep="")
130 | lngName <- paste(m, ".Lng", sep="")
131 | sJSON <- paste(sJSON, ', "responseJSON": [', sep="")
132 |     JSON <- with(head(df), paste('{"d":"', responseDF[m],
133 |                                  '", "center": new google.maps.LatLng(',
134 |                                  df[latName][df[m]==responseDF[m]][1], ',', df[lngName][df[m]==responseDF[m]][1],
135 | '), "data":', mbDS$Test.Runs,
136 | '},', sep=""))
137 |
138 | } else
139 | sJSON = paste(sJSON, "}", sep="")
140 | #Write response back
141 | cat(sJSON)
142 | #cat(ls())
143 | rm(list=ls()) #Clean up everything - There has to be a better way than this.
144 | #cat(ls())
145 | }
146 |
147 | writeError <- function() {
148 |   cat(paste('{ "Error":"An Error Occurred :-( "}', sep=""))  # write the error response back
149 | }
150 |
151 | #Step 1 - Read Request, query string
152 | readRequest()
153 |
154 | #Step 2 - load environment
155 | df <- loadDS(qds)
156 |
157 | if(!is.null(df))
158 | {
159 | dt <- data.table(df[order(df[,d]),])
160 | #Step 3 - Process Request
161 |   responseDF <- processRequest(f, dt)
162 | if(!is.null(responseDF)) {
163 | #Step 4 - Write Response
164 |     writeResponse(responseDF, df)
165 | }
166 | else {
167 | writeError();
168 | }
169 | } else {
170 | writeError();
171 | }
172 |
173 |
174 |
--------------------------------------------------------------------------------
/learning/TransformMasterBlasterData.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga")
9 | library("chron")
10 | library("plyr")
11 | library("RJSONIO")
12 | #Get Unique ID
13 | dsID <- paste("pad", substr(as.character(unclass(Sys.time())), 12,16), sep="")
14 | out.file.csv <- paste("Alto/pads/data/",dsID, ".csv", sep="")
15 | out.file.JSON <- paste("Alto/pads/meta/",dsID, ".JSON", sep="")
16 | out.file.R.dataframes <- paste("Alto/pads/cache/", dsID, ".RData", sep="")
17 | in.file.name <- "Alto/datasets/masterblaster/SachinTestRecords.csv"
18 |
19 |
20 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December")
21 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
22 |
23 | # Use Google's Geocoding service and geo code the data
24 | getGeoCode <- function(gcStr, cnPre)
25 | {
26 | library("RJSONIO")
27 | gcStr <- gsub(' ','%20',gcStr)
28 | connectStr <- paste('http://maps.google.com/maps/api/geocode/json?sensor=false&address=',gcStr, sep="")
29 | #print(connectStr)
30 | con <- url(connectStr)
31 | data.json <- fromJSON(paste(readLines(con), collapse=""))
32 | close(con)
33 | data.json <- unlist(data.json)
34 | lat <- data.json["results.geometry.location.lat"]
35 | lng <- data.json["results.geometry.location.lng"]
36 | gcodes <- c(lat, lng)
37 | names(gcodes) <- c(paste(cnPre,".Lat",sep=""),paste(cnPre, ".Lng", sep=""))
38 | return (gcodes)
39 | }
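# --- Added illustration, not part of the original script ---
# getGeoCode() returns a named character vector "<prefix>.Lat" / "<prefix>.Lng".
# Commented out because it calls Google's geocoding web service; the address is made up:
# getGeoCode("Mumbai, India", "Ground")   # c(Ground.Lat="...", Ground.Lng="...")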
40 |
41 | # Read SachinTestRecords.csv
42 | readDataSet <- function(file.name) {
43 | return(
44 | read.csv(file.name, na.strings="-", as.is=TRUE, header=TRUE,
45 | stringsAsFactors=FALSE, strip.white=TRUE)
46 | )
47 | }
48 |
49 | # Transform raw dataset
50 | transformDataSet <- function(df) {
51 | tdf <- mutate(df,
52 | StartDate=as.Date(StartDate, "%m/%d/%Y"),
53 | Day=factor(weekdays(StartDate), levels=lDays, ordered=TRUE),
54 | Month=factor(months(StartDate), levels=lMonths, ordered=TRUE),
55 | Year=years(StartDate),
56 | Ing1_Runs=as.numeric(gsub('\\*', '', Bat1)),
57 | Ing1_NotOut=ifelse(is.na(Bat1), 0, ifelse(grepl("\\*", Bat1),1,0)),
58 | Ing2_Runs=as.numeric(gsub('\\*', '', Bat2)),
59 | Ing2_NotOut=as.factor(ifelse(grepl("\\*", Bat2),1,0)),
60 | Test_Runs=as.numeric(Runs),
61 | Opposition=as.factor(Opposition),
62 | Ground=as.factor(Ground),
63 | Result=as.factor(Result),
64 | Toss=as.factor(Toss),
65 | Country=as.factor(Country),
66 | Ing1_Century=ifelse(is.na(Ing1_Runs), 0, ifelse(Ing1_Runs>=100, 1, 0)),
67 | Ing2_Century=ifelse(is.na(Ing2_Runs), 0, ifelse(Ing2_Runs>=100, 1, 0)),
68 | Test_Century=ifelse((Ing1_Century==1)|(Ing2_Century==1), 1, 0),
69 | Test_DCentury=ifelse((!is.na(Ing1_Runs) & Ing1_Runs >=200)|(!is.na(Ing2_Runs) & Ing2_Runs>=200), 1, 0),
70 | Home_Away=ifelse(!is.na(Country) & Country=="India", "Home", "Away"),
71 | Ground_Country=as.factor(paste(Ground,Country, sep=","))
72 | )
73 | head(tdf)
74 | return(with(tdf, data.frame(TestNo, StartDate, Day, Month,
75 | Year, Ing1_Runs, Ing1_NotOut, Ing2_Runs, Ing2_NotOut, Test_Runs,
76 | Opposition, Ground, Toss, Result,
77 | Ing1_Century, Ing2_Century,
78 | Test_Century, Test_DCentury, Home_Away, Country,
79 | Ground_Country,
80 | laply(Opposition, function(x){getGeoCode(x, "Opposition")}),
81 | laply(Ground_Country, function(x){getGeoCode(x, "Ground")}))
82 | ))
83 | }
84 |
85 | writeTransformedDS <- function(df, file.name) {
86 | write.csv(df, file.name)
87 | }
88 |
89 | writeJSON <- function(df, file.JSON, id) {
90 |
91 | jsonSt <- paste('{
92 | "datasetName": "SachinTestRecords",
93 | "id": "', id, '",
94 | "dList": {
95 | "Year":', toJSON(levels(df$Year)), ',
96 | "Month":', toJSON(levels(df$Month)), ',
97 | "Day":', toJSON(levels(df$Day)), ',
98 | "Opposition":', toJSON(levels(df$Opposition)), ',
99 | "Ground":', toJSON(levels(df$Ground)), ',
100 | "Result":', toJSON(levels(df$Result)), ',
101 | "Toss":', toJSON(levels(df$Toss)),',
102 | "HomeOrAway":', toJSON(levels(df$Home_Away)),
103 | '},
104 | "mList": {
105 | "Ing1_Runs":"",
106 | "Ing2_Runs":"",
107 | "Test_Runs":"",
108 | "Ing1_Century":"",
109 | "Ing2_Century":"",
110 | "Test_Century":""
111 | },
112 | "dListType": {
113 | "Opposition": "Geo",
114 | "Ground": "Geo"
115 | },
116 | "source":"CricInfo"
117 | }')
118 | file.out <- file(file.JSON, 'wt')
119 | cat(jsonSt, file=file.out, fill = TRUE)
120 | close(file.out)
121 | }
122 |
123 | # Read Master Blaster Raw File and Transform it
124 | mbDS <- transformDataSet(readDataSet(in.file.name))
125 |
126 | #Write Transformed data set to a CSV file
127 | writeTransformedDS(mbDS, out.file.csv)
128 |
129 | #Generate JSON with m and d
130 | writeJSON(mbDS, out.file.JSON, dsID)
131 |
132 | #Save transformed dataset as data.frame for later reading
133 | assign(dsID, mbDS)
134 | names <- c(eval(dsID))
135 | save(list=names, file=out.file.R.dataframes)
136 | pads <- rbind(pads, data.frame(id=dsID, title="Master Blaster - Sachin Tendulkar(1940-2008)",
137 | subtitle="This data set explores Sachin Tendulkar's glorious test cricket record.",
138 | records=nrow(mbDS), analyzed=0, stories=0, json=out.file.JSON, data=out.file.csv,
139 | cache=out.file.R.dataframes, source="CricInfo")
140 | )
141 | #save(mbDS, file=out.file.R.dataframes)
142 |
--------------------------------------------------------------------------------
/MongoFns.R:
--------------------------------------------------------------------------------
1 | # Interact with MongoDB
2 | # Author: Jitender Aswani, Co-Founder @datadolph.in
3 | # Date: 3/15/2013
4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
6 | # Packages Used: rmongodb
7 | # All rights reserved.
8 |
9 | require("rmongodb")
10 |
11 | mongo.db <- list(user="", pass="", name="ddfin_dev", system.pads="ddfin_dev.system_pads", host="localhost")
12 |
13 | #
14 | #initiate a connection to admin database
15 | #
16 | getDefaultMongoDBCon <- function() {
17 | mongo <- mongo.create()
18 | if (!mongo.is.connected(mongo)) {
19 | print("Unable to connect. Error code:")
20 | print(mongo.get.err(mongo))
21 | } else
22 | return(mongo)
23 | }
24 |
25 | #
26 | #initiate a connection to a db
27 | # This will create a db if it doesn't exist
28 | #
29 | getMongoDBCon <- function(db.name) {
30 | mongo <- mongo.create(db=db.name)
31 | if (!mongo.is.connected(mongo)) {
32 | cat("Unable to connect. Error code:", "\n")
33 | cat(mongo.get.err(mongo), "\n")
34 | } else
35 | return(mongo)
36 | }
37 |
38 | #
39 | #disconnect
40 | #
41 | disconnectMongoDB <- function(mongo) {
42 | #Close Connection
43 | if (mongo.is.connected(mongo)) {
44 | mongo.destroy(mongo)
45 | }
46 | }
47 |
48 | #
49 | #list all databases in Mongo
50 | #
51 | listAllDBs <- function() {
52 | #List dbs
53 | mongo <- getDefaultMongoDBCon()
54 | on.exit(disconnectMongoDB(mongo)) # On function exit, close connection
55 | cat(mongo.get.databases(mongo), "\n")
56 | }
57 |
58 | #
59 | # drop a db
60 | #
61 | dropDB <- function(db.name) {
62 | mongo <- getDefaultMongoDBCon()
63 | on.exit(disconnectMongoDB(mongo)) # On function exit, close connection
64 | cat(mongo.drop.database(mongo, db.name))
65 | }
66 |
67 | #
68 | #List collections for a given db
69 | #
70 | listAllCollections <- function(db.name) {
71 | mongo <- getDefaultMongoDBCon()
72 | on.exit(disconnectMongoDB(mongo)) # On function exit, close connection
73 | cat(mongo.get.database.collections(mongo, db.name), "\n")
74 | }
75 | #listAllCollections(mongo.db$name)
76 |
77 | #
78 | # drop a collection
79 | #
80 | dropCollection <- function(db.collection){
81 | mongo <- getDefaultMongoDBCon()
82 | on.exit(disconnectMongoDB(mongo))
83 |
84 | cat(mongo.drop(mongo, db.collection), "\n")
85 | }
86 | #dropCollection(mongo.db$system.pads)
87 |
88 | #
89 | # empty a collection
90 | #
91 | emptyCollection <- function(db.collection){
92 | mongo <- getDefaultMongoDBCon()
93 | on.exit(disconnectMongoDB(mongo))
94 |
95 | cat(mongo.remove(mongo, db.collection), "\n")
96 | }
97 | #emptyCollection(mongo.db$system.pads)
98 |
99 |
100 | #
101 | # insert systemPads
102 | #
103 | insertPadDoc <- function(doc){
104 |
105 | if(!is.null(mongo)){
106 | status <- mongo.insert(mongo, mongo.db$system.pads, doc)
107 | if(status){
108 | if(verbose) cat("pad successfully inserted", "\n")
109 | } else {
110 | err <- mongo.get.last.err(mongo, mongo.db$name)
111 | cat(mongo.get.server.err(mongo), "\n")
112 | cat(mongo.get.server.err.string(mongo), "\n")
113 | }
114 | } else {
115 | cat("Connection is null", "\n")
116 | }
117 | mongo.bson.destroy(doc)
118 | }
119 |
120 | #
121 | # update systemPads
122 | #
123 | updatePadDoc <- function(pad.id, doc){
124 | if(!is.null(mongo)){
125 | buf <- mongo.bson.buffer.create()
126 | mongo.bson.buffer.append(buf, "_id", pad.id)
127 | criteria <- mongo.bson.from.buffer(buf)
128 |
129 | status <- mongo.update(mongo, mongo.db$system.pads, criteria, doc)
130 | if(status){
131 | if(verbose) cat("pad successfully updated", "\n")
132 | } else {
133 | err <- mongo.get.last.err(mongo, mongo.db$name)
134 | cat(mongo.get.server.err(mongo), "\n")
135 | cat(mongo.get.server.err.string(mongo), "\n")
136 | }
137 | } else {
138 | cat("Connection is null", "\n")
139 | }
140 | mongo.bson.destroy(doc)
141 | }
142 |
143 | #
144 | #insert a pad into system_pads
145 | # connection object is supplied
146 | #
147 | insertPadToMongo <- function(pad.id, pad){
148 | # Insert document to collection 'system_pads'
149 | doc <- mongo.bson.from.list(list(
150 | "_id"=pad.id,
151 | pad=pad))
152 |
153 | if(verbose) cat("calling insert for pad ", pad.id, "\n")
154 | insertPadDoc(doc)
155 | }
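# --- Added usage sketch, not part of the original script ---
# insertPadToMongo() wraps a pad list in a BSON document keyed by "_id" and inserts it.
# Hypothetical example; assumes the global `mongo` connection and `verbose` flag that
# insertPadDoc() expects, plus a running MongoDB instance:
# mongo <- getMongoDBCon(mongo.db$name); verbose <- TRUE
# insertPadToMongo("pad-demo-001", list(title="Demo pad", records=10))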
156 |
157 | #
158 | # get a single system pad
159 | #
160 | getSingleSystemPad <- function(padId) {
161 |
162 | mongo <- getDefaultMongoDBCon()
163 | on.exit(disconnectMongoDB(mongo))
164 |
165 | buf <- mongo.bson.buffer.create()
166 | mongo.bson.buffer.append(buf, "_id", padId)
167 | query <- mongo.bson.from.buffer(buf)
168 |
169 |   # Find the pad matching this id in collection system_pads
170 | cursor <- mongo.find(mongo, mongo.db$system.pads, query)
171 |
172 |   # Step through the matching records
173 | while (mongo.cursor.next(cursor))
174 | l<- mongo.bson.to.list(mongo.cursor.value(cursor))
175 | #assign("l", l, envir=.GlobalEnv)
176 | #l$pad$default$type = "timeseries"
177 | #updatePadDoc(l$'_id', mongo.bson.from.list(l))
178 | mongo.cursor.destroy(cursor)
179 | }
180 | #getSingleSystemPad("pad136971841842064")
181 |
182 | #
183 | #get pads from system_pads
184 | #
185 | getSystemPads <- function() {
186 |
187 | mongo <- getDefaultMongoDBCon()
188 | on.exit(disconnectMongoDB(mongo))
189 |
190 | # Find the first 100 records in collection system_pads
191 | cursor <- mongo.find(mongo, mongo.db$system.pads, limit=100L)
192 |
193 |   # Step through the matching records and display them
194 | while (mongo.cursor.next(cursor))
195 | print(mongo.cursor.value(cursor))
196 | mongo.cursor.destroy(cursor)
197 | }
198 | #getSystemPads()
199 |
200 | #
201 | #get pads count from system_pads
202 | #
203 | getSystemPadsCount <- function(){
204 | mongo <- getDefaultMongoDBCon()
205 | on.exit(disconnectMongoDB(mongo))
206 |
207 |   # Count the number of pad documents
208 |   # stored in collection system_pads
209 | pads.count <- mongo.count(mongo, mongo.db$system.pads)
210 | cat("total pads: ", pads.count, "\n")
211 | }
212 | #getSystemPadsCount()
213 |
--------------------------------------------------------------------------------
/r_solr_integration.R:
--------------------------------------------------------------------------------
1 | # Author: Rajani Aswani Co-Founder @datadolph.in
2 | # Author: Jitender Aswani
3 | # Date: 4/15/2013
4 | # Description: R & Solr Integration Using Solr's REST APIs
5 | # Packages Used: RCurl, RJSONIO
6 | # Blog Reference: http://www.r-bloggers.com/updated-sentiment-analysis-and-a-word-cloud-for-netflix-the-r-way/
7 | # Download
8 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
9 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
10 | # All rights reserved.
11 |
12 | #
13 | # Load Library
14 | #
15 | require(RCurl)
16 | require(RJSONIO)
17 |
18 | # define solr server address
19 | solrServer <- "http://localhost:8080/solr-dev/"
20 |
21 | #
22 | # get row-wise JSON
23 | #
24 | getRowWiseJson <- function (jsonDT) {
25 | require("RJSONIO")
26 | row.json <- apply(jsonDT, 1, toJSON)
27 | json.st <- paste('[', paste(row.json, collapse=', '), ']')
28 | return (json.st)
29 | }
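#
# Example (sketch): getRowWiseJson() emits one JSON object per data.frame row,
# the shape Solr's /update/json handler expects. Toy data, illustrative only.
#
if(F){
  df <- data.frame(id=c("pad1", "pad2"),
                   title=c("GDP, India", "GDP, Brazil"),
                   stringsAsFactors=FALSE)
  cat(getRowWiseJson(df), "\n")
  # roughly: [ {"id": "pad1", "title": "GDP, India"}, {"id": "pad2", "title": "GDP, Brazil"} ]
}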
30 |
31 | getSolrServer <- function() {
32 | return(solrServer)
33 | }
34 |
35 | getUpdateURL <- function(){
36 | return(paste(getSolrServer(), "update/json?commit=true&wt=json", sep=""))
37 | }
38 |
39 | getCommitURL <- function() {
40 | return(paste(getSolrServer(),"update?commit=true&wt=json", sep=""))
41 | }
42 |
43 | getQueryURL <- function() {
44 | return(paste(getSolrServer(),"select?fl=id&wt=json&indent=true&fq=type:pad&q=", sep=""))
45 | }
46 |
47 | #
48 | # query a field for the text and return docs
49 | #
50 | querySolr <- function(queryText, queryfield="all") {
51 | response <- fromJSON(getURL(paste(getQueryURL(), queryfield, ":", queryText, sep="")))
52 | if(!response$responseHeader$status) #if 0
53 | return(response$response$docs)
54 | }
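#
# Example (sketch): the GET request querySolr("cricket", "tags") builds, given
# the URL helpers above ("cricket" is an illustrative search term only).
#
if(F){
  paste(getQueryURL(), "tags", ":", "cricket", sep="")
  # "http://localhost:8080/solr-dev/select?fl=id&wt=json&indent=true&fq=type:pad&q=tags:cricket"
}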
55 |
56 | #
57 | # delete all indexes from solr server
58 | #
59 | deleteAllIndexes <-function() {
60 | response <- postForm(getUpdateURL(),
61 | .opts = list(postfields = '{"delete": {"query":"*:*"}}',
62 | httpheader = c('Content-Type' = 'application/json',
63 | Accept = 'application/json'),
64 | ssl.verifypeer=FALSE
65 | )
66 | ) #end of PostForm
67 | return(fromJSON(response)$responseHeader[1])
68 | }
69 |
70 | #
71 | # delete all indexes for a document type from solr server
72 | # in this example : type = sports
73 | #
74 | deleteSportsIndexes <-function() {
75 | response <- postForm(getUpdateURL(),
76 | .opts = list(postfields = '{"delete": {"query":"type:sports"}}',
77 | httpheader = c('Content-Type' = 'application/json',
78 | Accept = 'application/json'),
79 | ssl.verifypeer=FALSE
80 | )
81 | ) #end of PostForm
82 | return(fromJSON(response)$responseHeader[1])
83 | }
84 |
85 | #
86 | # delete indexes for a given category within the sports type from solr server
87 | # in this example : type = sports and category: basketball
88 | #
89 | deleteSportsIndexesForCat <-function(category) {
90 | response <- postForm(getUpdateURL(),
91 | .opts = list(postfields =
92 | paste('{"delete": {"query":"type:sports AND category:', category, '"}}', sep=""),
93 | httpheader = c('Content-Type' = 'application/json',
94 | Accept = 'application/json'),
95 | ssl.verifypeer=FALSE
96 | )
97 | ) #end of PostForm
98 | return(fromJSON(response)$responseHeader[1])
99 | }
100 | #deleteSportsIndexesForCat("basketball")
101 |
102 |
103 | #
104 | #Post a new document to Solr
105 | #
106 | postDoc <- function(doc) {
107 | solr_update_url <- getUpdateURL()
108 | jsonst <- toJSON(list(doc))
109 |
110 | response <- postForm(solr_update_url,
111 | .opts = list(postfields = jsonst,
112 | httpheader = c('Content-Type' = 'application/json',
113 | Accept = 'application/json'),
114 | ssl.verifypeer=FALSE
115 | )) #end of PostForm
116 | return(fromJSON(response)$responseHeader[1])
117 | ########## Commit - only if it doesn't work the other way ###############
118 | #return(fromJSON(getURL(getCommitURL())))
119 | }
120 |
121 | #
122 | #Post JSON document to Solr
123 | #
124 | postJSON <- function(jsonst) { #pad.meta.data (i.e. pmd is sent)
125 | solr_update_url <- getUpdateURL()
126 |
127 | response <- try(postForm(solr_update_url,
128 | .opts = list(postfields = jsonst,
129 | httpheader = c('Content-Type' = 'application/json',
130 | Accept = 'application/json'),
131 | ssl.verifypeer=FALSE
132 | )), silent=T) #end of PostForm
133 | if(class(response) %in% c("try-error")){
134 | print(response)
135 | } else {
136 | print(response)
137 | return(fromJSON(response)$responseHeader[1])
138 | }
139 | ########## Commit - only if it doesn't work the other way ###############
140 | #return(fromJSON(getURL(getCommitURL())))
141 | }
142 |
143 | #
144 | #test it out
145 | #
146 | TestAll <- function() {
147 | status <- postJSON(toJSON(fromJSON(paste(readLines("./pads/solr-docs-IPL-T20.json"), collapse="\n"))))
148 |
149 | if(status) print("An error occurred creating indexes.")
150 |
151 | #delete test
152 | status <- deleteAllIndexes() # Status of 0 means that the operation was successfully completed.
153 | if(status) print("An error occurred while deleting all indexes.")
154 |
155 | #add doc test
156 | status <- postDoc(doc) # Status of 0 means that the operation was successfully completed.
157 | if(status) print("An error occurred while adding a document.")
158 |
159 |
160 |   #Query Solr
161 | docs <- querySolr("baseball", "tags")
162 | print(docs)
163 |
164 |   docs <- querySolr("baseball") #if no field is defined, the default field all_text will be searched
165 | print(docs)
166 | }
167 |
--------------------------------------------------------------------------------
/GenerateWDIPADS.R:
--------------------------------------------------------------------------------
1 | # Generate PADS from WDI data
2 | # Get series list and the associated meta data from the database
3 | #
4 | # Author: Jitender Aswani, Co-Founder @datadolph.in
5 | # Date: 3/15/2013
6 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
7 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
8 | # All rights reserved.
9 |
10 | setwd("/Users/homemac/R")
11 | source("CreatePADS.R")
12 | #
13 | #initiate a connection to wdi database
14 | #
15 | getWDIDBCon <- function(multi.statements=FALSE) {
16 | if(multi.statements)
17 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost",
18 | client.flag=CLIENT_MULTI_STATEMENTS))
19 | else
20 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost"))
21 | }
22 |
23 | #
24 | # get data for a given series and country
25 | #
26 |
27 | getSeriesData <- function(series.name, series.country){
28 | con <- getWDIDBCon()
29 | on.exit(disconnectDB(con)) # On function exit, close connection
30 |
31 | sql <- sprintf("select * from wdi.wdi_data where Indicator_Code = '%s' and Country_Name ='%s';", series.name, series.country)
32 | print(sql)
33 | res <- dbSendQuery(con, sql)
34 | series.data <- fetch(res, n = -1)
35 | dbClearResult(res)
36 | if(!is.null(series.data) && ncol(series.data) > 4)
37 | return(series.data[,5:ncol(series.data)])
38 | else
39 | return(NULL)
40 | }
41 | #series.data <- getSeriesData('AG.AGR.TRAC.NO', 'C')
42 |
43 |
44 | padifyWDI <- function(){
45 | pads <- getSystemPads()
46 | pads.cat <- getCategories()
47 | pads.subcat <- getSubcategories()
48 | solr.index.docs <- list()
49 | con <- getWDIDBCon()
50 | #read wdi data
51 | wdi.data <- dbReadTable(con, "wdi_data")
52 |   #read series meta data - first four columns only
53 | wdi.series <- dbReadTable(con, "wdi_series")[,1:4]
54 |   #read category meta data - maps each series to its category and subcategory
55 | wdi.cats <- dbReadTable(con, "wdi_categories")
56 |
57 | series <- list()
58 | series["source"] <- "World Bank"
59 |
60 | for(i in 1:nrow(wdi.series)){
61 | if(i > 39 && i < 46){
62 | series["name"] <- wdi.series$SeriesCode[i]
63 | series["title"] <- wdi.series$Indicator_Name[i]
64 | series["longdef"] <- wdi.series$Long_definition[i]
65 | series["shortdef"] <- wdi.series$Short_definition[i]
66 | wdi.cats.matched <- wdi.cats[wdi.cats$SeriesCode==series$name,]
67 | series["category"] <- wdi.cats.matched$category_name
68 | series["subcategory"] <- wdi.cats.matched$subcategory_name
69 | series["category_id"]<- wdi.cats.matched$category_id
70 | series["subcategory_id"]<- wdi.cats.matched$subcategory_id
71 | #
72 | logMessage(paste("Starting loading series", i, series$name, sep=":"))
73 | print(paste("Starting loading series", i, series$name, sep=":"))
74 |
75 |       #skip a row when NAs make up 90% or more of its columns
76 | all.series.data <- wdi.data[wdi.data$Indicator_Code==series$name,]
77 | for(j in 1:nrow(all.series.data)){
78 | series.data <- all.series.data[j,]
79 | if(!is.null(series.data) &&
80 | ncol(series.data) > 4 &&
81 | sum(is.na(series.data)) < .9 * ncol(series.data)) {
82 |
83 | series["country"] <- series.data$Country_Name
84 |
85 | logMessage(paste("Starting country", series$country, sep=":"))
86 | print(paste("Starting country", series$country, sep=":"))
87 |
88 |
89 | series.data <- t(series.data[,5:ncol(series.data)])
90 | #generate data.frame
91 | series.data <- data.frame(gsub("X", "", rownames(series.data)),
92 | series.data[,1],
93 | row.names=seq(1:nrow(series.data)), stringsAsFactors=FALSE)
94 | #change colnames
95 | colnames(series.data) <- c("date", paste(series$name, series$country, sep="."))
96 |
97 | #Remove NA from series column
98 | series.data <- series.data[!is.na(series.data[2]),]
99 | #Create pad and get pad meta data
100 | pmd <- createPAD(series.data, paste(series$title, series$country, sep=", "),
101 | series$longdef, series$category, series$subcategory,
102 | series$source, paste(series$name, series$country, sep="."),
103 | tolower(paste(series$category, series$subcategory, series$source, series$country, sep=",")))
104 |
105 |
106 | pads <- rbind(pads, list(pmd$id, pmd$title, pmd$desc, pmd$records, pmd$columns, series$category_id,
107 | series$subcategory_id, pmd$analyzed, pmd$stories, pmd$src, pmd$src_file, "", "", pmd$tags))
108 | #
109 | #generate solr doc
110 | #
111 | doc <- list("type"="pad", "id" = pmd$id, "title" = pmd$title, "desc"=pmd$desc, "category"=pmd$category,
112 | "subcategory"=pmd$subcategory, "tags"=pmd$tags)
113 | solr.index.docs[[length(solr.index.docs)+1]] <- doc
114 |
115 | } else {
116 | logMessage(paste(i, series$name, "was empty for ", series$country, sep=":"))
117 | print("This series was empty.")
118 | }
119 | }# inner for loop ends here
120 | }
121 | }# for loop for series ends here
122 |
123 | # now save solr doc
124 | generateSolrDocFromAllDocs(solr.index.docs)
125 |
126 | # Save pads to a csv file
127 | saveData(pads, "./pads/system_pads.csv")
128 |
129 | #now save pads to db
130 | scon <- getDBCon()
131 | dbRemoveTable(scon, "system_pads")
132 | dbWriteTable(scon, name="system_pads", value=pads, row.names = F, overwrite = T)
133 | disconnectDB(scon)
134 |
135 | dumpLogs()
136 | }
137 |
138 | padifyWDI()
139 |
140 |
141 | #use extreme caution when running this function - it removes all meta, cache and data files and empties the system_pads table
142 | deleteEverything <- function(){
143 | unlink("./pads/meta/*.*")
144 | unlink("./pads/cache/*.*")
145 | unlink("./pads/data/*.*")
146 | emptySystemPads()
147 | unlink("./pads/solr-docs.json")
148 | }
149 | #read wdi data
150 | #wdi.data <- dbReadTable(getWDIDBCon(), "wdi_data")[1:10,]
151 | #wdi.data.i <- t(wdi.data)
152 |
--------------------------------------------------------------------------------
/PresedentialElectionsResults.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 2012-9-9
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # Description: Financial Markets and President Elect: Do Financial Markets Favor A Republican Over A Democrat?
6 | # Packages Used: RCurl, RJSONIO
7 | # Blog Reference: http://allthingsbusinessanalytics.blogspot.com/2012/11/financial-markets-and-president-elect.html
8 | # All rights reserved.
9 |
10 | require(XML)
11 | require(plyr)
12 | require(quantmod)
13 |
14 | setwd("C:/mydir")
15 |
16 | #######################################################################
17 | # Get SPY data for US election dates
18 | #######################################################################
19 |
20 | #Presidential elections fall on the first Tuesday after the first Monday of November, every 4 years
21 | election.years <- seq(1952,2012, by=4)
22 | election.months <- seq(as.Date("1952/11/1"), by="4 year", length.out=16)
23 | election.dates <- election.months + ((8-as.POSIXlt(election.months)$wday) %% 7) + 1 # handles years where Nov 1 is itself a Tuesday
24 | election.dates.1dl <- election.dates + 1
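#
# Quick check (sketch) of the offset arithmetic: in 1988, Nov 1 was itself a
# Tuesday, so election day fell a full week later, on Nov 8.
#
if(F){
  nov1 <- as.Date("1988/11/1")
  nov1 + ((8 - as.POSIXlt(nov1)$wday) %% 7) + 1  # "1988-11-08"
}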
25 |
26 | getSymbols("^GSPC",src="yahoo", from="1950-1-1")
27 | colnames(GSPC) <- c("Open", "High", "LOW", "Close", "Volume", "Adjusted.Close")
28 | write.zoo(GSPC, file="pads/data/financials/SP500_historical_OHLC_data.csv", sep=",")
29 |
30 | #Generate Monthly Returns for 2008 and save them as a file
31 | write.zoo(monthlyReturn(GSPC["2008"]), file="pads/data/financials/SP500_2008_monthly_returns.csv", sep=",")
32 |
33 |
34 | # only concerned about close
35 | SPY.close <- Ad(GSPC)
36 |
37 | #system("cat pads/data/financials/SP500_historical_OHLC_data.csv")
38 | spy.daily.returns <- dailyReturn(SPY.close)
39 | #one.day.return <- subset(spy.daily.returns, index(spy.daily.returns) %in% election.dates.1daylater)
40 | #one.day.return$year <- format(index(one.day.return), "%Y")
41 |
42 | getQuote <- function(year, SPY)
43 | {
44 | #Determine election date
45 | election.month <- as.Date(paste(year, "11/1", sep="/"))
46 |   election.date <- election.month + ((8-as.POSIXlt(election.month)$wday) %% 7) + 1
47 | #election.date <- election.dates[which(format(election.dates, "%Y")==year)]
48 |
49 |   #Get dates 1 day, 1 week, 4 weeks, 12 weeks, 52 weeks and one full term after the election day
50 | quote.dates <- c(election.date, election.date+1, election.date+7,
51 | election.date+(4*7), election.date+(12*7), election.date+(52*7), election.date+(50*7*4))
52 | if(year == 2008)
53 | quote.dates[length(quote.dates)] <- Sys.Date()
54 |
55 | # Check to see if the dates exist in SPY index, if not advance or lag by 1 day
56 | quote.dates <- as.Date(sapply(quote.dates, function(x)
57 | if(length(SPY.close[x])){return(as.Date(x))}else{ return(as.Date(x)-1)}))
58 |
59 | #Get the quotes for all these dates
60 | quotes <- SPY[quote.dates]
61 | returns <- sapply(quote.dates,
62 | function(x) (as.numeric(quotes[x])/as.numeric(quotes[quote.dates[1]]) - 1)*100)
63 |   #return the vector of post-election returns for this election year
64 |   return(returns)
65 | }
66 |
67 | #Remove 2012 from this vector
68 | election.years <- election.years[!election.years==2012]
69 | returns <- do.call(cbind, llply(election.years, function(x) getQuote(x, SPY.close)))
70 | colnames(returns) <- election.years
71 | #Remove first row
72 | returns <- returns[-1,]
73 | t.returns <- t(returns)
74 | old.rownames <- rownames(t.returns)
75 | obama.1d.return <- as.numeric(spy.daily.returns["2012-11-07", 1]) * 100
76 | t.returns <- rbind(t.returns, c(obama.1d.return, NA, NA, NA, NA, NA))
77 | rownames(t.returns) <- c(old.rownames, 2012)
78 | colnames(t.returns) <- c("1_day_return", "1_week_return", "4_weeks_return", "12_weeks_return",
79 | "52_weeks_return", "term_return")
80 | write.csv(t.returns, "pads/data/history/SP500 returns after US president elections_1.csv", row.names = TRUE)
81 |
82 | #######################################################################
83 | # Get Presidential Names
84 | #######################################################################
85 |
86 | wiki <- "http://en.wikipedia.org/wiki/United_States_presidential_election"
87 | tables <- readHTMLTable(wiki, stringsAsFactors=FALSE)
88 | #print(names(tables))
89 | wiki.table <- tables[[4]]
90 | #Create a new column by removing all the special chars from the winner column
91 | wiki.table$sub <- gsub("\\*", "", wiki.table$Winner)
92 | #Replace all meta chars
93 | #data$sub <- gsub("[^A-Za-z0-9 ]+", ",", data$sub)
94 | wiki.table$sub <- gsub("\\(|\\)", ",", wiki.table$sub)
95 | #Now split and create a new dataframe
96 | us.prez.elect <- data.frame(do.call("rbind", strsplit(wiki.table$sub, ",", fixed=TRUE)))
97 | drops <- c("X3","X4")
98 | us.prez.elect <- us.prez.elect[,!(names(us.prez.elect) %in% drops)]
99 | election_years <- gsub("[^A-Za-z0-9]+", "", wiki.table$"Election year")
100 | election_years[1] <- 1788
101 | election_years[20] <- 1864
102 | colnames(us.prez.elect) <- c("president", "party")
103 | rownames(us.prez.elect) <- election_years
104 | election.months <- seq(as.Date("1788/11/1"), by="4 year", length.out=(2016-1788)/4)
105 | election.dates <- election.months + ((8-as.POSIXlt(election.months)$wday) %% 7) + 1
106 | us.prez.elect$election_date <- as.Date(election.dates)
107 | write.csv(us.prez.elect, "pads/data/history/US president elects.csv", row.names = TRUE)
108 | us.prez.elect.since.1950 <- subset(us.prez.elect, rownames(us.prez.elect) %in% election.years)
109 |
110 | #
111 | # Merge the two data frames
112 | #
113 | new.col.names <- c('election_year', colnames(us.prez.elect.since.1950), colnames(t.returns))
114 | prez.data <- merge(us.prez.elect.since.1950, t.returns, by=0)
115 | colnames(prez.data) <- new.col.names
116 | write.csv(prez.data, "pads/data/history/US president elects and SP500 returns.csv", row.names = FALSE)
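#
# Note (sketch): merge(x, y, by=0) joins two data frames on their row names; the
# key comes back as a "Row.names" column, renamed to election_year above.
#
if(F){
  a <- data.frame(president=c("Eisenhower", "Kennedy"), row.names=c("1952", "1960"))
  b <- data.frame(return_1d=c(0.4, -0.2), row.names=c("1952", "1960"))
  merge(a, b, by=0)
}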
117 |
118 |
119 | #all.dates <- seq(as.Date("1952-1-1"), Sys.Date(), by="day")
120 | #election.year.dates <- subset(all.dates, years(all.dates) %in% election.years)
121 | #election.months <- subset(election.year.dates, months(election.year.dates)=="November")
122 | #elections.months.days <- subset(election.months, weekdays(election.months) %in% c("Monday", "Tuesday"))
123 | #df <- data.frame(date=elections.months.days, day=weekdays(elections.months.days))
124 | #election.years <- seq(as.Date("1952/1/1"), by="4 year", length.out=16)
125 | #http://statistics.berkeley.edu/classes/s133/dates.html
126 | #sapply(quote.dates, function(x) if(length(SPY.close[x])){cat("True\n")}else{ cat("False\n")})
127 | #x <- SPY.close[paste(election.date, election.date+(52*7), sep="/")]
128 | #d <- dailyReturn(x)
129 | #w <- weeklyReturn(x)
130 | #keeps <- c("y","a")
131 | #DF[keeps]
132 |
--------------------------------------------------------------------------------
/SentimentAnalysis.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 2012-30-1
3 | # Description: Extracts tweets from Twitter and runs sentiment analysis using the list of sentiment words from Hu and Liu
4 | # Packages Used: RCurl, XML, twitteR, RJSONIO, tm, plyr, stringr
5 | # Blog Reference: http://www.r-bloggers.com/updated-sentiment-analysis-and-a-word-cloud-for-netflix-the-r-way/
6 | # Download
7 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
8 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
9 | # All rights reserved.
10 | # Revised Sentiment Analysis using Hu & Liu's library of 6,800 negative and positive words
11 | require(tm); require(plyr); require(stringr); require(twitteR)  # packages used by the functions below
12 | #Populate the list of sentiment words from Hu and Liu (http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html)
13 | huliu.pwords <- scan(paste(SRC_DIR, 'opinion-lexicon/positive-words.txt', sep=""), what='character', comment.char=';')
14 | huliu.nwords <- scan(paste(SRC_DIR,'opinion-lexicon/negative-words.txt', sep=""), what='character', comment.char=';')
15 | # Add some extra words
16 | huliu.nwords <- c(huliu.nwords,'wtf','wait','waiting','epicfail', 'crash', 'bug', 'bugy', 'bugs', 'slow', 'lie')
17 | #Remove some words for example sap and Cloud
18 | huliu.nwords <- huliu.nwords[!huliu.nwords=='sap']
19 | huliu.nwords <- huliu.nwords[!huliu.nwords=='cloud']
20 |
21 | #
22 | # clean up a large character string
23 | #
24 | cleanText <- function(x) {
25 | # tolower
26 | x = tolower(x)
27 | # remove rt
28 | #x = gsub("rt", "", x)
29 | # remove at
30 | x = gsub("@\\w+", "", x)
31 | # remove punctuation
32 | x = gsub("[[:punct:]]", "", x)
33 | # remove control characters
34 | x <- gsub('[[:cntrl:]]', '', x)
35 | # remove numbers
36 | x = gsub("[[:digit:]]", "", x)
37 | # remove links http
38 | x = gsub("http\\w+", "", x)
39 | # remove tabs
40 | x = gsub("[ |\t]{2,}", " ", x)
41 | # remove new lines
42 | x = gsub("[ |\n]{1,}", " ", x)
43 | # remove blank spaces at the beginning
44 | x = gsub("^ ", "", x)
45 | # remove blank spaces at the end
46 | x = gsub(" $", "", x)
47 | return(x)
48 | }
49 |
50 | #
51 | # clean up tweets or any other document for corpus and sentiment analysis
52 | #
53 | cleanContent <- function(content){
54 |   # clean out non-ASCII characters, remove numbers, punctuation, stop words
55 | content <- sapply(content, function(x) iconv(x, "latin1", "ASCII", sub=""))
56 | content <- cleanText(content) # clean up
57 | # remove stop-words
58 | content <- removeWords(content,
59 | c(stopwords("english"), "twitter", "wikipedia"))
60 | return(content)
61 | }
62 | #
63 | # build a generic tag cloud
64 | #
65 | buildTagCloud <- function (content, word.threshold=2){
66 | #cleanup
67 | content <- cleanContent(content)
68 | # make corpus for text mining
69 | content.corpus <- Corpus(VectorSource(content))
70 |
71 | #build a term document
72 | #content.dtm <- TermDocumentMatrix(content.corpus, control = list(stopwords = TRUE, minWordLength = 5))
73 | content.dtm <- TermDocumentMatrix(content.corpus, control = list(minWordLength = 5))
74 | # get a matrix
75 | content.m = as.matrix(content.dtm)
76 | # get word counts in decreasing order
77 | content.words <- sort(rowSums(content.m), decreasing=TRUE)
78 | # create a data frame with words and their frequencies
79 | content.df = data.frame(text=names(content.words), size=content.words)
80 | #write.csv(content.words, "company-word-tag.csv", row.names=F)
81 | return( content.df[content.df$size > word.threshold,])
82 | }
83 |
84 | #
85 | # get sentiment score for each tweet
86 | #
87 | getSentimentScore <- function(tweets) {
88 | scores <- laply(tweets, function(singleTweet) {
89 | tweetWords <- unlist(str_split(tolower(singleTweet), '\\s+'))
90 | # compare our words to the dictionaries of positive & negative terms
91 | # match() returns the position of the matched term or NA, apply is.na to convert to boolean
92 | pos.matches <- !is.na(match(tweetWords, huliu.pwords))
93 | neg.matches <- !is.na(match(tweetWords, huliu.nwords))
94 | # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
95 | score <- sum(pos.matches) - sum(neg.matches)
96 | return(score)
97 | })
98 | return(data.frame(SentimentScore=scores, Tweet=tweets))
99 | }
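#
# Example (sketch): score two toy strings. Assumes the Hu & Liu word lists above
# are loaded and plyr/stringr are attached; the score is simply
# (# positive matches) - (# negative matches) per tweet.
#
if(F){
  getSentimentScore(c("great phone, love the camera",
                      "terrible battery and a bad screen"))
  # the first string should score +2 and the second -2 with the stock word lists
}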
100 |
101 | #
102 | #perform sentiment analysis
103 | #
104 | performSentimentAnalysis <- function(tweets){
105 | #
106 | # perform twitter sentiment analysis for each tweet
107 | #
108 | tweets <- cleanContent(tweets)
109 | # call getSentiment score
110 | ss <- getSentimentScore(tweets)
111 |
112 |   # Flag positive and negative tweets; tweets with a zero score are counted in neither
113 | ss$posTweets <- as.numeric(ss$SentimentScore >=1)
114 | ss$negTweets <- as.numeric(ss$SentimentScore <=-1)
115 |
116 | # Let's summarize now
117 | summary <- list(TweetsFetched=length(ss$SentimentScore),
118 | PositiveTweets=sum(ss$posTweets), NegativeTweets=sum(ss$negTweets),
119 | AverageScore=round(mean(ss$SentimentScore),3))
120 |
121 | # some tweets have no score - positive offsets negative - so the next line is necessary
122 | summary$TweetsWithScore <- summary$PositiveTweets + summary$NegativeTweets
123 |
124 | #Get Sentiment Score
125 | summary$SentimentScore <- round(summary$PositiveTweets/summary$TweetsWithScore, 2)
126 | return(summary)
127 | }
128 |
129 | #
130 | #search twitter using hash tags
131 | #
132 | searchTwitterHashtag <- function(tw.hashtag, certificate.path, how.many=300, what.lang="en") {
133 | tweets <- try(searchTwitter(tw.hashtag, lang=what.lang, n=how.many, cainfo=certificate.path), silent=T)
134 | if("try-error" %in% class(tweets))
135 | return(data.frame(error="Oops an error occurred"))
136 | return(tweets)
137 | }
138 |
139 | #search twitter using handle
140 | searchTwitterHandle <- function(tw.handle, certificate.path, how.many=300) {
141 | tweets <- try(userTimeline(tw.handle, n=how.many, cainfo=certificate.path), silent=T)
142 | if("try-error" %in% class(tweets))
143 | return(list(error="Oops an error occurred"))
144 | return(tweets)
145 | }
146 |
147 | #download ca cert file
148 | downloadCACertFile <- function(){
149 | download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")
150 | }
151 |
152 | # Certain words on the Hu and Liu list may not be relevant for your scenario
153 | # Remove them (via the filter.words vector, defined elsewhere) before scoring
154 | # Filter words
155 | filterWords <- function(words){
156 | if(length(which(words %in% filter.words)) > 0)
157 | words <- words[-which(words %in% filter.words)]
158 | return(words)
159 | }
160 |
161 | # example
162 | #get tweets and perform sentiment analysis
163 | tweets <- searchTwitterHandle(twitter.handle, CERTIFICATE_PATH, 300)
164 | summary <- performSentimentAnalysis(sapply(tweets, function(t) t$getText()))
165 |
166 |
--------------------------------------------------------------------------------
/GenerateCrimePADS.R:
--------------------------------------------------------------------------------
1 | # Generate crime PADS
2 | # Author: Jitender Aswani, Co-Founder @datadolph.in
3 | # Date: 3/15/2013
4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
6 | # All rights reserved.
7 |
8 | source("CreatePADS.R")
9 | verbose <- T
10 | #
11 | # generate
12 | #
13 | generate <- function() {
14 | #initialize system
15 | initializeSystem()
16 | assign("crime.folder.path", "./pads/raw-data/crime", envir=.GlobalEnv)
17 | assign("dataset", "US-Crime", envir=.GlobalEnv)
18 |
19 | #prepare pad meta data
20 | series <- list()
21 | series["source"] <- " BJS, FBI, Uniform Crime Reports"
22 | series["category"] <- "Social"
23 | series["subcategory"] <- "US Crime Data"
24 | series["tags"] <- tolower(paste(series$category, series$subcategory, series$source, "usdoj, Supplementary Homicide Report, USA", sep=","))
25 | series["desc"] <- "Homicide Trends in United States: Data are based on annual estimates of homicide from previously published versions of Crime in the United States. Data for 1989 to 2008 reflect updated homicide estimates from Crime in the United States, 2008. Data for 2009 and 2010 reflect updated homicide estimates from Crime in the United States, 2010."
26 | series["pagetag"] <- "crunchbase"
27 |
28 | series.data <- read.csv(paste(crime.folder.path, "1.csv", sep="/"), stringsAsFactors=F)
29 |
30 | series["title"] <- "Homicide Rate per 100,000 Population (1950-2008)"
31 | padify(series, series.data[c(1,2)])
32 |
33 | series["title"] <- "Homicide Incidents per 100,000 Population (1950-2008)"
34 | padify(series, series.data[c(1,3)])
35 |
36 | series["title"] <- "Homicide Rate & Incidents per 100,000 Population (1950-2008)"
37 | padify(series, series.data)
38 |
39 | series.data <- read.csv(paste(crime.folder.path, "2.csv", sep="/"), stringsAsFactors=F)
40 | series.data[2:4] <- data.frame(apply(series.data[2:4], 2, function(x) as.numeric(gsub("%", "", x, fixed=T))))
41 | series["title"] <- "Victims and Offenders by Age in USA, Rate per 100,000 (1980-2008)"
42 | padify(series, series.data)
43 |
44 | series.data <- read.csv(paste(crime.folder.path, "3.csv", sep="/"), stringsAsFactors=F)
45 | series.data[2:4] <- data.frame(apply(series.data[2:4], 2, function(x) as.numeric(gsub("%", "", x, fixed=T))))
46 | series["title"] <- "Victims and Offenders by Gender in USA (1980-2008)"
47 | padify(series, series.data)
48 |
49 | series.data <- read.csv(paste(crime.folder.path, "4.csv", sep="/"), stringsAsFactors=F)
50 | series.data[2:4] <- data.frame(apply(series.data[2:4], 2, function(x) as.numeric(gsub("%", "", x, fixed=T))))
51 | series["title"] <- "Victims and Offenders by Race in USA (1980-2008)"
52 | padify(series, series.data)
53 |
54 | series.data <- read.csv(paste(crime.folder.path, "2-1.csv", sep="/"), stringsAsFactors=F)
55 | series["title"] <- "Victims and Offenders by Age in USA, Rate per 100,000, (1980-2008)"
56 | padify(series, series.data)
57 |
58 | series.data <- read.csv(paste(crime.folder.path, "3-1.csv", sep="/"), stringsAsFactors=F)
59 | series["title"] <- "Victims and Offenders by Gender in USA, Rate per 100,000 (1980-2008)"
60 | padify(series, series.data)
61 |
62 | series.data <- read.csv(paste(crime.folder.path, "4-1.csv", sep="/"), stringsAsFactors=F)
63 | series["title"] <- "Victims and Offenders by Race in USA, Rate per 100,000 (1980-2008)"
64 | padify(series, series.data)
65 |
66 | # Homicide victimization rates by age, (1980-2008)
67 | series.data <- read.csv(paste(crime.folder.path, "5.csv", sep="/"), stringsAsFactors=F)
68 | title <- "Homicide Victimization Rates By Age"
69 | period <- "(1980-2008)"
70 | colnames(series.data) <- gsub("X", "ages", colnames(series.data))
71 | col.names <- colnames(series.data)
72 | for(i in 2:ncol(series.data))
73 | {
74 | series["title"] <- paste(title, ", ", col.names[i], period, sep="")
75 | print(i)
76 | padify(series, series.data[c(1,i)])
77 | }
78 | series["title"] <- "Homicide Victimization Rates for All Ages (1980-2008)"
79 | padify(series, series.data)
80 |
81 |
82 | # Homicide offending rates by age, 1980-2008
83 | series.data <- read.csv(paste(crime.folder.path, "5-1.csv", sep="/"), stringsAsFactors=F)
84 | title <- "Homicide Offending Rates By Age"
85 | period <- "(1980-2008)"
86 | colnames(series.data) <- gsub("X", "ages", colnames(series.data))
87 | col.names <- colnames(series.data)
88 | for(i in 2:ncol(series.data))
89 | {
90 | series["title"] <- paste(title, ", ", col.names[i], period, sep="")
91 | print(i)
92 | padify(series, series.data[c(1,i)])
93 | }
94 | series["title"] <- "Homicide Offending Rates for All Ages (1980-2008)"
95 | padify(series, series.data)
96 |
97 | #Average age of homicide victims and offenders, 1980-2008
98 | series.data <- read.csv(paste(crime.folder.path, "5-2.csv", sep="/"), stringsAsFactors=F)
99 | title <- "Average Age of Homicide"
100 | period <- "(1980-2008)"
101 | colnames(series.data) <- gsub("X", "ages", colnames(series.data))
102 | col.names <- colnames(series.data)
103 | for(i in 2:ncol(series.data))
104 | {
105 | series["title"] <- paste(title, col.names[i], period, sep=" ")
106 | print(i)
107 | padify(series, series.data[c(1,i)])
108 | }
109 | series["title"] <- "Average Age of Homicide Victims and Offenders (1980-2008)"
110 | padify(series, series.data)
111 |
112 | #Figure 6. Percent of homicides in which offender was known to victim by age of victim, 1980-2008
113 | series.data <- read.csv(paste(crime.folder.path, "11.csv", sep="/"), stringsAsFactors=F)
114 |   series["title"] <- "Percent of Homicides in Which Offender was Known to Victim (1980-2008)"
115 | padify(series, series.data)
116 |
117 | #Figure 7. Number of homicides of children under age 5, by race of victim, 1980-2008
118 | #*Other race includes American Indians, Alaska Natives, Asians, Hawaiians, and other Pacific Islanders.
119 | series.data <- read.csv(paste(crime.folder.path, "12.csv", sep="/"), stringsAsFactors=F)
120 | title <- "Number of homicides of children under age 5, by race of victim"
121 | period <- "(1980-2008)"
122 | col.names <- colnames(series.data)
123 | for(i in 2:ncol(series.data))
124 | {
125 | series["title"] <- paste(title, "-", col.names[i], period, sep=" ")
126 | padify(series, series.data[c(1,i)])
127 | }
128 | series["title"] <- "Number of homicides of children under age 5, by race of victim (1980-2008)"
129 | padify(series, series.data)
130 |
131 | #Figure 8. Homicide victimization rates for children under age 5 by race of victim, 1980-2008
132 | #Rate per 100,000 population Rate per 100,000 population
133 | series.data <- read.csv(paste(crime.folder.path, "12-1.csv", sep="/"), stringsAsFactors=F)
134 | title <- "Rate of homicides of children under age 5, by race of victim"
135 | period <- "(1980-2008)"
136 | col.names <- colnames(series.data)
137 | for(i in 2:ncol(series.data))
138 | {
139 | series["title"] <- paste(title, "-", col.names[i], "per 100,000 population", period, sep=" ")
140 | padify(series, series.data[c(1,i)])
141 | }
142 | series["title"] <- "Rate of homicides of children under age 5, by race of victim, per 100,000 population (1980-2008)"
143 | padify(series, series.data)
144 |
145 | #clean up
146 | cleaupSystem()
147 | updateCatPadCount()
148 | }
149 | generate()
150 |
--------------------------------------------------------------------------------
/FetchData/FetchIPLData.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | #http://www.wisdenrecords.com/Records/India/Test/Bowling/Most_Wickets_in_Career.html
8 | #http://www.wisdenrecords.com/Records/India/Test/Batting/Most_Career_Runs.html
9 | #http://www.wisdenrecords.com/Records/Pakistan/Test/Batting/Most_Career_Runs.html
10 | #http://www.wisdenrecords.com/Records/India/Test/Batting/Most_Career_Runs.html
11 | require("RJSONIO")
12 | require("plyr")
13 | library("RCurl")
14 |
15 | init <- function() {
16 | assign("folder.path", "./pads/raw-data/cricket/", envir=.GlobalEnv)
17 | assign("years", 2008:2013, envir=.GlobalEnv)
18 | assign("ipl.base.url", 'http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl', envir=.GlobalEnv)
19 | assign("ipl.end.url.runs", '/stats/player/full/mostRuns.js', envir=.GlobalEnv)
20 | assign("ipl.end.url.wickets", "/stats/player/full/mostWickets.js", envir=.GlobalEnv)
21 | assign("ipl.end.url.standings", "/groupStandings.js", envir=.GlobalEnv)
22 | }
23 |
24 | # Save the JSON
25 | saveStatsData <- function(out.file, jsonSt){
26 | file.out <- file(paste(folder.path, out.file, ".JSON", sep=""), 'wt')
27 | cat(jsonSt, file=file.out, fill = TRUE)
28 | close(file.out)
29 | }
30 |
31 | #write a csv file
32 | saveStatsDataAsCSV <- function(data, out.file){
33 | file.location <- paste(folder.path, out.file, ".csv", sep="")
34 | write.csv(data, file.location, row.names=F)
35 | }
36 |
37 | #
38 | # batting
39 | #
40 | battingStastByPlayer <- function(l){
41 | #return (data.frame(matrix(unlist(l), nrow=1, byrow=T)))
42 | return (data.frame(l$team$fullName, l$player$fullName, l$player$nationality, l$player$dateOfBirth,
43 | matrix(unlist(l$battingStats), nrow=1, byrow=T), stringsAsFactors = F))
44 | }
45 |
46 | getBattingStats <- function(ipl.json, year){
47 | ipl.batting.stats <- ldply(ipl.json$mostRuns[[1]]$topPlayers, battingStastByPlayer)
48 | colnames(ipl.batting.stats) <- c("team", "player", "country", "dob", names(ipl.json$mostRuns[[1]]$topPlayers[[1]]$battingStats))
49 | ipl.batting.stats$hs <- sapply(ipl.batting.stats$hs, function(x) sub("\\*", "", x))
50 | ipl.batting.stats$year <- year
51 | # get age
52 | ipl.batting.stats$age_in_months <- sapply(ipl.batting.stats$dob, function(x) getAge(x, paste(year, "/04/01", sep=""), "m"))
53 | return(ipl.batting.stats)
54 | }
55 |
56 | battingStats <- function(year){
57 | ipl.url <- paste(ipl.base.url, year, ipl.end.url.runs, sep="")
58 | print(ipl.url)
59 | content <- try(getURL(ipl.url), silent=F)
60 | if(class(content) %in% c("try-error")) stop("Can't continue...")
61 | content <- sub("\\);", "", sub(paste("onMostRuns", "\\(", sep=""), "", content))
62 | saveStatsData(paste("ipl", year, "batting-stats", sep="-"), content)
63 | ipl.json <- fromJSON(content)
64 | return(getBattingStats(ipl.json, year))
65 | }
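#
# Example (sketch): the feed above is JSONP - JSON wrapped in onMostRuns(...); -
# so stripping the wrapper leaves plain JSON that fromJSON() can parse.
#
if(F){
  jsonp <- 'onMostRuns({"mostRuns": []});'
  fromJSON(sub("\\);", "", sub("onMostRuns\\(", "", jsonp)))
}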
66 |
67 | battingStatsFromFiles <- function(year){
68 | file.name <- paste(folder.path, paste("ipl", year, "batting-stats", sep="-"), ".json", sep="")
69 | file.out <- file(file.name)
70 | ipl.json <- fromJSON(paste(readLines(file.out), collapse = "\n"))
71 | close(file.out)
72 | #assign("ipl", ipl.json, envir=.GlobalEnv)
73 | return(getBattingStats(ipl.json, year))
74 | }
75 |
76 | #
77 | # Bowling stats
78 | #
79 | bowlingStastByPlayer <- function(l){
80 | #return (data.frame(matrix(unlist(l), nrow=1, byrow=T)))
81 | return (data.frame(l$team$fullName, l$player$fullName, l$player$nationality, l$player$dateOfBirth,
82 | matrix(unlist(l$bowlingStats), nrow=1, byrow=T), stringsAsFactors = F))
83 | }
84 |
85 | getBowlingStats <- function(ipl.json, year){
86 | ipl.bowling.stats <- ldply(ipl.json$mostWickets[[1]]$topPlayers, bowlingStastByPlayer)
87 | colnames(ipl.bowling.stats) <- c("team", "player", "country", "dob", names(ipl.json$mostWickets[[1]]$topPlayers[[1]]$bowlingStats))
88 | ipl.bowling.stats$year <- year
89 | # get age
90 | ipl.bowling.stats$age_in_months <- sapply(ipl.bowling.stats$dob, function(x) getAge(x, paste(year, "/04/01", sep=""), "m"))
91 |
92 | return(ipl.bowling.stats)
93 | }
94 |
95 | bowlingStats <- function(year){
96 | ipl.url <- paste(ipl.base.url, year, ipl.end.url.wickets, sep="")
97 | print(ipl.url)
98 | content <- try(getURL(ipl.url), silent=T)
99 | if(class(content) %in% c("try-error")) stop("Can't continue...")
100 | content <- sub("\\);", "", sub(paste("onMostWickets", "\\(", sep=""), "", content))
101 | saveStatsData(paste("ipl", year, "bowling-stats", sep="-"), content)
102 | ipl.json <- fromJSON(content)
103 | return(getBowlingStats(ipl.json, year))
104 | }
105 |
106 | bowlingStatsFromFiles <- function(year){
107 | file.name <- paste(folder.path, paste("ipl", year, "bowling-stats", sep="-"), ".json", sep="")
108 | file.out <- file(file.name)
109 | ipl.json <- fromJSON(paste(readLines(file.out), collapse = "\n"))
110 | close(file.out)
111 | return(getBowlingStats(ipl.json, year))
112 | }
113 |
114 | #
115 | # standings
116 | #
117 | standingStatsBySeason <- function(l){
118 | return (data.frame(l$position, l$team$fullName, l$played, l$won, l$lost, l$tied, l$noResult, l$points, l$netRunRate))
119 | }
120 |
121 | getStandingStats <- function(ipl.json, year){
122 | ipl.standing.stats <- ldply(ipl.json$groups[[1]]$standings, standingStatsBySeason)
123 | colnames(ipl.standing.stats) <- c("position", "team", "played", "won", "lost", "tied", "noResult", "points", "netrunrate")
124 | ipl.standing.stats$year <- year
125 | return(ipl.standing.stats)
126 | }
127 |
128 | standingStats <- function(year){
129 | ipl.url <- paste(ipl.base.url, year, ipl.end.url.standings, sep="")
130 | print(ipl.url)
131 | content <- try(getURL(ipl.url), silent=F)
132 | if(class(content) %in% c("try-error")) stop("Can't continue...")
133 | content <- sub("\\);", "", sub(paste("onGroupStandings", "\\(", sep=""), "", content))
134 | saveStatsData(paste("ipl", year, "standings-stats", sep="-"), content)
135 | ipl.json <- fromJSON(content)
136 | #assign("ipl", ipl.json, envir=.GlobalEnv)
137 | return(getStandingStats(ipl.json, year))
138 | }
139 |
140 | standingStatsFromFiles <- function(year){
141 | file.name <- paste(folder.path, paste("ipl", year, "standings-stats", sep="-"), ".json", sep="")
142 | file.out <- file(file.name)
143 | ipl.json <- fromJSON(paste(readLines(file.out), collapse = "\n"))
144 | close(file.out)
145 | #assign("ipl", ipl.json, envir=.GlobalEnv)
146 | return(getStandingStats(ipl.json, year))
147 | }
148 |
149 |
150 | #
151 | # get IPL stats
152 | #
153 | getStats <- function(){
154 | init()
155 | #
156 | #get batting stats
157 | #
158 | batting.stats <- adply(years, 1, battingStatsFromFiles)
159 | colnames(batting.stats) <- c("season", colnames(batting.stats)[2:9], "fours", "sixes",
160 | colnames(batting.stats)[12:13], "fiftys", "hundreds",
161 | colnames(batting.stats)[16:ncol(batting.stats)])
162 |
163 | saveStatsDataAsCSV(batting.stats, "IPL-T20-Batting-Stats-For-All-Seasons")
164 |
165 | #
166 | #get bowling stats
167 | #
168 | bowling.stats <- adply(years, 1, bowlingStatsFromFiles)
169 | colnames(bowling.stats) <- c("season", colnames(bowling.stats)[2:10], "fours",
170 | "sixes", colnames(bowling.stats)[13:21], "four_wickets",
171 | "five_wickets", "ten_wickets",
172 | colnames(bowling.stats)[25:ncol(bowling.stats)])
173 |
174 | saveStatsDataAsCSV(bowling.stats, "IPL-T20-Bowling-Stats-For-All-Seasons")
175 |
176 | #
177 | # get group stats
178 | #
179 | group.standings <- adply(years, 1, standingStatsFromFiles)
180 | colnames(group.standings) <- c("season", colnames(group.standings)[2:ncol(group.standings)])
181 | saveStatsDataAsCSV(group.standings, "IPL-T20-Standing-Stats-For-All-Seasons")
182 | }
183 |
184 | if(F){
185 |
186 | #http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2013/groupStandings.js
187 | #http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2012/groupStandings.js
188 |
189 | ipl.json <- fromJSON(paste(readLines(paste(folder.path, "ipl-2013-sample-bat.json", sep="")), collapse = "\n"))
190 | ipl <- read.csv(paste(folder.path, "IPLstats.csv", sep=""), stringsAsFactors=F)
191 |
192 | #http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2012/stats/player/full/mostWickets.js
193 | for(y in years){
194 | aaply(ipl, 1, getStat, y)
195 | }
196 |
197 | ipl.url <- "http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2012/stats/player/full/mostRuns.js"
198 | content <- getURL(ipl.url)
199 | content <- sub("\\);", "", sub("onMostRuns\\(", "", content))
200 |
201 | theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
202 | tables <- readHTMLTable(theurl)
203 | n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
204 | tables[[which.max(n.rows)]]
205 | colnames(ipl.batting.stats) <- sapply(names(unlist(ipl.json$mostRuns[[1]]$topPlayers[[1]])), function(x) {gsub("(.*)\\.", "",x)})
206 | gsub("(.*)\\.", "", "team.fullName")
207 | }
208 |
209 |
210 |
--------------------------------------------------------------------------------
/FetchData/FetchImmigrationData.R:
--------------------------------------------------------------------------------
1 | require(RCurl)
2 | require(data.table)
3 | require(plyr)
4 | setwd("/Users/homemac/dolphy/R/pads/raw-data/immigration")
5 |
6 |
7 | fetchData <- function(url,yr) {
8 | #url <- "http://www.flcdatacenter.com/download/H1B_efile_FY02_text.zip"
9 | #yr <- 2
10 | #temp <- tempfile()
11 | #tempdir()
12 | temp <- paste("temp", yr, ".zip", sep="_")
13 | download.file(url,temp)
14 | #data <- read.csv(unz(temp, read.file))
15 | #write.csv(data,csv.file, row.names=F)
16 | #l.f <- unzip("H1B_efile_FY03_text.zip") #unzips the file
17 | unzip(temp) #unzips the file
18 | unlink(temp)
19 | #"EFILE_FY2006.txt"
20 | #H1B_efile_FY02.txt
21 | }
22 |
23 | fy <- 5:6
24 | u <- paste("http://www.flcdatacenter.com/download/H1B_efile_FY0", fy, "_text.zip", sep="")
25 | sapply(seq_along(u), function(i) fetchData(u[i], fy[i]))
26 |
27 | readTextData <- function() {
28 | filenames <- list.files(path=".", pattern="*.txt")
29 |   for(i in filenames) {
30 | print(i)
31 | #yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(\\./)(.*)_(.*)_(.*)(\\.txt)", "\\4", i)))
32 | yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(.*)_(.*)_(.*)(\\.txt)", "\\3", i)))
33 | filename <- gsub("(.*)(\\.txt)", "\\1", i)
34 | print(yr)
35 | print(filename)
36 | pad <- data.table(read.csv(i, stringsAsFactors=F))
37 | col.names <- colnames(pad)
38 | setnames(pad, col.names, tolower(col.names))
39 | #colnames(pad) <- tolower(colnames(pad))
40 | write.csv(pad,paste(filename, ".csv", sep=""), row.names=F)
41 | }
42 | }
43 |
44 | readCSVData <- function() {
45 | filenames <- list.files(path=".", pattern="*.csv")
46 |
47 | for(i in filenames) {
48 | print(i)
49 | #yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(\\./)(.*)_(.*)_(.*)(\\.txt)", "\\4", i)))
50 | yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(.*)_(.*)_(.*)(\\.csv)", "\\3", i)))
51 | pad <- data.table(read.csv(i, stringsAsFactors=F))
52 | col.names <- colnames(pad)
53 | setnames(pad, col.names, tolower(col.names))
54 | generateStats(pad, yr)
55 | }
56 | }
57 |
58 | generateStats <- function(dt, yr) {
59 | #
60 | # by total
61 | #
62 | dt.total <- dt[,list(year=yr, total=length(approval_status),
63 | approved=length(which(approval_status=="Certified")),
64 | denied=length(which(approval_status=="Denied")),
65 | hold=length(which(approval_status=="Hold")),
66 | pending=length(which(approval_status=="Pending")))]
67 |
68 | # save this to csv
69 | write.csv(dt.total, paste("US_H1B_breakdown_by_status_", yr, ".csv", sep=""), row.names=F)
70 | # bind this to csv
71 | H1B.total <<- rbind(H1B.total, dt.total)
72 |
73 | #
74 | # by company
75 | #
76 | dt.by.company <- dt[, list(year=yr, total=length(approval_status),
77 | approved=length(which(approval_status=="Certified")),
78 | denied=length(which(approval_status=="Denied")),
79 | hold=length(which(approval_status=="Hold")),
80 | pending=length(which(approval_status=="Pending"))),
81 | by=list(employer=name)][order(-total)]
82 | # save this to csv
83 | write.csv(dt.by.company[1:15,], paste("US_H1B_top_companies_", yr, ".csv", sep=""), row.names=F)
84 | # bind this to csv
85 | H1B.by.company <<- rbind(H1B.by.company, dt.by.company)
86 |
87 | #
88 | # by state
89 | #
90 | dt.by.state <- dt[,list(year=yr, total=length(approval_status),
91 | approved=length(which(approval_status=="Certified")),
92 | denied=length(which(approval_status=="Denied")),
93 | hold=length(which(approval_status=="Hold")),
94 | pending=length(which(approval_status=="Pending"))),
95 | by=state][order(-total)]
96 | # save this to csv
97 | write.csv(dt.by.state[1:15,], paste("US_H1B_top_states_", yr, ".csv", sep=""), row.names=F)
98 | # bind this to csv
99 | H1B.by.state <<- rbind(H1B.by.state, dt.by.state)
100 |
101 | #
102 | # by city
103 | #
104 |
105 | dt.by.city <- dt[,list(year=yr, total=length(approval_status),
106 | approved=length(which(approval_status=="Certified")),
107 | denied=length(which(approval_status=="Denied")),
108 | hold=length(which(approval_status=="Hold")),
109 | pending=length(which(approval_status=="Pending"))),
110 | by=city][order(-total)]
111 | # save this to csv
112 | write.csv(dt.by.city[1:15,], paste("US_H1B_top_cites_", yr, ".csv", sep=""), row.names=F)
113 | # bind this to csv
114 | H1B.by.city <<- rbind(H1B.by.city, dt.by.city)
115 |
116 | #
117 | # by title
118 | #
119 | dt.by.title <- dt[,list(year=yr, total=length(approval_status),
120 | approved=length(which(approval_status=="Certified")),
121 | denied=length(which(approval_status=="Denied")),
122 | hold=length(which(approval_status=="Hold")),
123 | pending=length(which(approval_status=="Pending"))),
124 | by=job_title][order(-total)]
125 | # save this to csv
126 | write.csv(dt.by.title[1:15,], paste("US_H1B_top_titles_", yr, ".csv", sep=""), row.names=F)
127 | # bind this to csv
128 | H1B.by.title <<- rbind(H1B.by.title, dt.by.title)
129 |
130 |
131 |
132 | #contribs <- read.csv("path/to/file", colClasses=c(CTRIB_AMT="Currency"))
133 |
134 | #
135 | # by wage
136 | #
137 | dt$wage_offered <- as.numeric(gsub("[\\$,]","",dt$wage_rate_1))
138 | dt$prevailing_wages <- as.numeric(gsub("[\\$,]","",dt$prevailing_wage_1))
139 | dt.by.wage <- dt[,list(year=yr, wage_offered,
140 | prevailing_wages, job_title, employer=name),
141 | by=list(rate_per_1=="Year",
142 | approval_status=="Certified")][,rate_per_1:=NULL][,approval_status:=NULL][order(-wage_offered)][wage_offered < 200000]
143 | # save this to csv
144 | write.csv(dt.by.wage, paste("US_H1B_top_wages_", yr, ".csv", sep=""), row.names=F)
145 | # bind this to csv
146 | H1B.by.wages <<- rbind(H1B.by.wages, dt.by.wage)
147 | }
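#
# Example (sketch): the grouped-count pattern used above, shown on a toy table.
# Only data.table is assumed; column names are illustrative.
#
if(F){
  toy <- data.table(employer=c("A", "A", "B"),
                    approval_status=c("Certified", "Denied", "Certified"))
  toy[, list(total=.N,
             approved=sum(approval_status == "Certified"),
             denied=sum(approval_status == "Denied")),
      by=employer][order(-total)]
}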
148 |
149 | H1B.total <- data.table()
150 | H1B.by.company <- data.table()
151 | H1B.by.state <- data.table()
152 | H1B.by.city <- data.table()
153 | H1B.by.title <- data.table()
154 | H1B.by.wages <- data.table()
155 |
156 | readCSVData()
157 | write.csv(H1B.total, "US_H1B_breakdown_by_status_2002-2007.csv", row.names=F)
158 | write.csv(H1B.by.company, "US_H1B_top_companies_2002-2007.csv", row.names=F)
159 | write.csv(H1B.by.state, "US_H1B_top_states_2002-2007.csv", row.names=F)
160 | write.csv(H1B.by.city, "US_H1B_top_cities_2002-2007.csv", row.names=F)
161 | write.csv(H1B.by.title, "US_H1B_top_titles_2002-2007.csv", row.names=F)
162 | write.csv(H1B.by.wages, "US_H1B_top_wages_2002-2007.csv", row.names=F)
163 |
164 |
165 | #lt <- count(ldf2, "approval_status")
166 |
167 | # 'x' %in% colnames(df)
168 | lt2 <- as.data.frame.matrix(with(ldf2, table(name, approval_status)))
169 | lt2$company <- rownames(lt2)
170 | rownames(lt2) <- seq(1:nrow(lt2))
171 |
172 |
173 |
174 | u <- "http://www.omegahat.org/RCurl/data.gz"
175 |
176 |
177 |
178 |
179 | setClass("Currency")
180 | setAs("character", "Currency",
181 | function(from) as.numeric(sub("$","",from, fixed=TRUE)))
182 |
183 | setClass("CurrencyCommas")
184 | setAs("character", "CurrencyCommas",
185 | function(from) as.numeric(gsub(",", "", from) ) )
186 | gsub("[\\$,]","","$1,075.20 ")
187 | if(url.exists(u)) {
188 | content <- getBinaryURL(u) #read in the https as a binary
189 | dump.file <- file("temp.zip", open = "wb")
190 | writeBin(content, dump.file)
191 | close(dump.file) #use this to close the connection
192 | l <- unzip("temp.zip", list=TRUE) #unzips the file
193 | #dir() #Look at files in the directory. We want "SRC2010.mdb"
194 | unlink("temp.zip") #delete the zip file
195 | # shell.exec("SRC2010.mdb")
196 | }
197 | library(RODBC)
198 | channel <- odbcConnectAccess("http://www.foreignlaborcert.doleta.gov/pdf/quarter_4_2011/H-1b_iCert_LCA_FY2011_Q4.mdb")
199 | # note: odbcConnectAccess() expects a local file path - download the .mdb first
200 | demographics <- sqlFetch(channel, sqtable="Demographic Factors", colnames = FALSE, rownames = FALSE)
201 | ave.class.sz <- sqlFetch(channel, sqtable="Average Class Size", colnames = FALSE, rownames = FALSE)
202 | att.susp <- sqlFetch(channel, sqtable="Attendance and Suspensions", colnames = FALSE, rownames = FALSE)
203 | drop.out <- sqlFetch(channel, sqtable="High School Completers", colnames = FALSE, rownames = FALSE)
204 | staff <- sqlFetch(channel, sqtable="Staff", colnames = FALSE, rownames = FALSE)
205 | close(channel)
206 |
207 |
208 | ## method 2 using curl
209 | CAINFO = paste(system.file(package="RCurl"), "/CurlSSL/ca-bundle.crt", sep = "")
210 |
211 | cookie = 'cookiefile.txt'
212 | curlH = getCurlHandle(
213 | cookiefile = cookie,
214 | useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en - US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6",
215 | header = FALSE,
216 | verbose = TRUE,
217 | netrc = TRUE,
218 | maxredirs = as.integer(20),
219 | followlocation = TRUE,
220 | ssl.verifypeer = TRUE)
221 |
222 |
223 | destfile = "log2.csv"
224 | content = getBinaryURL(url, curl = curlH, cainfo = CAINFO)
225 | ## write to file
226 | writeBin(content, destfile)
227 | ## read from binary object
228 |
229 | ## read from binary object
230 | csv.data2 <- read.csv(textConnection(rawToChar(content)))
231 | head(csv.data2)
232 | csv.data2 == csv.data
--------------------------------------------------------------------------------
/CreatePADS.R:
--------------------------------------------------------------------------------
1 | # Read a CSV file
2 | # Classify the column types
3 | # Determine measures and dimensions
4 | # Generate meta data and store JSON
5 | # Generate a Solr index document and store the information in Solr
6 | # Save the data as cache
7 |
8 | # Author: Jitender Aswani, Co-Founder @datadolph.in
9 | # Date: 3/15/2013
10 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
11 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
12 | # All rights reserved.
13 |
14 | require("RJSONIO")
15 | require("plyr")
16 | require("data.table")
17 | require("stringr")
18 | require("lubridate")
19 | require("zoo")
20 |
21 | source("UtilPADS.R")
22 | source("ClassifyData.R")
23 | source("MySQLFunctions.R")
24 | source("MongoFns.R")
25 | source("DefaultChartForPAD.R")
26 |
27 | # initializeSystem <- function(stack=0){
28 | #
29 | # # turn off scientific formatting - getOption("scipen")
30 | # options(scipen=999)
31 | #
32 | # #initialize solr docs
33 | # #assign("solr.index.docs", list(), envir=.GlobalEnv)
34 | #
35 | # #mongo
36 | # mongo <- getDefaultMongoDBCon()
37 | # assign("mongo", mongo, envir=.GlobalEnv)
38 | #
39 | # #initialize stack
40 | # mysql.db <- list(user="ddfin_dev", pass="BfNdW87Ym9FmcYj7", name="ddfin_dev", host="localhost")
41 | # if(stack == 1) {
42 | # #prod
43 | # mysql.db <- list(user="ddfin_prod", pass="8MQ5CRDzQufHKXTx", name="ddfin_prod", host="localhost")
44 | # }
45 | # assign("mysql.db", mysql.db, envir=.GlobalEnv)
46 | #
47 | # #mysql
48 | # mysql <- getMSDBCon()
49 | # assign("mysql", mysql, envir=.GlobalEnv)
50 | #
51 | # #error counter
52 | # assign("error.count", 0, envir=.GlobalEnv)
53 | #
54 | # #initialize log file
55 | # assign("vec.log", vector(), envir=.GlobalEnv)
56 | #
57 | # #verbose mode
58 | # assign("verbose", F, envir=.GlobalEnv)
59 | #
60 | # #min log
61 | # assign("log.all", F, envir=.GlobalEnv)
62 | # }
63 |
64 | #
65 | # loadStates
66 | #
67 | loadStates <- function(){
68 | states <- data.table(readFile("./pads/raw-data/states_names.csv"))
69 | setkey(states, state_code)
70 | assign("states", states, envir=.GlobalEnv)
71 | }
72 |
73 |
74 |
75 | #
76 | # Initialize Padification Process
77 | #
78 | initializeSystem <- function(stack=0){
79 |
80 | # turn off scientific formatting - getOption("scipen")
81 | options(scipen=999)
82 |
83 |   # day and month factors
84 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December")
85 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
86 | assign("lMonths", lMonths, envir=.GlobalEnv)
87 | assign("lDays", lDays, envir=.GlobalEnv)
88 |
89 | #loadStates
90 | loadStates()
91 |
92 | #initialize solr docs
93 | #assign("solr.index.docs", list(), envir=.GlobalEnv)
94 |
95 | #mongo
96 | mongo <- getDefaultMongoDBCon()
97 | assign("mongo", mongo, envir=.GlobalEnv)
98 |
99 | #initialize stack
100 | mysql.db <- list(user="ddfin_dev", pass="b31nd1$$", name="ddfin_dev", host="localhost")
101 | if(stack == 1) {
102 | #prod
103 | mysql.db <- list(user="ddfin_prod", pass="g0f0r1t", name="ddfin_prod", host="localhost")
104 | }
105 | assign("mysql.db", mysql.db, envir=.GlobalEnv)
106 |
107 | #mysql
108 | mysql <- getMSDBCon()
109 | assign("mysql", mysql, envir=.GlobalEnv)
110 |
111 | #error counter
112 | assign("error.count", 0, envir=.GlobalEnv)
113 |
114 | #initialize log file
115 | assign("vec.log", vector(), envir=.GlobalEnv)
116 |
117 | #verbose mode
118 | assign("verbose", F, envir=.GlobalEnv)
119 |
120 | #min log
121 | assign("log.all", F, envir=.GlobalEnv)
122 | }
123 |
124 | #
125 | #cleanup
126 | #
127 | cleaupSystem <- function() {
128 |
129 | #disconnect Mongo
130 | disconnectMongoDB(mongo)
131 |
132 | # disconnect msdb
133 | disconnectMSDB(mysql)
134 |
135 | #save the logs
136 | dumpLogs()
137 |
138 | # save the solr index to FS for later editing - this is now done through mysql
139 | #persistSolrIndex(solr.index.docs, dataset)
140 | }
141 |
142 | #
143 | # create pad - the padification process starts here
144 | #
145 | createPAD <- function(pad, title, desc, category, subcategory, data.source, source.file,
146 | tags, pagetag) {
147 |
148 | if(ncol(pad) == 0) stop('dolphy can not padify a pad that has zero columns')
149 |
150 |   #only continue if there is one or more columns
151 | if(verbose) print(paste("Starting the padification process for source file", source.file, sep=":"))
152 | logMessage(paste("Starting the padification process for source file", source.file, sep=":"))
153 |
154 | #Very first task, classify pad
155 | pad.classified <- classifyData(pad)
156 |
157 | #if(verbose) print(pad.classified)
158 |
159 | # Generate extra date columns
160 | #if(length(which(col.classes == "Date")) > 0) {
161 | # pad.classified <- generateExtraDateCols(pad.classified, which(col.classes == "Date"))
162 | # col.classes <- laply(pad.classified, getClass)
163 | #}
164 |
165 | meta.data <- list(id="", "title"=title, "desc"=desc,
166 | "records"=as.integer(0),
167 | "columns"=as.integer(0),
168 | "category"=category,
169 | "subcategory"=subcategory,
170 | "src"=data.source,
171 | "src_file"= source.file,
172 | "cache_location"="",
173 | "dList"=list(), "mList"=list(),
174 | "tags"=tags,
175 | "pagetag"=pagetag
176 | )
177 |
178 | #Start filling up the meta data
179 | meta.data$records <- nrow(pad.classified)
180 | meta.data$columns <- ncol(pad.classified)
181 |
182 | #Replace all meta chars from column names including spaces
183 | colnames(pad.classified) <- replaceMetaChars(colnames(pad.classified))
184 |
185 | col.classes <- laply(pad.classified, getClass)
186 | col.names <- colnames(pad.classified)
187 |
188 | #Create a dict (name=value pair)
189 | cols <- list()
190 | cols[col.names]=col.classes
191 | meta.data$dList <- cols[which(unlist(cols)!="numeric")]
192 | meta.data$mList <- cols[which(unlist(cols)=="numeric")]
193 | #meta.data$mList <- names(which(sapply(pad.classified, is.numeric)))
194 | #meta.data$dList <- setdiff(colnames(pad.classified), meta.data$mList)
195 |
196 | # Get unique id for pad
197 | padID <- getPadUID()
198 |   #print(paste("Assigning pad id of ", padID, sep="::"))
199 | logMessage(paste("Assigning pad id of ", padID, sep="::"))
200 | meta.data$id <- padID
201 | #meta.data$meta_location <- getMetaDataFileURL(padID)
202 | meta.data$cache_location <- getCacheDataFileURL(padID)
203 | #meta.data$data_location <- getDataFileURL(padID)
204 |
205 | #Save transformed dataset as data.frame for later reading
206 | assign(padID, pad.classified)
207 | names <- c(eval(padID))
208 |
209 | #get default chart info....
210 | meta.data <- getDefaultChartInfo(pad.classified, meta.data)
211 |
212 |   #Save the cache
213 | if(verbose) print("saving cache")
214 | if(log.all) logMessage("Saving cache...")
215 | save(list=names, file=meta.data$cache_location)
216 |
217 | #Save meta data
218 | #jsonSt <- toJSON(meta.data)
219 | #saveMetaData(meta.data$meta_location, jsonSt)
220 |
221 | #Save underlying data into csv
222 | #saveData(pad.classified, meta.data$data_location)
223 |
224 | if(verbose) print("Padification Completed!")
225 | logMessage("Padification Completed!")
226 | return(meta.data)
227 | }
228 |
229 | #
230 | # padify
231 | #
232 | padify <- function(series, series.data, x.plot.band=NULL, y.plot.band=NULL){
233 | #Remove rows that have NAs
234 | series.data <- series.data[rowSums(is.na(series.data)) != ncol(series.data),]
235 |
236 | # Remove rows that have at least 1 NA
237 | series.data <- series.data[complete.cases(series.data),]
238 |
239 | if("data.table" %in% class(series.data))
240 |     series.data <- as.data.frame(series.data, stringsAsFactors=F)
241 | #create pad
242 | pmd <- try(createPAD(series.data, series$title,
243 | series$desc, series$category, series$subcategory,
244 | series$source, paste(series$name, series$country, sep="."),
245 | series$tags, series$pagetag), silent=T)
246 |
247 | #assign("pmd", pmd, envir=.GlobalEnv)
248 | #
249 | # Add this pad to system pads table in the database, to mongo db and save the cache
250 | #
251 | #check to see if the padification went through
252 | if(class(pmd) %in% c("try-error")) {
253 |     if(verbose) print("couldn't complete the padification process.")
254 | logMessage("couldn't padify the pad...")
255 | assign("error.count", error.count+1, envir=.GlobalEnv)
256 | } else {
257 | if(!is.null(x.plot.band))
258 | pmd$default$xaxis$plotbands <- x.plot.band
259 |
260 | if(!is.null(y.plot.band))
261 | pmd$default$yaxis$plotbands <- y.plot.band
262 |
263 | if(verbose) print("adding the pad to mysql")
264 | if(log.all) logMessage("Saving PAD meta data in the database...")
265 | insertPadToMySQL(pmd)
266 |
267 | # insert into mongo
268 | if(verbose) print("adding the pad to mongo db")
269 |     if(log.all) logMessage("adding the pad to mongo db")
270 | pad <- fromJSON(toJSON(pmd))
271 |
272 | insertPadToMongo(pmd$id, pad)
273 | return (pmd$id)
274 | # #generate solr doc
275 | # if(verbose) print("adding the pad to solr vector")
276 | # if(log.all) print("adding the pad to mongo")
277 | # doc <- list("type"="pad", "id" = pmd$id, "title" = pmd$title, "desc"=pmd$desc, "category"=pmd$category,
278 | # "subcategory"=pmd$subcategory, "tags"=pmd$tags, "author"="system")
279 | # solr.index.docs[[length(solr.index.docs)+1]] <<- doc
280 | }
281 | }
--------------------------------------------------------------------------------
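A minimal usage sketch of the entry points above, assuming this file and the database/utility helpers it calls (getMSDBCon, insertPadToMySQL, insertPadToMongo, getDefaultChartInfo, the Util.R functions) are sourced and reachable; the series descriptor fields mirror what padify()/createPAD() read, and every example value is hypothetical:

# illustrative sketch only -- all values below are hypothetical
initializeSystem(stack=0)   # dev stack: connect mongo/mysql, reset error counter, log vector, flags

# hypothetical series descriptor; the field names are the ones padify()/createPAD() read
series <- list(title="US unemployment rate", desc="Annual unemployment rate, percent",
               category="Economy", subcategory="Labor", source="FRED",
               name="UNRATE", country="US", tags="unemployment,labor", pagetag="economy")
series.data <- data.frame(year=c("2010", "2011", "2012"),
                          rate=c(9.6, 8.9, 8.1), stringsAsFactors=FALSE)

pad.id <- padify(series, series.data)   # classify columns, save the cache, insert into mysql and mongo
cleaupSystem()                          # disconnect both databases and dump the logs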
/ClassifyData.R:
--------------------------------------------------------------------------------
1 | # Classify Data
2 | # Author: Jitender Aswani, Co-Founder @datadolph.in
3 | # Date: 3/15/2013
4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
6 | # All rights reserved.
7 |
8 |
9 |
10 | #
11 | # Generate extra date cols
12 | #
13 | generateExtraDateCols <- function(data, col.index) {
14 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December")
15 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
16 | #data$Day_generated=factor(weekdays(data[,col.index]), levels=lDays, ordered=TRUE)
17 | data$Day_g=factor(weekdays(data[,col.index]), levels=lDays)
18 | #data$Month_generated=factor(months(data[,col.index]), levels=lMonths, ordered=TRUE)
19 | data$Month_g=factor(months(data[,col.index]), levels=lMonths)
20 | data$Quarter_g=as.factor(quarters(data[,col.index]))
21 | data$Year_g=as.factor(years(data[,col.index]))
22 | #data$Year=as.numeric(format(data[,col.index], "%Y"))
23 | #as.numeric(format(date1, "%m"))
24 | #format(date1, "%b") ## Month, char, abbreviated
25 | return(data)
26 | }
27 | #
28 | # Define all supported date formats
29 | #
30 | getAllowedDateFormats <- function() {
31 | #d = '([0 ]?[1-9]|[12][0-9]|3[01])' # 1-31 with leading zero or space for single digits
32 | #m = '([0 ]?[1-9]|1[0-2])' # 01-12 with leading zero or space for single digits
33 | d = '([0][1-9]|[12][0-9]|3[01])' # 1-31 with leading zero for single digits
34 | m = '([0][1-9]|1[0-2])' # 01-12 with leading zero for single digit
35 | Y = '(17[0-9][0-9]|18[0-9][0-9]|19[0-9][0-9]|20[0-9][0-9])' # 1700 - 2099
36 | #Y = '(19[0-9][0-9]|20[0-9][0-9])' # 1900 - 2099
37 | y = '([0-9][0-9])' # 00-99
38 | b='([a-zA-Z]{3})'
39 | B='([a-zA-Z]+)'
40 | #q = '([qQ][1-4])'
41 | #patterns <- c("%Y%m%d", "%m%d%y", "%m%d%Y", "%d%m%y", "%d%m%Y", "%y%m%d")
42 | #patterns <- c("Ymd", "mdy", "mdY", "dmy", "dmY", "ymd", "y", "Y", "by", "bY", "By", "BY")
43 |   #removed %y from the list to avoid numeric columns being identified as dates; moved %Y to the front to avoid a 4-digit year being classified as %d%m%y
44 | patterns <- c("Y","Ymd", "mdy", "mdY", "dmy", "dmY", "ymd","by", "bY", "By", "BY")
45 | #patterns <- c("Y")
46 | separators <- c("", " ", "/", "-", ".")
47 | #strptime {base} /^[a-zA-Z]{3}\s+[0-9]{2}$/
48 | allowed.date.list <- list()
49 |
50 | for (i in 1:length(patterns)) {
51 | for (j in 1:length(separators)) {
52 | p <- patterns[i]
53 | s <- separators[j]
54 | pattern.length <- nchar(p)
55 | chars <- substring(p,seq(1, pattern.length), seq(1, pattern.length))
56 | m.chars <- paste("%", chars, sep="")
57 | format <- paste(m.chars,collapse=s)
58 | if(!s %in% c("", " ")){
59 | s <- paste("\\", s, "", sep="" )
60 | }
61 | regex <- ''
62 | if( pattern.length == 3) {
63 | regex <- paste('^', eval(get(chars[1])), s, eval(get(chars[2])), s, eval(get(chars[3])), '$', sep="")
64 | } else if ( pattern.length == 2) {
65 | regex <- paste('^', eval(get(chars[1])), s, eval(get(chars[2])),'$', sep="")
66 | } else if ( pattern.length == 1) {
67 | regex <- paste('^', eval(get(chars[1])),'$', sep="")
68 | }
69 | allowed.date.list[[regex]] = format
70 | }
71 | }
72 | return (allowed.date.list)
73 | }
74 |
75 | #
76 | # get date format for sample of values
77 | #
78 | getDateFormat <- function(sample.dates) {
79 | allowed.date.list <- getAllowedDateFormats()
80 | #sample.dates <- sample.dates[!is.na(sample.dates)]
81 | for(i in 1:length(allowed.date.list)) {
82 | #Check how many items matced a date pattern
83 | matched <- sum(grepl(names(allowed.date.list)[i], sample.dates))
84 | if(matched == length(sample.dates)) {
85 | #print(names(allowed.date.list)[i])
86 | return(allowed.date.list[[i]])
87 | }
88 | }
89 | return(NULL)
90 | }
91 |
92 | #Test this function
93 | testDates <- function() {
94 | #dates <- c("March 2010","December 2011", "February 2012", "March 2014", "Jan 2013")
95 | dates <- c("1962", "1949", "1932", "1963", "1925")#, 1937, 1912, 1944, 1930, 1958, 1939, 1919, 1915)
96 | #dates <- seq(as.Date("2000/1/1"), by="month", length.out=1000)
97 | #Remove NA: dates <- dates[!is.na(dates)]
98 | date.format <- getDateFormat(dates)
99 | #d <- as.Date(dates, date.format)
100 | if(date.format %in% c("%y", "%Y")) {
101 | dates = as.Date(paste("1 1 ", dates, sep=""), format=paste("%d %m ", date.format, sep=""))
102 | } else if (date.format %in% c("%b%y","%b %y", "%b/%y", "%b-%y", "%b.%y",
103 | "%b%Y", "%b %Y", "%b/%Y", "%b-%Y", "%b.%Y",
104 | "%B%y", "%B %y", "%B/%y", "%B-%y", "%B.%y",
105 | "%B%Y", "%B %Y", "%B/%Y", "%B-%Y", "%B.%Y")) {
106 | sep <- gsub("%[a-zA-Z]{1}(.*)%[a-zA-Z]{1}", "\\1", date.format)
107 | dates = as.Date(paste("1", dates, sep=sep), format=paste("%d", date.format, sep=sep))
108 |
109 | } else {
110 | dates = as.Date(dates, format=date.format)
111 | }
112 | }
113 |
114 | #
115 | # Test if date
116 | #
117 | isDate <- function(col, colname="date") {
118 | allowed.date.headers <- c("date", "time", "index")
119 | #if(tolower(colname) %in% allowed.date.headers || grepl("[date]")) {
120 | # print("date found in the header")
121 | #}
122 | return (getDateFormat(col))
123 | }
124 |
125 | #
126 | # Test if logical
127 | #
128 | isColLogical <- function(col) {
129 | if(length(levels(factor(col))) == 2) {
130 | bool.types <- c('true', 'false', 't', 'f', '1', '0', 'y', 'n', 'yes', 'no')
131 | countBool <- sum(sapply(tolower(col), function(x) x %in% bool.types), na.rm=TRUE)
132 | if (countBool == length(col))
133 | return(TRUE)
134 | else
135 | return(FALSE)
136 | } else
137 | return (FALSE)
138 | }
139 |
140 | #
141 | #test if number
142 | #
143 | isNumber <- function(col) {
144 | countBool <- sum(sapply(col, is.numeric), na.rm=TRUE)
145 | if (countBool == length(col))
146 | return(TRUE)
147 | else
148 | return(FALSE)
149 | }
150 |
151 | #
152 | #Main function which takes a data.frame and attempts to classify each of its columns...
153 | #
154 | classifyData <- function(data) {
155 | #Default sample size
156 | sample.size <- 30
157 | if(nrow(data) <= 30)
158 | sample.size <- nrow(data)
159 |
160 | #Remove rows that have NA
161 |   if(ncol(data) == 1) { # this is an interesting one - if the pad has only one column, subsetting returns a vector rather than a data.frame, hence this branch
162 | col.names <- colnames(data)
163 | data <- as.data.frame(data[rowSums(is.na(data)) != ncol(data),])
164 |
165 | #Take a small sample
166 | data.sample <- as.data.frame(data[sample(1:nrow(data)[1], size=sample.size, replace=FALSE),])
167 | colnames(data) <- col.names
168 | } else {
169 | data <- data[rowSums(is.na(data)) != ncol(data),]
170 | #Take a small sample
171 | data.sample <- data[sample(1:nrow(data)[1], size=sample.size, replace=FALSE),]
172 | }
173 | cls <- sapply(data.sample, class)
174 | #print(cls)
175 | classified <- list()
176 | col.names <- colnames(data.sample)
177 | for (i in 1:ncol(data.sample)) {
178 |
179 | col <- data.sample[,i]
180 | col <- col[!is.na(col)] #remove NA
181 |
182 | #check for boolean
183 | if(isColLogical(col)) {
184 | #storage.mode(data[,i]) = "logical"
185 | data[,i] = as.factor(data[,i])
186 | #classes.classified[[i]] = "logical"
187 | next
188 | }
189 |
190 | #check for date names (in date, time)
191 | date.format <- isDate(col)
192 | #print(date.format)
193 | if(!is.null(date.format)) {
194 | if(date.format %in% c("%y", "%Y")) {
195 | data[,i] = as.Date(paste("1 1 ", data[,i], sep=""), format=paste("%d %m ", date.format, sep=""))
196 | } else if (date.format %in% c("%b%y","%b %y", "%b/%y", "%b-%y", "%b.%y",
197 | "%b%Y", "%b %Y", "%b/%Y", "%b-%Y", "%b.%Y",
198 | "%B%y", "%B %y", "%B/%y", "%B-%y", "%B.%y",
199 | "%B%Y", "%B %Y", "%B/%Y", "%B-%Y", "%B.%Y")) {
200 | sep <- gsub("%[a-zA-Z]{1}(.*)%[a-zA-Z]{1}", "\\1", date.format)
201 | data[,i] = as.Date(paste("1", data[,i], sep=sep), format=paste("%d", date.format, sep=sep))
202 | } else {
203 | data[,i] = as.Date(data[,i], format=date.format)
204 |         data <- generateExtraDateCols(data, i)
205 | }
206 | next
207 | }
208 |
209 | #check for int
210 | if (isNumber(col)){
211 | data[,i] = as.numeric(data[,i])
212 | next
213 | }
214 | #check for factor
215 |
216 | }
217 | if(verbose) print("Classification Done!")
218 | logMessage("Classification Done!")
219 |
220 | return(data)
221 | }
222 |
223 |
224 | testClassification <- function() {
225 |
226 | #l <- c(1,0,1,0,0,0,1,0)
227 | #z <- sample(c(TRUE,FALSE),1000000,rep=TRUE)
228 | #z <- tolower(z)
229 |   #Generates a matrix
230 | #date.patterns <- unlist(
231 | # lapply(separtors, function(y)
232 | # lapply(patterns, function(x) paste(substring(x,seq(1,nchar(x), 2), seq(2,nchar(x), 2)),
233 | # collapse=y)
234 | # )))
235 |
236 | #sample.df <- data.frame(bigmove=sample(c("y","n"),1000,rep=TRUE),
237 | # date=seq(as.Date("2000/1/1"), by="month", length.out=1000),
238 | # value=rep(1:100,100), stringsAsFactors = FALSE)
239 |
240 | sample.df <- data.frame(dates=c("1-31-2012", "1-1-2011", "1-1-2010", "1-1-2009"), pass=c(1,0,1,0), name=c("a", "bsd", "adaads", "dafds"), stringsAsFactors = FALSE)
241 | print(sapply(sample.df, class))
242 | data <- classifyData(sample.df)
243 | print(sapply(data, class))
244 | }
245 |
246 | if(FALSE)
247 | {
248 |   #Symbol   Meaning                     Example
249 |   #%d       day of month (01-31)        01-31
250 |   #%a       abbreviated weekday         Mon
251 |   #%A       unabbreviated weekday       Monday
252 |   #%m       month as a number (01-12)   01-12
253 |   #%b       abbreviated month           Jan
254 |   #%B       unabbreviated month         January
255 |   #%y       2-digit year                07
256 |   #%Y       4-digit year                2007
257 |   #
258 |   # metaChar = c("$","*","+",".","?","[","^","{","|","(","\\")
259 |   #
260 |   #`~!@#$%^&*()_|+\-=?;:'",.<>\{\}\[\]\\\/]
261 | 
262 | }
--------------------------------------------------------------------------------
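For a quick feel of what the date-detection helpers above produce, a small hedged sketch (the sample vectors are made up and the printed formats are indicative):

# illustrative sketch only
formats <- getAllowedDateFormats()
head(names(formats))     # regexes tried against a sample of values; the first one matches a bare 4-digit year
head(unlist(formats))    # corresponding strptime formats, e.g. "%Y", "%Y%m%d", "%Y %m %d"

getDateFormat(c("March 2010", "December 2011"))   # all values match the month-name/year pattern -> "%B %Y"
getDateFormat(c("apples", "oranges"))             # nothing matches -> NULL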
/FetchData/GenerateFlightStats.R:
--------------------------------------------------------------------------------
1 | #
2 | # Generate Flight Stats
3 | #
4 | initForStats <- function(){
5 | assign("flights.folder.path", "./pads/raw-data/flights/", envir=.GlobalEnv)
6 | assign("verbose", T, envir=.GlobalEnv)
7 | flights.files <- list.files(path=flights.folder.path, pattern="*.csv.gz")
8 | assign("flights.files", flights.files, envir=.GlobalEnv)
9 | assign("period", length(flights.files), envir=.GlobalEnv)
10 | }
11 |
12 |
13 | getFlightsStatusByAirlines <- function(flights, yr){
14 | #
15 | # get flights stats By airlines
16 | #
17 | if(verbose) cat("Getting stats for airlines:", '\n')
18 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)),
19 | flights=length(origin),
20 | flights_cancelled=sum(cancelled, na.rm=T),
21 | flights_diverted=sum(diverted, na.rm=T),
22 | flights_departed_late=length(which(depdelay > 0)),
23 | flights_arrived_late=length(which(arrdelay > 0)),
24 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
25 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
26 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
27 | miles_traveled=sum(distance, na.rm=T)
28 | ),
29 | by=uniquecarrier][, year:=yr]
30 | #change col order
31 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)]))
32 | #save this data
33 | saveData(airlines.stats, paste(flights.folder.path, "stats/5/airlines_stats_", yr, ".csv", sep=""))
34 | #clear up space
35 | rm(airlines.stats)
36 |
37 | #
38 | # get flights stats By airlines by month
39 | #
40 | if(verbose) cat("Getting stats for airlines by month:", '\n')
41 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)),
42 | flights=length(origin),
43 | flights_cancelled=sum(cancelled, na.rm=T),
44 | flights_diverted=sum(diverted, na.rm=T),
45 | flights_departed_late=length(which(depdelay > 0)),
46 | flights_arrived_late=length(which(arrdelay > 0)),
47 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
48 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
49 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
50 | miles_traveled=sum(distance, na.rm=T)
51 | ),
52 | by=list(uniquecarrier, month)][, year:=yr]
53 | #change col order
54 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)]))
55 | #save this data
56 | saveData(airlines.stats, paste(flights.folder.path, "stats/6/airlines_stats_monthly_", yr, ".csv", sep=""))
57 | #clear up space
58 | rm(airlines.stats)
59 |
60 | }
61 |
62 | getFlightsStatsByAirport <- function(flights, yr){
63 | #
64 | # get flights stats By airport
65 | #
66 | if(verbose) cat("Getting stats for airport:", '\n')
67 | airport.stats <- flights[, list(flights=length(uniquecarrier),
68 | flights_cancelled=sum(cancelled, na.rm=T),
69 | flights_departed_late=length(which(depdelay > 0)),
70 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
71 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
72 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)]))
73 | ),
74 | by=origin][, year:=yr]
75 | #change col order
76 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)]))
77 | #save this data
78 | saveData(airport.stats, paste(flights.folder.path, "stats/3/airport_stats_", yr, ".csv", sep=""))
79 | #clear up space
80 | rm(airport.stats)
81 |
82 | #
83 | # get flights stats By airport by month
84 | #
85 | if(verbose) cat("Getting stats for airport by month:", '\n')
86 | airport.stats <- flights[, list(flights=length(uniquecarrier),
87 | flights_cancelled=sum(cancelled, na.rm=T),
88 | flights_departed_late=length(which(depdelay > 0)),
89 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
90 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
91 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)]))
92 | ), by=list(origin, month)][, year:=yr]
93 | #change col order
94 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)]))
95 | #save this data
96 | saveData(airport.stats, paste(flights.folder.path, "stats/4/airport_stats_monthly_", yr, ".csv", sep=""))
97 | #clear up space
98 | rm(airport.stats)
99 | }
100 |
101 | getFlightStatsForYear <- function(flights, yr){
102 | #
103 | # starting with flights data at aggregate level for every year
104 | #
105 | if(verbose) cat("Getting flight stats: ", '\n')
106 | flights.stats <- flights[, list(airlines = length(unique(uniquecarrier)),
107 | flights=length(uniquecarrier),
108 | flights_cancelled=sum(cancelled, na.rm=T),
109 | flights_diverted=sum(diverted, na.rm=T),
110 | flights_departed_late=length(which(depdelay > 0)),
111 | flights_arrived_late=length(which(arrdelay > 0)),
112 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
113 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
114 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
115 | miles_traveled=sum(distance, na.rm=T),
116 | dep_airports=length(unique(origin)),
117 | arr_airports=length(unique(dest)),
118 | all_airports=length(union(unique(origin), unique(dest))),
119 | flights_delayed_reason_carrier=sum(!is.na(carrierdelay)),
120 | flights_delayed_reason_weather=sum(!is.na(weatherdelay)),
121 | flights_delayed_reason_security=sum(!is.na(securitydelay))
122 | )][, year:=yr]
123 | #save this data
124 | saveData(flights.stats, paste(flights.folder.path, "stats/1/flights_stats_", yr, ".csv", sep=""))
125 | #clear up space
126 | rm(flights.stats)
127 |
128 | #
129 | # get flights stats By month for every year
130 | #
131 | if(verbose) cat("Getting flight stats by month: ", '\n')
132 | flights.stats.month <- flights[, list(airlines = length(unique(uniquecarrier)),
133 | flights=length(uniquecarrier),
134 | flights_cancelled=sum(cancelled, na.rm=T),
135 | flights_diverted=sum(diverted, na.rm=T),
136 | flights_departed_late=length(which(depdelay > 0)),
137 | flights_arrived_late=length(which(arrdelay > 0)),
138 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
139 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
140 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
141 | miles_traveled=sum(distance, na.rm=T)
142 | ),
143 | by=month][, year:=yr]
144 | #change col order
145 | setcolorder(flights.stats.month, c("year", colnames(flights.stats.month)[-ncol(flights.stats.month)]))
146 | #save this data
147 | saveData(flights.stats.month, paste(flights.folder.path, "stats/2/flights_stats_by_month_", yr, ".csv", sep=""))
148 | #clear up space
149 | rm(flights.stats.month)
150 |
151 | }
152 |
153 | mapFlightStats <- function(){
154 | for(j in 1:period) {
155 | if( j > 2) {
156 | yr <- as.integer(gsub("[^0-9]", "", gsub("(.*)(\\.csv)", "\\1", flights.files[j])))
157 | flights.data.file <- paste(flights.folder.path, flights.files[j], sep="")
158 | if(verbose) cat(yr, ": Reading : ", flights.data.file, "\n")
159 | #flights.data.file <- "./pads/raw-data/flights/1987.csv.gz"
160 | flights <- data.table(read.csv(flights.data.file, stringsAsFactors=F))
161 | col.names <- colnames(flights)
162 | setnames(flights, col.names, tolower(col.names))
163 | flights <- flights[, list(year, month, uniquecarrier, origin,
164 | dest, cancelled, diverted, depdelay,
165 | arrdelay, distance, carrierdelay,
166 | weatherdelay,securitydelay)]
167 | setkeyv(flights, c("year", "uniquecarrier", "dest", "origin", "month"))
168 | if(verbose) cat("Starting analysis on: ", yr, "\n")
169 | getFlightStatsForYear(flights, yr)
170 | getFlightsStatusByAirlines(flights, yr)
171 | getFlightsStatsByAirport(flights, yr)
172 | }
173 | }
174 | }
175 |
176 |
177 |
178 | #
179 | #reduce all results
180 | #
181 | reduceFlightStats <- function(){
182 | n <- 1:6
183 | folder.path <- paste("./pads/raw-data/flights/stats/", n, "/", sep="")
184 | print(folder.path)
185 | for(i in n){
186 | filenames <- paste(folder.path[i], list.files(path=folder.path[i], pattern="*.csv"), sep="")
187 | dt <- do.call("rbind", lapply(filenames, read.csv, stringsAsFactors=F))
188 | print(nrow(dt))
189 | saveData(dt, paste("./pads/raw-data/flights/stats/", i, ".csv", sep=""))
190 | }
191 | }
192 |
193 | #
194 | # Run this job - initialize, generate stats for individual years and then aggregate them together
195 | # to get a single file each for flights, airports and airlines
196 | #
197 | runJob <- function(){
198 | initForStats()
199 | #mapFlightStats()
200 | #reduceFlightStats()
201 | }
--------------------------------------------------------------------------------
/flights_map_reduce.R:
--------------------------------------------------------------------------------
1 | # Authors: Jitender Aswani, Co-Founder @ datadolph.in
2 | # Date: 2013-15-5
3 | # Description: The Airline data set consists of flight arrival and departure details for all commercial flights from 1987 to 2008.
4 | # The approximately 120MM records (CSV format) occupy 12GB space. Data can be downloaded from here: http://stat-computing.org/dataexpo/2009/
5 | # This R code simulates Map-Reduce functionality to analyze 22 years of historical data on flights.
6 | # Packages Used: data.table & plyr
7 | # Blog Reference: http://blog.datadolph.in/2013/06/big-data-analysis-performance-story-of-chicago-ohare-airport/
8 | # Blog Reference: http://allthingsr.blogspot.com/2013/06/simulating-map-reduce-in-r-for-big-data.html
9 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
10 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
11 | # All rights reserved.
12 |
13 | #
14 | # initialize and read all 22 compressed CSV files
15 | #
16 | initForStats <- function(){
17 | assign("flights.folder.path", "./raw-data/flights/", envir=.GlobalEnv)
18 | assign("verbose", T, envir=.GlobalEnv)
19 | flights.files <- list.files(path=flights.folder.path, pattern="*.csv.gz")
20 | assign("flights.files", flights.files, envir=.GlobalEnv)
21 | assign("period", length(flights.files), envir=.GlobalEnv)
22 | }
23 |
24 | #
25 | # get flights stats By airlines
26 | #
27 | getFlightsStatusByAirlines <- function(flights, yr){
28 | #
29 | # by Year
30 | #
31 | if(verbose) cat("Getting stats for airlines:", '\n')
32 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)),
33 | flights=length(origin),
34 | flights_cancelled=sum(cancelled, na.rm=T),
35 | flights_diverted=sum(diverted, na.rm=T),
36 | flights_departed_late=length(which(depdelay > 0)),
37 | flights_arrived_late=length(which(arrdelay > 0)),
38 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
39 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
40 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
41 | miles_traveled=sum(distance, na.rm=T)
42 | ),
43 | by=uniquecarrier][, year:=yr]
44 | #change col order
45 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)]))
46 | #save this data
47 | saveData(airlines.stats, paste(flights.folder.path, "stats/5/airlines_stats_", yr, ".csv", sep=""))
48 | #clear up space
49 | rm(airlines.stats)
50 |
51 | #
52 | # by month
53 | #
54 | if(verbose) cat("Getting stats for airlines by month:", '\n')
55 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)),
56 | flights=length(origin),
57 | flights_cancelled=sum(cancelled, na.rm=T),
58 | flights_diverted=sum(diverted, na.rm=T),
59 | flights_departed_late=length(which(depdelay > 0)),
60 | flights_arrived_late=length(which(arrdelay > 0)),
61 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
62 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
63 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
64 | miles_traveled=sum(distance, na.rm=T)
65 | ),
66 | by=list(uniquecarrier, month)][, year:=yr]
67 | #change col order
68 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)]))
69 | #save this data
70 | saveData(airlines.stats, paste(flights.folder.path, "stats/6/airlines_stats_monthly_", yr, ".csv", sep=""))
71 | #clear up space
72 | rm(airlines.stats)
73 |
74 | }
75 |
76 | #
77 | # get flights stats By airport
78 | #
79 | getFlightsStatsByAirport <- function(flights, yr){
80 | #
81 | # by year
82 | #
83 | if(verbose) cat("Getting stats for airport:", '\n')
84 | airport.stats <- flights[, list(flights=length(uniquecarrier),
85 | flights_cancelled=sum(cancelled, na.rm=T),
86 | flights_departed_late=length(which(depdelay > 0)),
87 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
88 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
89 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)]))
90 | ),
91 | by=origin][, year:=yr]
92 | #change col order
93 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)]))
94 | #save this data
95 | saveData(airport.stats, paste(flights.folder.path, "stats/3/airport_stats_", yr, ".csv", sep=""))
96 | #clear up space
97 | rm(airport.stats)
98 |
99 | #
100 | # by month
101 | #
102 | if(verbose) cat("Getting stats for airport by month:", '\n')
103 | airport.stats <- flights[, list(flights=length(uniquecarrier),
104 | flights_cancelled=sum(cancelled, na.rm=T),
105 | flights_departed_late=length(which(depdelay > 0)),
106 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
107 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
108 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)]))
109 | ), by=list(origin, month)][, year:=yr]
110 | #change col order
111 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)]))
112 | #save this data
113 | saveData(airport.stats, paste(flights.folder.path, "stats/4/airport_stats_monthly_", yr, ".csv", sep=""))
114 | #clear up space
115 | rm(airport.stats)
116 | }
117 |
118 | #
119 | # get flights stats
120 | #
121 | getFlightStatsForYear <- function(flights, yr){
122 | #
123 | # for every year
124 | #
125 | if(verbose) cat("Getting flight stats: ", '\n')
126 | flights.stats <- flights[, list(airlines = length(unique(uniquecarrier)),
127 | flights=length(uniquecarrier),
128 | flights_cancelled=sum(cancelled, na.rm=T),
129 | flights_diverted=sum(diverted, na.rm=T),
130 | flights_departed_late=length(which(depdelay > 0)),
131 | flights_arrived_late=length(which(arrdelay > 0)),
132 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
133 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
134 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
135 | miles_traveled=sum(distance, na.rm=T),
136 | dep_airports=length(unique(origin)),
137 | arr_airports=length(unique(dest)),
138 | all_airports=length(union(unique(origin), unique(dest))),
139 | flights_delayed_reason_carrier=sum(!is.na(carrierdelay)),
140 | flights_delayed_reason_weather=sum(!is.na(weatherdelay)),
141 | flights_delayed_reason_security=sum(!is.na(securitydelay))
142 | )][, year:=yr]
143 | #save this data
144 | saveData(flights.stats, paste(flights.folder.path, "stats/1/flights_stats_", yr, ".csv", sep=""))
145 | #clear up space
146 | rm(flights.stats)
147 |
148 | #
149 | # by month for every year
150 | #
151 | if(verbose) cat("Getting flight stats by month: ", '\n')
152 | flights.stats.month <- flights[, list(airlines = length(unique(uniquecarrier)),
153 | flights=length(uniquecarrier),
154 | flights_cancelled=sum(cancelled, na.rm=T),
155 | flights_diverted=sum(diverted, na.rm=T),
156 | flights_departed_late=length(which(depdelay > 0)),
157 | flights_arrived_late=length(which(arrdelay > 0)),
158 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
159 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
160 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
161 | miles_traveled=sum(distance, na.rm=T)
162 | ),
163 | by=month][, year:=yr]
164 | #change col order
165 | setcolorder(flights.stats.month, c("year", colnames(flights.stats.month)[-ncol(flights.stats.month)]))
166 | #save this data
167 | saveData(flights.stats.month, paste(flights.folder.path, "stats/2/flights_stats_by_month_", yr, ".csv", sep=""))
168 | #clear up space
169 | rm(flights.stats.month)
170 |
171 | }
172 |
173 | #
174 | #map all calculations
175 | #
176 | mapFlightStats <- function(){
177 | for(j in 1:period) {
178 | if( j > 2) {
179 | yr <- as.integer(gsub("[^0-9]", "", gsub("(.*)(\\.csv)", "\\1", flights.files[j])))
180 | flights.data.file <- paste(flights.folder.path, flights.files[j], sep="")
181 | if(verbose) cat(yr, ": Reading : ", flights.data.file, "\n")
182 | flights <- data.table(read.csv(flights.data.file, stringsAsFactors=F))
183 | col.names <- colnames(flights)
184 | setnames(flights, col.names, tolower(col.names))
185 | flights <- flights[, list(year, month, uniquecarrier, origin,
186 | dest, cancelled, diverted, depdelay,
187 | arrdelay, distance, carrierdelay,
188 | weatherdelay,securitydelay)]
189 | setkeyv(flights, c("year", "uniquecarrier", "dest", "origin", "month"))
190 | if(verbose) cat("Starting analysis on: ", yr, "\n")
191 | getFlightStatsForYear(flights, yr)
192 | getFlightsStatusByAirlines(flights, yr)
193 | getFlightsStatsByAirport(flights, yr)
194 | }
195 | }
196 | }
197 |
198 |
199 | #
200 | #reduce all results
201 | #
202 | reduceFlightStats <- function(){
203 | n <- 1:6
204 | folder.path <- paste("./raw-data/flights/stats/", n, "/", sep="")
205 | print(folder.path)
206 | for(i in n){
207 | filenames <- paste(folder.path[i], list.files(path=folder.path[i], pattern="*.csv"), sep="")
208 | dt <- do.call("rbind", lapply(filenames, read.csv, stringsAsFactors=F))
209 | print(nrow(dt))
210 | saveData(dt, paste("./raw-data/flights/stats/", i, ".csv", sep=""))
211 | }
212 | }
213 |
214 | #
215 | # Run this job - initialize, generate stats for individual years and then aggregate them together
216 | # to get single file for flights, airports and airlines
217 | #
218 | runJob <- function(){
219 | initForStats()
220 | mapFlightStats()
221 | reduceFlightStats()
222 | }
--------------------------------------------------------------------------------
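How the job above is meant to be driven, sketched under the assumptions already stated in the header comment (data.table/plyr installed, the yearly *.csv.gz files downloaded, Util.R sourced for saveData()); the directory layout follows the hard-coded paths:

# illustrative sketch only
#   ./raw-data/flights/1987.csv.gz ... 2008.csv.gz              <- one gzipped CSV per year
#   ./raw-data/flights/stats/1/ ... ./raw-data/flights/stats/6/ <- per-year "map" output, one folder per stat type
require(data.table)
source("Util.R")                 # saveData() and the other helpers
source("flights_map_reduce.R")
runJob()                         # initForStats(), then mapFlightStats() ("map"), then reduceFlightStats() ("reduce")
# the reduce step writes ./raw-data/flights/stats/1.csv ... 6.csv with all years stacked together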
/Util.R:
--------------------------------------------------------------------------------
1 | # Utility Functions
2 | # Author: Jitender Aswani, Co-Founder @datadolph.in
3 | # Date: 3/15/2013
4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
6 | # All rights reserved.
7 |
8 |
9 |
10 | #
11 | #open a log file with today's date
12 | #
13 | openLogfile <- function(){
14 | file.log <- file( paste("logs/pads_status.log",dataset, Sys.Date(), sep="_"), 'w')
15 | #initialize log file
16 | assign("file.log", file.log, envir=.GlobalEnv)}
17 |
18 | #
19 | #close the log file
20 | #
21 | closeLogfile <- function(){
22 | close(file.log)
23 | }
24 |
25 | #
26 | # log a message in a vector
27 | #
28 | logMessage <- function(mesg){
29 | #tryCatch(cat(paste(Sys.Date(), mesg, sep=":"), file = file.log),
30 | # error=function(e){
31 | # openLogfile()
32 | # cat(paste(Sys.Date(), mesg, sep=":"), file = file.log)
33 | # },
34 | # finally={
35 | # }
36 | #)
37 | #cat("\n", file = file.log)
38 | vec.log[length(vec.log)+1] <<- paste(Sys.Date(), mesg, sep=":")
39 | }
40 |
41 | #
42 | # dump logs to the file system
43 | #
44 | dumpLogs <- function(){
45 | openLogfile()
46 |   cat(paste(vec.log, collapse="\n"), file = file.log)
47 | closeLogfile()
48 | }
49 |
50 | #
51 | # Clean Meta files folder
52 | #
53 | cleanMetaFiles <- function(){
54 | folder.meta <- "./pads/meta/"
55 | meta.files <- list.files(path=folder.meta, pattern="*.json")
56 | for(i in meta.files) {
57 | meta.file.name <- paste(folder.meta, i, sep="")
58 | #delete the file
59 | unlink(meta.file.name)
60 | }
61 | }
62 |
63 | #
64 | # Clean csv files folder
65 | #
66 | cleanDataFiles <- function(){
67 | folder.data <- "./pads/data/"
68 | data.files <- list.files(path=folder.data, pattern="*.csv")
69 | for(i in data.files) {
70 | data.file.name <- paste(folder.data, i, sep="")
71 | #delete the file
72 | unlink(data.file.name)
73 | }
74 | }
75 |
76 | #
77 | # Clean cache files folder
78 | #
79 | cleanCacheFiles <- function(){
80 | folder.cache <- "./pads/cache/"
81 | cache.files <- list.files(path=folder.cache, pattern="*.rdata")
82 | for(i in cache.files) {
83 | cache.file.name <- paste(folder.cache, i, sep="")
84 | #delete the file
85 | unlink(cache.file.name)
86 | }
87 | }
88 |
89 | #
90 | #read a csv file
91 | #
92 | readFile <- function(file.location) {
93 | out <- tryCatch(
94 | read.csv(file.location, as.is=TRUE, header=TRUE, stringsAsFactors=FALSE, strip.white=TRUE),
95 | error=function(e) {
96 |       message(paste("Unable to read the file:", file.location))
97 | message("Error message:")
98 | message(e)
99 | # Choose a return value in case of error
100 | return(NA)
101 | },
102 | finally={
103 | #message(paste("Processed URL:", url))
104 | #message("Some other message at the end")
105 | }
106 | )
107 | return(out)
108 | }
109 |
110 | #
111 | #write a csv file
112 | #
113 | saveData <- function(data, file.location){
114 | write.csv(data, file.location, row.names=F)
115 | }
116 |
117 |
118 | #
119 | # remove leading/trailing spaces from a data.frame
120 | #
121 | trimData <- function(data){
122 | return(data.frame(lapply(data, function(x) gsub("(^ +)|( +$)", "", x)), stringsAsFactors=F))
123 | }
124 |
125 | #
126 | # replace NA in every column with zeros
127 | #
128 | replaceNAWithZeros <- function(data) {
129 | data[is.na(data)] <- 0 # replace NA with 0s
130 | return(data)
131 | }
132 |
133 | #
134 | # replace NULL in every cell with zeros
135 | #
136 | replaceNULLWithZeros <- function(data) {
137 | data[is.null(data)] <- 0 # replace null with 0s
138 | return(data)
139 | }
140 |
141 | #
142 | # replace NULL in every cell with NA
143 | #
144 | replaceNULLWithNA <- function(data) {
145 |   data[is.null(data)] <- NA # replace null with NA
146 | return(data)
147 | }
148 |
149 | #
150 | #get row wise json
151 | #
152 | getRowWiseJson <- function (jsonDT) {
153 | require("RJSONIO")
154 | row.json <- apply(jsonDT, 1, toJSON)
155 | json.st <- paste('[', paste(row.json, collapse=', '), ']')
156 | return (json.st)
157 | }
158 |
159 | #
160 | #get unique ID for pads
161 | #
162 | getPadUID <- function(){
163 | systime <- as.character(unclass(Sys.time()))
164 | return(paste("pad", substr(systime, 1,10), substr(systime, 12,16), sep=""))
165 | }
166 |
167 | #
168 | #get GUID
169 | #
170 | getGUID <- function() {
171 | baseuuid <- paste(sample(c(letters[1:6],0:9),30,replace=TRUE),collapse="")
172 |
173 | paste(
174 | substr(baseuuid,1,8),
175 | "-",
176 | substr(baseuuid,9,12),
177 | "-",
178 | "4",
179 | substr(baseuuid,13,15),
180 | "-",
181 | sample(c("8","9","a","b"),1),
182 | substr(baseuuid,16,18),
183 | "-",
184 | substr(baseuuid,19,30),
185 | sep="",
186 | collapse=""
187 | )
188 | }
189 |
190 | #
191 | #get csv data file url
192 | #
193 | getDataFileURL <- function(id) {
194 | return(paste("./pads/data/",id, ".csv", sep=""))
195 | }
196 |
197 | #
198 | #get JSON data file url
199 | #
200 | getMetaDataFileURL <- function(id) {
201 | return(paste("./pads/meta/",id, ".json", sep=""))
202 | }
203 |
204 | #
205 | #get R.data file url
206 | #
207 | getCacheDataFileURL <- function(id) {
208 | return(paste("./pads/cache/",id, ".rdata", sep=""))
209 | }
210 |
211 | #
212 | # Save the JSON Meta data for a PAD
213 | #
214 | saveMetaData <- function(out.file, jsonSt){
215 | file.out <- file(out.file, 'wt')
216 | cat(jsonSt, file=file.out, fill = TRUE)
217 | close(file.out)
218 | }
219 |
220 | #
221 | #replace meta chars from an array of col names
222 | #
223 | replaceMetaChars <- function(col.names) {
224 | return(gsub("[^A-Za-z0-9]+", "_", col.names))
225 | }
226 |
227 | #
228 | #remove meta chars from an array of values
229 | #
230 | removeMetaChars <- function(values) {
231 | return(gsub("[^A-Za-z0-9 ]+", "", values))
232 | }
233 |
234 | #
235 | #remove non-numeric chars from an array of values
236 | #
237 | removeNonNumericChars <- function(values) {
238 |   return(gsub("[^0-9.]+", "", values))
239 | }
240 |
241 |
242 | #
243 | #remove meta from numeric columns
244 | #
245 | removeMetaFromNumeric <- function(values) {
246 | #as.numeric( gsub("(,+)|(%+)|(\\$+)","",x) )
247 | return(as.numeric(gsub("[^0-9.]","",values)))
248 | }
249 |
250 | #
251 | #camel case
252 | #
253 | tocamel <- function(s, strict = FALSE) {
254 | cap <- function(s) paste(toupper(substring(s,1,1)),
255 | {s <- substring(s,2); if(strict) tolower(s) else s},
256 | sep = "", collapse = " " )
257 | sapply(strsplit(s, split = " "), cap, USE.NAMES = !is.null(names(s)))
258 | }
259 |
260 |
261 | #
262 | # clean a name
263 | #
264 | cleanName <- function(name) {
265 | clean1 <-gsub("[,]*[ ]*[&]*[ ]+(the|inc|llc|llp|ltd|limited|lp|ll|affiliates|incorporated|incorporation|corp|co|corp|corporation|stores|international|n\\.a|u\\.s\\.a)*[\\.]*[,]*$", "", name, ignore.case = T)
266 | clean2 <- gsub("[,]*[ ]*[&]*[ ]+(the|inc|llc|llp|ltd|limited|lp|affiliates|incorporated|incorporation|corp|co|corp|corporation|stores|international|n\\.a|u\\.s\\.a)*[\\.]*[,]*$", "", clean1, ignore.case = T)
267 | clean3 <- gsub("[&]", "and", clean2)
268 | clean4 <- gsub("-", "", clean3)
269 | #clean5 <- gsub("[^a-zA-Z]", "_" , clean4)
270 | #clean5 <- gsub("(_)+", "_" , unique(clean4))
271 | return (clean4)
272 | }
273 |
274 | #
275 | # Get Class function to return a single class for every object
276 | #
277 | getClass <- function(obj) {
278 | obj.class <- class(obj)
279 | if(length(obj.class) > 1)
280 | return(obj.class[2])
281 | else
282 | return(obj.class)
283 | }
284 |
285 | #
286 | # Generate solr doc from pad meta
287 | #
288 | generateSolrDocFromPADMeta <- function(meta.data) {
289 | generateSolrDoc(meta.data$id, meta.data$title, meta.data$category,
290 | meta.data$subcategory,meta.data$tags)
291 | }
292 |
293 | #
294 | # Generate solr doc and append it to a solr file for later processing
295 | #
296 | generateSolrDoc <- function(id, title, category, subcategory, tags) {
297 | file.out <- file("./pads/solr-docs.json", "r+")
298 | lines <- readLines(file.out)
299 |   solr.index.docs <- list()
300 |   if(length(lines) != 0) solr.index.docs <- fromJSON(paste(lines, collapse=""))
301 | doc <- list("type"="pad", "id" = id, "title" = title, "category"=category,
302 | "subcategory"=subcategory, "tags"=tags)
303 | solr.index.docs[[length(solr.index.docs)+1]] <- doc
304 | cat(toJSON(solr.index.docs), file=file.out, fill = TRUE)
305 | close(file.out)
306 | }
307 |
308 | #
309 | # Save solr.index.doc to file system for later processing
310 | #
311 | persistSolrIndex <- function(solr.index.docs, source.name="") {
312 | file.out <- file(paste("./pads/solr-docs-", source.name, ".json", sep=""), "w")
313 | cat(toJSON(solr.index.docs), file=file.out, fill = TRUE)
314 | close(file.out)
315 | }
316 |
317 | #
318 | # Update multiple pads in a System PADS table
319 | #
320 | updateSystemPads <- function(file="./pads/QualifiedPADS.csv") {
321 | pads <- read.csv(file, stringsAsFactors=FALSE)
322 | for(i in 1:nrow(pads)) {
323 | print(paste("#", i, "Updating...", pads$id[i], sep=" "))
324 | updateSystemPad(pads$id[i], pads$title[i], pads$desc[i], pads$category[i],
325 | pads$subcategory[i], pads$src[i], pads$tags[i])
326 |
327 | ###### Update solr document
328 |     generateSolrDoc(pads$id[i], pads$title[i], pads$category[i], pads$subcategory[i],
329 |                     pads$tags[i])
330 | }
331 | }
332 |
333 | #
334 | # get age
335 | #
336 | getAge <- function(dob, as.of.when, units){
337 | switch(units,
338 | w = {# weeks
339 | return(difftime(strptime(as.of.when, format = "%Y/%m/%d"),
340 | strptime(dob, format = "%Y/%m/%d"),units="weeks"))
341 | },
342 | m = {
343 | # months
344 | return((as.yearmon(strptime(as.of.when, format = "%Y/%m/%d"))-
345 | as.yearmon(strptime(dob, format = "%Y/%m/%d")))*12)
346 | },
347 | y = {
348 | return(year(strptime(as.of.when, format = "%Y/%m/%d"))-
349 | year(strptime(dob, format = "%Y/%m/%d")))
350 | },
351 | )
352 | }
353 | #
354 | # get time period for a given vector of dates
355 | #
356 | getTimePeriod <- function (date.vector){
357 | yr <- year(date.vector)
358 | min.year <- min(yr, na.rm=T)
359 | max.year <- max(yr, na.rm=T)
360 | time.period <- paste(min.year, max.year, sep="-")
361 | return(time.period)
362 | }
363 |
364 | #
365 | # escape meta chars
366 | #
367 | escapeMetaChars <- function(values){
368 | return(gsub("[^A-Za-z0-9 ]+", "", values))
369 | }
370 |
371 | #
372 | # remove special chars
373 | #
374 | removeSpecialChars <- function (node) {
375 | #val = gsub("[[:space:]]", "", xmlValue(node))
376 | val = gsub("[\xc2\xa0]", "", xmlValue(node))
377 | }
378 |
379 | #
380 | # set class num.with.dollar
381 | #
382 | setClass("num.with.dollar")
383 | setAs("character", "num.with.dollar",
384 | function(from) as.numeric(gsub("$","",from, fixed=TRUE)))
385 |
386 | #
387 | # set class num.with.commas
388 | #
389 | setClass("num.with.commas")
390 | setAs("character", "num.with.commas",
391 | function(from) as.numeric(gsub(",", "", from, fixed=TRUE)))
392 |
393 | #
394 | # set class mixed.dates
395 | # an example - 02009-09-25 September 25, 2009
396 | #
397 | setClass("mixed.dates")
398 | setAs("character", "mixed.dates",
399 | function(from) as.Date(gsub("^0", "", str_extract(from, "^[0-9-]+")), "%Y-%m-%d"))
400 |
401 | #
402 | # reverse string
403 | # strReverse(c("abc", "Statistics"))
404 | #
405 | strReverse <- function(x)
406 | sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
407 |
408 | #
409 | # get older date
410 | #
411 | # x <- mdy(stats$effective_date)
412 | getOlderDate <- function(x, year=1930){
413 | m <- year(x) %% 100
414 | year(x) <- ifelse(m > year %% 100, 1900+m, 2000+m)
415 | x
416 | }
417 |
--------------------------------------------------------------------------------
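The setClass()/setAs() coercions near the end of Util.R follow the standard read.csv colClasses idiom; a short hedged sketch with made-up CSV content:

# illustrative sketch only -- the CSV content is made up
source("Util.R")   # defines the "num.with.dollar" and "num.with.commas" coercions
raw <- textConnection('price,volume\n$19.99,"1,234,567"\n$5.00,"42"')
df  <- read.csv(raw, colClasses=c("num.with.dollar", "num.with.commas"))
str(df)            # both columns come in as plain numbers: 19.99 / 5.00 and 1234567 / 42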
/learning/TwitterMap.R:
--------------------------------------------------------------------------------
1 | #########################################################################################
2 | # An R function to make a personalized map of people you follow and who follow you on twitter.
3 | # R functions Copyright (C) 2011 Jeff Leek (jtleek@gmail.com), and the Simply Statistics Blog
4 | # (http://simplystatistics.tumblr.com, http://twitter.com/simplystats)
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details, see <http://www.gnu.org/licenses/>.
15 | #
16 | #
17 | # These functions depend on the packages: twitteR, maps, geosphere, and RColorBrewer. It will
18 | # attempt to install them if they are not installed when you source this function. Care
19 | # should be used when using this function since the twitteR API has rate limiting in place.
20 | # If you have a large number of followers, or run the function many times, you may be
21 | # rate limited.
22 | #
23 | #
24 | # How to use:
25 | # # Source the function
26 | # source("http://biostat.jhsph.edu/~jleek/code/twitterMap.R")
27 | #
28 | # # Make your twittermap
29 | # twitterMap("simplystats")
30 | #
31 | # #If your location can't be found or latitude longitude can't be calculated
32 | # #choose a bigger city near you. The list of cities used by twitterMap
33 | # #can be found like so:
34 | # data(world.cities)
35 | # grep("Baltimore",world.cities[,1])
36 | #
37 | # # Then make the map using that big city
38 | # twitterMap("simplystats",userLocation="Baltimore")
39 | #
40 | # #If you want both your followers and people you follow in a plot you can do:
41 | # twitterMap("simplystats",plotType="both")
42 | #
43 | ########################################################################################
44 | getPckg <- function(pckg) install.packages(pckg, repos = "http://cran.r-project.org")
45 |
46 | pckg = try(require(twitteR))
47 | if(!pckg) {
48 | cat("Installing 'twitteR' from CRAN\n")
49 | getPckg("twitteR")
50 | require("twitteR")
51 | }
52 |
53 | pckg = try(require(maps))
54 | if(!pckg) {
55 | cat("Installing 'maps' from CRAN\n")
56 | getPckg("maps")
57 | require("maps")
58 | }
59 |
60 | pckg = try(require(geosphere))
61 | if(!pckg) {
62 | cat("Installing 'geosphere' from CRAN\n")
63 | getPckg("geosphere")
64 | require("geosphere")
65 | }
66 |
67 |
68 | pckg = try(require(RColorBrewer))
69 | if(!pckg) {
70 | cat("Installing 'RColorBrewer' from CRAN\n")
71 | getPckg("RColorBrewer")
72 | require("RColorBrewer")
73 | }
74 |
75 |
76 | twitterMap <- function(userName,userLocation=NULL,fileName="twitterMap.pdf",nMax = 1000,plotType=c("followers","both","following")){
77 |
78 | # Get location data
79 | cat("Getting data from Twitter, this may take a moment.\n")
80 | tmp = getUser(userName)
81 | if(is.null(userLocation)){
82 | userLocation = location(tmp)
83 | userLocation = trim(userLocation)
84 | if(nchar(userLocation) < 2){stop("We can not find your location from Twitter")}
85 | }
86 |
87 | followers=tmp$getFollowers(n=nMax)
88 | followersLocation = sapply(followers,function(x){location(x)})
89 | following = tmp$getFriends(n=nMax)
90 | followingLocation = sapply(following,function(x){location(x)})
91 |
92 |
93 | # Load the geographic data
94 | data(world.cities)
95 | data(us.cities)
96 | data(canada.cities)
97 |
98 | # Find the latitude and longitude of the user
99 | cat("Getting geographic (latitude/longitude) of Twitter users.\n")
100 | userLL <- findLatLon(userLocation)$latlon
101 | if(any(is.na(userLL))){stop("We can't find the latitude and longitude of your location from Twitter")}
102 |
103 |
104 | # Find the latitude and longitude of each of the followers/following
105 |   # and calculate the distance to the user
106 |
107 | followersLL = matrix(NA,nrow=length(followers),ncol=4)
108 | followingLL = matrix(NA,nrow=length(following),ncol=4)
109 |
110 | for(i in 1:length(followers)){
111 | if(length(followersLocation[[i]]) > 0){
112 | tmpLL = findLatLon(trim(followersLocation[[i]]))
113 | if(any(!is.na(tmpLL$latlon))){
114 | followersLL[i,] = c(unlist(tmpLL$latlon),distCosine(userLL,tmpLL$latlon),unlist(tmpLL$cont))
115 | }
116 | }
117 | }
118 |
119 | for(i in 1:length(following)){
120 | if(length(followingLocation[[i]]) > 0){
121 | tmpLL = findLatLon(trim(followingLocation[[i]]))
122 | if(any(!is.na(tmpLL$latlon))){
123 | followingLL[i,] = c(unlist(tmpLL$latlon),distCosine(userLL,tmpLL$latlon),unlist(tmpLL$cont))
124 | }
125 | }
126 | }
127 |
128 | followingLL = followingLL[order(-followingLL[,3]),]
129 | followersLL = followersLL[order(-followersLL[,3]),]
130 |
131 | followingLL = followingLL[!is.na(followingLL[,1]),]
132 | followersLL = followersLL[!is.na(followersLL[,1]),]
133 |
134 |
135 | cat("Plotting results.\n")
136 | # Set up the colors
137 | cols = brewer.pal(7,"Set2")
138 |
139 | # Both followers and following
140 | if(plotType=="both"){
141 | pdf(fileName,height=12,width=10)
142 | data(worldMapEnv)
143 | par(mfrow=c(2,1),mar=rep(0,4))
144 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0)
145 |
146 | mtext(paste("@",userName," Follower Map",sep=""),col="lightgrey")
147 | nFollowers = dim(followersLL)[1]
148 | for(i in 1:nFollowers){
149 | greatC = getGreatCircle(userLL,followersLL[i,1:2])
150 | lines(greatC,col=cols[followersLL[i,4]],lwd=0.8)
151 | }
152 |
153 | legend(-180,0,legend = c(paste("Asia",sum(followersLL[,4]==1)),paste("Africa",sum(followersLL[,4]==2)),paste("N. America",sum(followersLL[,4]==3)),paste("S. America",sum(followersLL[,4]==4)),paste("Australia/N.Z.",sum(followersLL[,4]==5)),paste("Europe",sum(followersLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75)
154 |
155 |
156 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0)
157 | mtext(paste("@",userName," Following Map",sep=""),col="lightgrey")
158 | nFollowing = dim(followingLL)[1]
159 | for(i in 1:nFollowing){
160 | greatC = getGreatCircle(userLL,followingLL[i,1:2])
161 | lines(greatC,col=cols[followingLL[i,4]],lwd=0.8)
162 | }
163 |
164 | legend(-180,0,legend = c(paste("Asia",sum(followingLL[,4]==1)),paste("Africa",sum(followingLL[,4]==2)),paste("N. America",sum(followingLL[,4]==3)),paste("S. America",sum(followingLL[,4]==4)),paste("Australia/N.Z.",sum(followingLL[,4]==5)),paste("Europe",sum(followingLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75)
165 |
166 | mtext("Created by @simplystats twitterMap",side=1,adj=1,cex=0.8,col="grey")
167 | dev.off()
168 | }
169 |
170 | ## Just followers
171 | if(plotType=="followers"){
172 | pdf(fileName,height=6,width=10)
173 | data(worldMapEnv)
174 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0)
175 |
176 | mtext(paste("@",userName," Follower Map",sep=""),col="lightgrey")
177 | nFollowers = dim(followersLL)[1]
178 | for(i in 1:nFollowers){
179 | greatC = getGreatCircle(userLL,followersLL[i,1:2])
180 | lines(greatC,col=cols[followersLL[i,4]],lwd=0.8)
181 | }
182 |
183 | legend(-180,0,legend = c(paste("Asia",sum(followersLL[,4]==1)),paste("Africa",sum(followersLL[,4]==2)),paste("N. America",sum(followersLL[,4]==3)),paste("S. America",sum(followersLL[,4]==4)),paste("Australia/N.Z.",sum(followersLL[,4]==5)),paste("Europe",sum(followersLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75)
184 | mtext("Created by @simplystats twitterMap",side=1,adj=1,cex=0.8,col="grey")
185 | dev.off()
186 |
187 | }
188 |
189 | ## Just following
190 | if(plotType=="following"){
191 | pdf(fileName,height=6,width=10)
192 | data(worldMapEnv)
193 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0)
194 | mtext(paste("@",userName," Following Map",sep=""),col="lightgrey")
195 | nFollowing = dim(followingLL)[1]
196 | for(i in 1:nFollowing){
197 | greatC = getGreatCircle(userLL,followingLL[i,1:2])
198 | lines(greatC,col=cols[followingLL[i,4]],lwd=0.8)
199 | }
200 |
201 | legend(-180,0,legend = c(paste("Asia",sum(followingLL[,4]==1)),paste("Africa",sum(followingLL[,4]==2)),paste("N. America",sum(followingLL[,4]==3)),paste("S. America",sum(followingLL[,4]==4)),paste("Australia/N.Z.",sum(followingLL[,4]==5)),paste("Europe",sum(followingLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75)
202 |
203 | mtext("Created by @simplystats twitterMap",side=1,adj=1,cex=0.8,col="grey")
204 | dev.off()
205 |
206 | }
207 |
208 | }
209 |
210 |
211 | findLatLon <- function(loc){
212 | latlon = NA
213 | cont = NA
214 |
215 | # Asia = 1, Africa = 2, North America = 3, South America = 4, Australia/New Zealand = 5, Europe = 6
216 | continents = matrix(NA,nrow=length(unique(world.cities[,2])),ncol=2)
217 | continents[,1] = unique(world.cities[,2])
218 | continents[1:10,2] = c(1,1,1,2,1,1,1,1,1,1)
219 | continents[11:20,2]= c(1,1,2,1,1,2,1,2,2,2)
220 | continents[21:30,2] = c(2,1,6,6,6,6,6,6,6,6)
221 | continents[31:40,2] = c(6,6,6,6,2,4,4,1,2,1)
222 | continents[41:50,2] = c(4,6,1,4,6,1,3,1,6,6)
223 | continents[51:60,2] = c(3,2,4,2,6,1,6,1,3,2)
224 | continents[61:70,2] = c(1,2,2,2,3,6,3,3,6,6)
225 | continents[71:80,2] = c(1,1,2,6,3,4,3,4,6,1)
226 | continents[81:90,2] = c(3,3,3,2,2,6,6,6,6,4)
227 | continents[91:100,2] = c(2,5,2,2,3,1,1,1,1,1)
228 | continents[101:110,2] = c(1,2,1,1,1,3,2,5,1,6)
229 | continents[111:120,2] = c(1,6,1,1,2,6,1,1,6,2)
230 | continents[121:130,2] = c(6,6,6,1,1,3,4,3,4,2)
231 | continents[131:140,2] = c(6,6,2,2,1,1,1,4,1,1)
232 | continents[141:150,2] = c(1,2,2,1,1,1,4,6,6,2)
233 | continents[151:160,2] = c(4,1,1,1,1,2,4,6,2,2)
234 | continents[161:170,2] = c(1,2,2,1,6,2,1,1,6,1)
235 | continents[171:180,2] = c(1,1,1,2,6,2,2,6,1,1)
236 | continents[181:190,2] = c(2,6,2,1,6,6,3,3,3,3)
237 | continents[191:200,2] = c(2,2,2,2,3,2,3,2,3,1)
238 | continents[201:210,2] = c(3,2,2,2,2,2,2,1,6,2)
239 | continents[211:220,2] = c(1,3,1,6,2,4,3,6,3,4)
240 | continents[221:230,2] = c(1,1,1,3,2,3,3,6,1,6)
241 | continents[231:232,2] = c(2,1)
242 |
243 |
244 | # Get the first element of the location
245 | # firstElement = strsplit(loc,"[^[:alnum:]]")[[1]][1]
246 | firstElement = strsplit(loc,",")[[1]][1]
247 | if(is.na(firstElement)){firstElement="zzzzzzzzz"}
248 |
249 | # See if it is a city
250 | tmp = grep(firstElement,world.cities[,1],fixed=TRUE)
251 | tmp2 = grep(firstElement,state.name,fixed=TRUE)
252 | tmp3 = grep(firstElement,world.cities[,2],fixed=TRUE)
253 |
254 | if(length(tmp) == 1){
255 | latlon = world.cities[tmp,c(5,4)]
256 | cont = continents[which(world.cities[tmp,2]==continents[,1]),2]
257 | }else if(length(tmp) > 1){
258 | tmpCities = world.cities[tmp,]
259 | latlon = tmpCities[which.max(tmpCities$pop),c(5,4)]
260 | cont = continents[which(tmpCities[which.max(tmpCities$pop),2]==continents[,1]),2]
261 | }else if(length(tmp2) == 1){
262 | latlon = c(state.center$x[tmp2],state.center$y[tmp2])
263 | cont = 3
264 | }else if(length(tmp3) > 0){
265 | tmpCities = world.cities[tmp3,]
266 | latlon = tmpCities[which.max(tmpCities$pop),c(5,4)]
267 | cont = continents[which(tmpCities[which.max(tmpCities$pop),2]==continents[,1]),2]
268 | }
269 |
270 | return(list(latlon=latlon,cont=as.numeric(cont)))
271 |
272 | }
273 |
274 |
275 | getGreatCircle = function(userLL,relationLL){
276 | tmpCircle = greatCircle(userLL,relationLL)
277 | start = which.min(abs(tmpCircle[,1] - userLL[1,1]))
278 | end = which.min(abs(tmpCircle[,1] - relationLL[1]))
279 | greatC = tmpCircle[start:end,]
280 | return(greatC)
281 | }
282 |
283 | trim <- function (x) gsub("^\\s+|\\s+$", "", x)
--------------------------------------------------------------------------------