├── learning
│   ├── CheckLibraryPath.R
│   ├── TwitterMapUse.R
│   ├── EnvExample.R
│   ├── Plot.R
│   ├── Kmeans.R
│   ├── GoogleVis-Motion.R
│   ├── fileUpload.R
│   ├── GoogleVis-Geo_Table.R
│   ├── RjsonIO.R
│   ├── geoCodeXML.R
│   ├── MasterBlaster.R
│   ├── ListLoadedPads.R
│   ├── PlayingWithEnv.R
│   ├── DQQE-Utility.R
│   ├── ggplot2.R
│   ├── ggplot.R
│   ├── loadPADS.R
│   ├── JSON.R
│   ├── Clinic.R
│   ├── WikiRead.R
│   ├── NYTWebService.R
│   ├── SP500MarketInflection.R
│   ├── VCorpus.R
│   ├── DataParser.R
│   ├── MiscR.R
│   ├── TransformDataSet.R
│   ├── geoCode.R
│   ├── TransformFailedBanks.R
│   ├── SQLQueryDataFrame.R
│   ├── DGQueryEngine.R
│   ├── DGQEWebSockets.R
│   ├── DKOMSentimentAnalysis.R
│   ├── TwiterSentimentAnalysis.R
│   ├── Olympics.R
│   ├── birbal.R
│   ├── TransformMasterBlasterData.R
│   └── TwitterMap.R
├── GenerateMaplightPADS.R
├── README.md
├── FetchData
│   ├── FetchFailedBanksData.R
│   ├── FetchBeerRatings.R
│   ├── FetchRussell2000.R
│   ├── FetchIPLData.R
│   ├── FetchImmigrationData.R
│   └── GenerateFlightStats.R
├── GeneratePADS.R
├── GenerateFredPADS.R
├── GenerateWDI.R
├── MongoFns.R
├── r_solr_integration.R
├── GenerateWDIPADS.R
├── PresedentialElectionsResults.R
├── SentimentAnalysis.R
├── GenerateCrimePADS.R
├── CreatePADS.R
├── ClassifyData.R
├── flights_map_reduce.R
└── Util.R
/learning/CheckLibraryPath.R:
--------------------------------------------------------------------------------
1 | #Check library path
2 | pkg <- installed.packages()
3 | pkg[which(pkg[, 1] == "base"), 2]
--------------------------------------------------------------------------------
/learning/TwitterMapUse.R:
--------------------------------------------------------------------------------
1 | source("http://biostat.jhsph.edu/~jleek/code/twitterMap.R")
2 | twitterMap("spoonan", userLocation="Palo Alto", plotType="both")
--------------------------------------------------------------------------------
/GenerateMaplightPADS.R:
--------------------------------------------------------------------------------
1 | #
2 | # Generate maplight
3 | #
4 |
5 | require(data.table)
6 |
7 | assign("folder.path", "./pads/raw-data/maplight/", envir=.GlobalEnv)
8 | #assign("data.file", "data.csv.gz", envir=.GlobalEnv)
9 | #ml.data.f <- data.table(read.csv(paste(folder.path, data.file, sep="")))
10 |
11 | assign("data.cache", "ml-data.Rdata", envir=.GlobalEnv)
12 | ml.data <- data.table(read.csv(paste(folder.path, data.cache, sep="")))
13 |
--------------------------------------------------------------------------------
/learning/EnvExample.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | e <- new.env()
8 | e$foo <- 42
9 | attach(e, name='datagram')
10 | rm(list=ls()) # Remove all in global env
11 | foo # Still there!
12 | #...and to detach it:
13 | #detach('datagram')
--------------------------------------------------------------------------------
/learning/Plot.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | #setContentType ("image/png")
8 | temp <- tempfile ()
9 | y = rnorm (100)
10 | png (temp, type="cairo")
11 | plot (1:100, y, t='l')
12 | dev.off ()
13 | sendBin (readBin (temp, 'raw', n=file.info(temp)$size))
14 | unlink (temp)
--------------------------------------------------------------------------------
/learning/Kmeans.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | newiris <- iris
8 | newiris$Species <- NULL
9 | kc <- kmeans(newiris, 3)
10 | table(iris$Species, kc$cluster)
11 | plot(newiris[c("Sepal.Length", "Sepal.Width")], col = kc$cluster)
12 | points(kc$centers[,c("Sepal.Length", "Sepal.Width")], col = 1:3, pch = 8, cex=2)
--------------------------------------------------------------------------------
/learning/GoogleVis-Motion.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library("googleVis")
8 |
9 | data(Fruits)
10 | #typeof(Fruits)
11 | M <- gvisMotionChart(Fruits, idvar="Fruit", timevar="Year")
12 | plot(M)
13 |
14 | #Chart Output
15 | #print(M, 'chart')
16 | #cat(M$html$header)
17 | #cat(M$html$chart)
18 | #cat(M$html$footer)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | R
2 | =
3 | In this repository, the datadolph.in team is sharing all of the R codebase it developed to analyze large quantities of data.
4 | The datadolph.in team has benefited tremendously from fellow R bloggers and other open-source communities and is proud to contribute
5 | all of its codebase to the community.
6 |
7 | The codebase includes ETL and interaction scripts for:
8 | - R-Solr Integration
9 | - R-Mongo Interaction
10 | - R-MySQL Interaction
11 | - Fetching, cleansing and transforming data
12 | - Classification (identify column types)
13 | - Default chart generation (based on simple heuristics and matching a dimension with a measure)
14 |
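15 | A minimal illustrative sketch of the column-type idea (the repository's real logic presumably lives in ClassifyData.R; the helper below is hypothetical and not part of it):
16 |
17 | ```R
18 | # Hypothetical helper: guess whether each column is a "measure" (numeric) or a "dimension"
19 | guessColumnTypes <- function(df) {
20 |   sapply(df, function(col) if (is.numeric(col)) "measure" else "dimension")
21 | }
22 | guessColumnTypes(iris)
23 | # Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species
24 | #    "measure"    "measure"    "measure"    "measure"  "dimension"
25 | ```
26 |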
--------------------------------------------------------------------------------
/learning/fileUpload.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | #Set content type for output
8 | setContentType("text/html")
9 | post <- str(POST)
10 | print(post)
11 | cat(post)
12 | cat("
")
13 |
14 | #Files
15 | FileStr <- str(FILES)
16 | cat(FileStr)
17 | cat("
")
18 |
19 | #Or you can use the paste function
20 | cat(paste(FILES,names(FILES)))
21 | cat("
")
22 |
23 | destination <- file.path('/Users/homemac/Toga/tmp',FILES$FirstFile$name)
24 | file.copy(FILES$FirstFile$tmp_name,destination,overwrite=TRUE)
--------------------------------------------------------------------------------
/learning/GoogleVis-Geo_Table.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library("googleVis")
8 | #Geo
9 | G <- gvisGeoChart(Exports, "Country", "Profit", options=list(width=200, height=100))
10 | #Table
11 | T <- gvisTable(Exports, options=list(width=200, height=270))
12 | #Motion
13 | M <- gvisMotionChart(Fruits, "Fruit", "Year", options=list(width=400, height=370))
14 | #Merge Charts
15 | GT <- gvisMerge(G,T, horizontal=FALSE)
16 | #Merge Horizontal
17 | GTM <- gvisMerge(GT, M, horizontal=TRUE, tableOptions="bgcolor=\"#CCCCCC\" cellspacing=10")
18 | plot(GTM)
--------------------------------------------------------------------------------
/learning/RjsonIO.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | setwd("/Users/homemac/Toga/Alto")
8 | library("RJSONIO")
9 | l <- fromJSON('[{"winner":"68694999", "votes":[ {"ts":"Thu Mar 25 03:13:01 UTC 2010", "user":{"name":"Lamur","user_id":"68694999"}}, {"ts":"Thu Mar 25 03:13:08 UTC 2010", "user":{"name":"Lamur","user_id":"68694999"}}], "lastVote":{"timestamp":1269486788526,"user":{"name":"Lamur","user_id":"68694999"}},"startPrice":0}]')
10 | m <- lapply(l[[1]]$votes, function(x) c(x$user$name, x$user$user_id, x$ts))
11 | ma <- matrix(unlist(m), ncol=3, byrow=TRUE)
12 | p <- data.frame(ma)
--------------------------------------------------------------------------------
/learning/geoCodeXML.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | getDocNodeVal <- function(doc, path)
8 | {
9 | sapply(getNodeSet(doc, path), function(el) xmlValue(el))
10 | }
11 |
12 |
13 | getGeoCode <- function(str)
14 | {
15 | library(XML)
16 | str <- gsub(' ','%20',str)  # URL-encode spaces in the address
17 | url <- paste('http://maps.google.com/maps/api/geocode/xml?sensor=false&address=', str, sep="")
18 | response <- xmlTreeParse(url, useInternal=TRUE)
19 |
20 | lat <- getDocNodeVal(response, "/GeocodeResponse/result/geometry/location/lat")
21 | lng <- getDocNodeVal(response, "/GeocodeResponse/result/geometry/location/lng")
22 | return (c(lat,lng))
23 | }
24 |
25 | t <- getGeoCode("Bangalore, India")
--------------------------------------------------------------------------------
/learning/MasterBlaster.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library("RJSONIO")
8 | rm (list = ls())
9 |
10 | # Read SachinTestRecords.csv
11 |
12 | mb <- read.csv("datasets/SachinTestRecords.csv", as.is=TRUE, header=TRUE, stringsAsFactors=FALSE, strip.white=TRUE)
13 | #mb <- read.csv("datasets/SachinTestRecords.csv", header=TRUE,strip.white=TRUE)
14 | nRows <- nrow(mb)
15 | nCols <- ncol(mb)
16 | cNames <- colnames(mb)
17 | cClass <- sapply(mb, class)
18 | iCols <- list(list(name=cNames[1], class=class(mb[,1]), sample=head(mb[,1])))  # flat list of per-column descriptors
19 | for(i in 2:5) {
20 | iCols[[i]] <- list(name=cNames[i], class=class(mb[,i]), sample=head(mb[,i]))
21 | }
22 | metaData <- list(nRows=nRows, nCols=nCols, iCols)
23 | jsonString <- toJSON(metaData)
--------------------------------------------------------------------------------
/learning/ListLoadedPads.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | setContentType("text/html")
8 | #Step 2 - load environment
9 | pads.loaded <- ls(pattern='pad[0-9]', envir=.GlobalEnv)
10 | #pads.loaded <- ls()
11 | #cat(is.environment(.rAenv))
12 | cat("This is a test script. It checks for pre-loaded pads.
Following pads are pre-loaded in this environment:
");
13 | cat(paste("No. of Pads: ", length(pads.loaded), "
", sep=""))
14 | getDetails <- function(x)
15 | {
16 | d <- get(x)
17 | return(paste("PadName:", x, ", Records:", nrow(d), ", Columns:", length(d), ", ColNames:", paste(colnames(d), collapse="; "), ", Col Classes:", paste(lapply(d, class), collapse="||"), sep=""))
18 | }
19 | v <- vapply(pads.loaded, getDetails, "Pad Was Not Found.")
20 | cat(paste(v, collapse="
"))
--------------------------------------------------------------------------------
/learning/PlayingWithEnv.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 | rm (list = ls("PADSEnv"))
7 | padsEnv <- new.env()
8 | padsFile <- "/Users/homemac/pads.json"
9 |
10 | getPADS <- function()
11 | {
12 | library("RJSONIO")
13 | con <- file(padsFile)
14 | data.json <- fromJSON(paste(readLines(con), collapse=""))
15 | close(con)
16 | #Convert the list to a matrix and then to a data frame
17 | m <- matrix(unlist(data.json), ncol=2, byrow=TRUE)
18 | pads <- data.frame(m, stringsAsFactors=FALSE)
19 | colnames(pads) <- c("id", "location")
20 | return (pads)
21 | }
22 |
23 | padsEnv$pads <- getPADS()
24 | #Load all the PADS
25 | sysID <- NULL
26 | for(i in 1:nrow(padsEnv$pads)) {
27 | sysID[i] <- load(padsEnv$pads$location[i], padsEnv)
28 | }
29 | padsEnv$pads$sysID <- sysID
30 | attach(padsEnv, name='PADSEnv')
31 | # All Done .rAenv
32 | rm(list=ls())
33 |
--------------------------------------------------------------------------------
/learning/DQQE-Utility.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | getResult <- function(dt, expr, gby) {
8 | return(dt[,eval(expr), by=eval(gby)])
9 | }
10 | dt <- data.table(iris)
11 | v1 <- "Sepal.Length"
12 | v2 <- "Species"
13 | e <- parse(text = paste("sum(", v1, ", na.rm = TRUE)"))
14 | b <- parse(text = v2)
15 | #rDT2 <- dt[, eval(e), by = eval(b)]
16 | dtR <- getResult(dt = dt, expr = e, gby = b)
17 |
18 |
19 | i <- which(colnames(padDT)==m)
20 | getData(padDT, sum(i, na.rm=TRUE), d)
21 | #getData(padDT, parse(text=paste("sum(", m, ",na.rm=TRUE)", sep="")), d) lapply(padDT, class)
22 |
23 |
24 | getResult <- function(dt, expr, gby) {
25 | e <- substitute(expr)
26 | b <- substitute(gby)
27 | return(dt[,eval(e),by=b])
28 | }
29 | v1 <- "Sepal.Length"
30 | v2 <- "Species"
31 |
32 |
33 | getResult(dt, sum(get(v1), na.rm=TRUE), v2)
34 | #e <- substitute(expr)
35 | #b <- substitute(gby)
36 |
--------------------------------------------------------------------------------
/FetchData/FetchFailedBanksData.R:
--------------------------------------------------------------------------------
1 | library(XML)
2 | library("chron")
3 | library(plyr)
4 | library(stringr)
5 |
6 | source("UtilPADS.R")
7 |
8 | run <- function() {
9 | #col names
10 | col.names <- c("bank_name","city","st","closing_date","acquiring_institution","assets_in_mil_usd")
11 | #wiki url
12 | url <- "http://en.wikipedia.org/wiki/List_of_bank_failures_in_the_United_States_(2008%E2%80%93present)"
13 | # read wiki page
14 | page <- htmlParse(url, encoding="UTF-8")
15 | # read all tables
16 | all.tables <- try(readHTMLTable(page, colClasses=c('numeric', 'character', 'character', 'character', 'mixed.dates',
17 | 'character', 'num.with.commas'), stringsAsFactors = FALSE,
18 | trim=TRUE, elFun = removeSpecialChars))
19 |
20 | #bind all the tables together
21 | s.d <- do.call(rbind, all.tables)
22 | # fix row names
23 | rownames(s.d) <- 1:nrow(s.d)
24 | # remove first column
25 | s.d <- s.d[-c(1)]
26 | # assign column names
27 | colnames(s.d) <- col.names
28 | saveData(s.d, "./pads/raw-data/failed-banks/failed_banks_assets_fdic.csv")
29 | }
30 |
31 | run()
--------------------------------------------------------------------------------
/learning/ggplot2.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | pres <- read.csv("http://www.stanford.edu/~messing/primaryres.csv", as.is=T)
8 |
9 | # sort data in order of percent of vote:
10 | pres <- pres[order(pres$Percentage, decreasing=T), ]
11 |
12 | # only show top 15 candidates:
13 | pres <- pres[1:15,]
14 |
15 | # create a percentage variable
16 | pres$Percentage <- pres$Percentage*100
17 |
18 | # reorder the Candidate factor by percentage for plotting purposes:
19 | pres$Candidate <- reorder(pres$Candidate, pres$Percentage)
20 |
21 | # To install ggplot2, run the following line after deleting the #
22 | #install.packages("ggplot2")
23 |
24 | library(ggplot2)
25 | ggplot(pres, aes(x = Percentage, y = factor(Candidate) )) +
26 | geom_point() +
27 | theme_bw() + opts(axis.title.x = theme_text(size = 12, vjust = .25))+
28 | xlab("Percent of Vote") + ylab("Candidate") +
29 | opts(title = expression("New Hampshire Primary 2012"))
--------------------------------------------------------------------------------
/learning/ggplot.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library (ggplot2); library (reshape2); library (plyr)  # reshape2 for melt(), plyr for ddply()
8 |
9 | v1 <- c(1,2,3,3,4)
10 | v2 <- c(4,3,1,1,9)
11 | v3 <- c(3,5,7,2,9)
12 | gender <- c("m","f","m","f","f")
13 |
14 | d.data <- data.frame (v1, v2, v3, gender)
15 |
16 | molten <- melt(d.data)
17 |
18 | Average <- ddply(molten, c("gender", "variable"), function(z){
19 | c(Mean = mean(z$value))}
20 | )
21 |
22 | ggplot (data=Average, aes(x = variable, y = Mean, fill = gender)) +
23 | geom_bar(stat = "identity", position = position_dodge()) +
24 | coord_flip() +
25 | geom_text (position = position_dodge(0.5),
26 | aes(label=round(Mean, 1)), vjust=0.5, hjust=4,colour="white", size=7)
27 |
28 | ggplot (data=Average, aes(x = variable, y = Mean)) +
29 | geom_bar(stat = "identity") +
30 | geom_text(aes(label=round(Mean, 1)), vjust=0.5,
31 | hjust=4,colour="white", size=7) +
32 | facet_wrap(~gender)
--------------------------------------------------------------------------------
/GeneratePADS.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | # Generate PADS - new way - read a csv file that has the list of pads
8 | #
9 | source("CreatePADS.R")
10 |
11 | #
12 | # generate Pads
13 | #
14 | generatePads <- function() {
15 | #initialize system
16 | initializeSystem()
17 | assign("folder.path", "./pads/raw-data/internet_stats/", envir=.GlobalEnv)
18 | assign("pads.file", "pads_meta.csv", envir=.GlobalEnv)
19 | assign("dataset", "Internet-Stats", envir=.GlobalEnv)
20 | pads <- readFile(paste(folder.path, pads.file, sep=""))
21 | for(i in 1:nrow(pads)){
22 | #prepare pad meta data
23 | series <- list()
24 | series["source"] <- pads$source[i]
25 | series["category"] <- pads$category[i]
26 | series["subcategory"] <- pads$subcategory[i]
27 | series["tags"] <- pads$tags[i]
28 | series["desc"] <- pads$desc[i]
29 | series["title"] <- pads$title[i]
30 | series["pagetag"] <- "internet"
31 | series.data <- trimData(readFile(paste(folder.path, pads$file[i], sep="")))
32 | series.data[2:ncol(series.data)] <- sapply(series.data[2:ncol(series.data)], removeMetaFromNumeric)
33 | padify(series, series.data)
34 | }
35 | #clean up
36 | cleaupSystem()
37 | updateCatPadCount()
38 | }
39 |
40 | generatePads()
41 |
--------------------------------------------------------------------------------
/learning/loadPADS.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | #setContentType("text/html")
8 | setwd("/Users/homemac/Toga")
9 | #read pre-analyzed data-sets file
10 | #padsFile <- "pads/pads.json"
11 | #getPADS <- function()
12 | #{
13 | # library("RJSONIO")
14 | # con <- file(padsFile)
15 | # data.json <- fromJSON(paste(readLines(con), collapse=""))
16 | # close(con)
17 | #Convert the list to a matrix and then to a data frame
18 | # m <- matrix(unlist(data.json), ncol=10, byrow=TRUE)
19 | # pads <- data.frame(m, stringsAsFactors=FALSE)
20 | #colnames(pads) <- c("id", "location")
21 | # return (pads)
22 | #}
23 | #SYS <- new.env(hash = TRUE, size = NA)
24 | #Read pads configuration
25 | #pads <- getPADS()
26 | pads <- data.frame(read.csv("Alto/pads/pads.csv", header=TRUE, sep=",",stringsAsFactors=FALSE ))
27 | for(i in 1:nrow(pads)) {
28 | cat(pads$cache[i])
29 | cat("
")
30 | #Load all the PADS
31 | #cat(pads$cache[i])
32 | #cat("
")
33 | cat(try(load(pads$cache[i], envir=.GlobalEnv), FALSE))
34 | #try(load(pads$cache[i], envir=SYS), FALSE)
35 | #unlink(pads$cache[i])
36 | #cat("
")
37 |
38 | }
39 | cat(ls(pattern="pad[0-9]", envir=.GlobalEnv))
40 | # All Done - All these variables get loaded in .GlobalEnv, run "search" to see.
41 |
42 |
--------------------------------------------------------------------------------
/learning/JSON.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library("rjson")
8 | x <- list( alpha = 1:5, beta = "Bravo", gamma = list(a=1:3, b=NULL), delta = c(TRUE, FALSE) )
9 | json <- toJSON( x )
10 | fromJSON( json )
11 | #named vectors are treated as JSON objects (lists)
12 | toJSON(islands[1:4])
13 | #data.frames must be converted into a list before converting into JSON
14 | plot(cars, pch=2)
15 | json_cars <- toJSON(as.list(cars))
16 | points( data.frame( fromJSON( json_cars ) ), col="red", pch=3 )
17 | #special R types are encoded as strings
18 | testString <- c(1,2,3,4,NA,NaN,Inf,8,9);
19 | toJSON(testString);
20 |
21 | sample_json <- ' { "breakfast" : [ "milk", "fruit loops", "juice" ], "lunch" : [ "left over sushi" ] } '
22 | parser <- newJSONParser()
23 | parser$addData( sample_json )
24 | food <- parser$getObject()
25 | print( food )
26 |
27 | #This is equivalent to using fromJSON( sample_json )
28 | #However, sample_json can be split into several parts:
29 | ### EXAMPLE 2:
30 | part_1 <- '{ "breakfast" : [ "milk", "fruit loops", "juice" ], '
31 | part_2 <- '"lunch" : [ "left over sushi" ]'
32 | part_3 <- '} [1,2,3,4,5]' #close off the first object, and create a 2nd JSON object, which is an array
33 | parser <- newJSONParser()
34 | parser$addData( part_1 )
35 | parser$getObject() #returns NULL - since part_1 isn't complete
36 | parser$addData( part_2 )
37 | parser$getObject() #returns NULL - since part_2 still isn't complete
38 | parser$addData( part_3 )
39 | parser$getObject() #returns the first food object
40 | parser$getObject() #returns the second array
--------------------------------------------------------------------------------
/learning/Clinic.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | ## generate data for medical example
8 | clinical.trial <-
9 | data.frame(patient = 1:100,
10 | age = rnorm(100, mean = 60, sd = 6),
11 | year.enroll = sample(paste("19", 85:99, sep = ""),
12 | 100, replace = TRUE),
13 | treatment = gl(2, 50,
14 | labels = c("Treatment", "Control")),
15 | center = sample(paste("Center", LETTERS[1:5]), 100, replace = TRUE))
16 | ## set some ages to NA (missing)
17 | is.na(clinical.trial$age) <- sample(1:100, 20)
18 | summary(clinical.trial)
19 |
20 | ## a simple example of a table call
21 | table(clinical.trial$center)
22 |
23 | ## a logical vector is created and passed into table
24 | table(clinical.trial$age < 60)
25 | #FALSE TRUE
26 | # 41 39
27 | ## the useNA argument shows the missing values, too
28 | table(clinical.trial$age < 60, useNA = "always")
29 | #FALSE  TRUE  <NA>
30 | #   41    39    20
31 |
32 | ## the table of missing age by center
33 | table(clinical.trial$center, is.na(clinical.trial$age))
34 | # FALSE TRUE
35 | # Center A 16 6
36 | # Center B 8 2
37 | # Center C 23 5
38 | # Center D 20 3
39 | # Center E 13 4
40 | ## centers with most missing ages listed in order
41 | ## highest to lowest
42 | sort(table(clinical.trial$center, is.na(clinical.trial$age))[, 2],
43 | decreasing = TRUE)
44 | #Center A Center C Center E Center D Center B
45 | # 6 5 4 3 2
46 |
47 |
48 | c1 <- cut(clinical.trial$age, breaks = seq(30, 80, by = 10))
49 | table(c1)
--------------------------------------------------------------------------------
/learning/WikiRead.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(XML)
8 | library(RColorBrewer)
9 | library(plyr)
10 | library(quantmod)
11 | library(Heatplus)
12 |
13 | # get the list of symbols
14 | l <- readHTMLTable('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[[2]]
15 | l <- as.vector(l$Ticker)
16 | l <- l[c(-59, -71, -80, -124, -141, -147, -275, -283, -292, -299, -309, -316, -360, -378, -381, -406, -439, -470, -471)]
17 |
18 | getMonthlyReturns <- function(sym) {
19 | y <- to.monthly(getSymbols(sym, auto.assign=FALSE, from='2007-01-01'))
20 | as.vector(ClCl(y)*100)
21 | }
22 |
23 | d <- unlist(llply(l, getMonthlyReturns, .progress="text"))
24 | # bounds at -10% and +10% for visual clarity
25 | d[d < -10] <- -10
26 | d[d > 10] <- 10
27 |
28 | heatmap_2(t(matrix(d, ncol=481)), col=brewer.pal(9, 'PuBu'), Rowv=NA, Colv=NA, do.dendro=c(FALSE,FALSE), scale='none', legend=2, main="S&P 500 since 2007 (monthly returns)")
29 |
30 |
31 |
32 | # http://blog.datapunks.com/2011/10/sp-500-components-heatmap-in-r/
33 |
34 | getWikiFC <- function(d) {
35 | l <- try(readHTMLTable("http://en.wikipedia.org/wiki/Forbes%27_list_of_the_most_valuable_football_clubs",
36 | header=FALSE, skip.rows=1, which=d$tableNo, stringsAsFactors = FALSE))
37 | l$year = d$year
38 | ifelse(d$year > 2010, return(with(l, data.frame(year, club=V2, country=V3, value=V4, revenue=V6))),
39 | return(with(l, data.frame(year, club=V2, country=V3, value=V4, revenue=V7))))
40 | }
41 | d <- data.frame(year=2012:2007, tableNo=2:7)
42 | mostValuedSoccerClubs <- ddply(d, .(year), getWikiFC)
43 |
44 |
45 |
46 |
47 | l <- readHTMLTable("http://en.wikipedia.org/wiki/Forbes%27_list_of_the_most_valuable_football_clubs")[[7]]
--------------------------------------------------------------------------------
/FetchData/FetchBeerRatings.R:
--------------------------------------------------------------------------------
1 | #
2 | # Get beer data
3 | #
4 | # Author: Jitender Aswani, Co-Founder @datadolph.in
5 | # Date: 3/15/2013
6 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
7 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
8 | # All rights reserved.
9 | require(data.table)
10 | startup <- function() {
11 | #initialize system
12 | initializeSystem(0)
13 |
14 | assign("folder.path", "./pads/raw-data/crunchbase/", envir=.GlobalEnv)
15 | assign("comps.file", "cb_companies_june_2013.csv", envir=.GlobalEnv)
16 | assign("comps.rounds.file", "cb_companies_rounds.csv", envir=.GlobalEnv)
17 | assign("investors.file", "cb_investors.csv", envir=.GlobalEnv)
18 |
19 | assign("dataset", "crunchbase", envir=.GlobalEnv)
20 | #assign("verbose", TRUE, envir=.GlobalEnv)
21 |
22 | loadCompaniesStats()
23 | loadCompanyRounds()
24 | loadInvestosStats()
25 |
26 | #prepare pad meta data
27 | series <- list()
28 | series["source"] <- "CrunchBase"
29 | series["category"] <- "Financial Sector"
30 | series["subcategory"] <- "Investment"
31 | series["tags"] <- tolower(paste(series$source, "VC, venture capital, startups, US startups, investments, angel, series-a, series-b, series-c, funding, seed, biotech, ecommerce, enterprise, software, mobile, web", sep=","))
32 | series["pagetag"] <- "crunchbase"
33 | series["desc"] <- "Built using data from CrunchBase extracted on June 6, 2013."
34 | assign("series", series, envir=.GlobalEnv)
35 | }
36 |
37 | assign("folder.path", "./pads/raw-data/beer-data/", envir=.GlobalEnv)
38 | assign("data.file", "beer_reviews.csv.gz", envir=.GlobalEnv)
39 |
40 | beer.data <- data.table(read.csv(paste(folder.path, data.file, sep="")))
41 | setkey(beer.data, beer_beerid)
42 | #get beers with 500 or more reviews
43 | b.d.500 <- beer.data[, list(no_of_reviews=nrow(.SD)), by=beer_beerid][no_of_reviews >= 500]
44 | setkey(b.d.500, beer_beerid)
45 | #merge to get all records of beers which have 500 or more reviews
46 | b.d.500 <- merge(b.d.500, beer.data, all=F)
--------------------------------------------------------------------------------
/FetchData/FetchRussell2000.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(XML)
8 | library(plyr)
9 | library(quantmod)
10 | library(data.table)
11 | library("zoo")
12 |
13 | folder.path = "./raw-data/fin-markets/"
14 | russell <- new.env()
15 | # get the list of symbols from the csv file
16 | ru2000 <- read.csv(paste(folder.path, "russell-2000-components.csv", sep=""), stringsAsFactors=F)
17 | colnames(ru2000) <- c("name", "ticker")
18 | #ru2000$name <- gsub(ru2000$ticker, "", ru2000$name)
19 |
20 | getDailyReturns <- function(sym) {
21 | y <- tryCatch(getSymbols(sym, auto.assign=FALSE, from="2007-01-01"), error = identity)
22 |
23 | if(inherits(y, "error"))
24 | cat("Symbol '", sym, "' not downloadable!\n", sep = "")
25 | else {
26 | y <- tryCatch(cbind(y, OpCl(y)*100), error = identity)
27 | if(inherits(y, "error"))
28 | cat("Symbol '", sym, "' not downloadable!\n", sep = "")
29 | else {
30 | write.csv(data.frame( date=index(y), coredata(y) ),row.names=FALSE, file=paste(sym, ".csv", sep=""))
31 | cat("Sucessfully saved the stock data to %s",sym)
32 | }
33 | }
34 | }
35 |
36 | ru2000.DR <- llply(ru2000$ticker[1109:nrow(ru2000)], getDailyReturns, .progress="text")
37 |
38 |
39 | #transpose d to have tickers as columns and date as rows
40 | ru2000.DR.T <- t(ru2000.DR)
41 | colnames(ru2000.DR.T) <- head(ru2000$ticker)
42 | AACC <- data.frame(seq(as.Date("2007-01-01"), Sys.Date(), by="day"), ru2000.DR.T$AACC)
43 | #SP100.MR <- data.table(SP100.MR)
44 | plot(SP100.MR$AAP, ylab="Apple", main="Monthly Stock Returns")
45 | read.zoo(SP100.MR)
46 | #d <- unlist(llply(l, getMonthlyReturns, .progress="text"))
47 | # bounds at -10% and +10% for visual clarity
48 | d[d < -10] <- -10
49 | d[d > 10] <- 10
50 |
51 | heatmap_2(t(matrix(d, ncol=100)), col=brewer.pal(9, 'PuBu'),
52 | Rowv=NA, Colv=NA, do.dendro=c(FALSE,FALSE), scale='none', legend=2,
53 | main="S&P 100 since 2007 (monthly returns)")
--------------------------------------------------------------------------------
/learning/NYTWebService.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | setwd("/Users/homemac/Toga/Alto")
8 | library(RJSONIO)
9 |
10 | ### set parameters ###
11 | api <- "YOUR_KEY" # API key goes here!!
12 | q <- "health+care+reform" # Query string, use + instead of space
13 | records <- 500 # total number of records to return, note limitations above
14 | # calculate parameter for offset
15 | os <- 0:(records/10-1)
16 |
17 | # read first set of data in
18 | uri <- paste ("http://api.nytimes.com/svc/search/v1/article?format=json&query=", q, "&offset=", os[1], "&fields=date&api-key=", api, sep="")
19 | raw.data <- readLines(uri, warn="F") # get them
20 | res <- fromJSON(raw.data) # tokenize
21 |
22 | dat <- unlist(res$results) # convert the dates to a vector
23 | # read in the rest via loop
24 | for (i in 2:length(os)) {
25 | # concatenate URL for each offset
26 | uri <- paste ("http://api.nytimes.com/svc/search/v1/article?format=json&query=", q, "&offset=", os[i], "&fields=date&api-key=", api, sep="")
27 | raw.data <- readLines(uri, warn="F")
28 | res <- fromJSON(raw.data)
29 | dat <- append(dat, unlist(res$results)) # append
30 | }
31 |
32 | # aggregate counts for dates and coerce into a data frame
33 | cts <- as.data.frame(table(dat))
34 | # establish date range
35 | dat.conv <- strptime(dat, format="%Y%m%d") # need to convert dat into POSIX format for this
36 | daterange <- c(min(dat.conv), max(dat.conv))
37 | dat.all <- seq(daterange[1], daterange[2], by="day")
38 | # all possible days
39 | # compare dates from counts dataframe with the whole data range
40 | # assign 0 where there is no count, otherwise take count
41 | # (take out PSD at the end to make it comparable)
42 | dat.all <- strptime(dat.all, format="%Y-%m-%d")
43 |
44 | # can't seem to compare POSIX objects with %in%, so coerce them to character for this:
45 | freqs <- ifelse(as.character(dat.all) %in% as.character(strptime(cts$dat, format="%Y%m%d")), cts$Freq, 0)
46 |
47 | plot (freqs, type="l", xaxt="n", main=paste("Search term(s):",q), ylab="# of articles", xlab="date")
48 | axis(1, 1:length(freqs), dat.all)
49 | lines(lowess(freqs, f=.2), col = 2)
--------------------------------------------------------------------------------
/learning/SP500MarketInflection.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(XML)
8 | library(plyr)
9 | library(quantmod)
10 | #library(data.table)
11 | #library("zoo")
12 |
13 | # get the list of symbols from Wiki
14 | sp500 <- readHTMLTable('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
15 | which=2, header=FALSE, skip.rows=1)
16 | sp500.tickers <- as.vector(sp500$V1)
17 | #Remove LOW
18 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="LOW")]
19 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="PSX")]
20 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="XYL")]
21 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="TRIP")]
22 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="WPX")]
23 |
24 | sp500.tickers[which(sp500.tickers=="BFB")]="BF-B"
25 |
26 | isFiftyTwoWeekLow <- function(sym) {
27 | #to <- "2012/6/01"
28 | #to <- "2012/4/02"
29 | to <- "2011/10/04"
30 | from <- last(seq(as.Date(to), by="-1 week", length.out=52))
31 | #last(seq(Sys.Date(), by="-1 week", length.out=52))
32 | print(sym)
33 | y <- getSymbols(sym, auto.assign=FALSE, from=from, to=to)
34 | low <- ifelse(Cl(last(y))[[1]] < Cl(seriesLo(last(y, '52 weeks')))[[1]], 1, 0)
35 | return (low)
36 | }
37 | #Percent of stocks trading at 52 weeks low
38 | sp500.low <- ldply(sp500.tickers, isFiftyTwoWeekLow, .progress="text")
39 | per.stocks.trading.low.Jun1 <- (sum(sp500.low$V1)/length(sp500.low$V1))*100
40 |
41 | #Get SP500 Index data
42 | getSymbols("^gspc")
43 | Cl(last(GSPC))[[1]] #Get Last Close
44 | Cl(seriesLo(last(GSPC, '52 weeks')))[[1]] #Get Low from last 52weeks
45 |
46 | #April 2nd - SP500 closed at 1419, multi-year high - What Percent of stocks trading at 52 weeks low
47 | sp500.low.Apr2 <- ldply(sp500.tickers, isFiftyTwoWeekLow, .progress="text")
48 | per.stocks.trading.low.Apr2 <- (sum(sp500.low.Apr2$V1)/length(sp500.low.Apr2$V1))*100
49 |
50 |
51 | #October 4th - SP500 closed at 1123.95 - What Percent of stocks were trading at 52 weeks low
52 | sp500.low.Oct4 <- ldply(sp500.tickers, isFiftyTwoWeekLow, .progress="text")
53 | per.stocks.trading.low.Oct4 <- (sum(sp500.low.Oct4$V1)/length(sp500.low.Oct4$V1))*100
54 |
55 |
--------------------------------------------------------------------------------
/learning/VCorpus.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(tm)
8 | library(twitteR)
9 |
10 | tweets <- searchTwitter("#DKOM", n=1500 )
11 | df <- do.call("rbind", lapply(tweets, as.data.frame))
12 |
13 | #tweets1 <- searchTwitter("#Netflix", n=1500 )
14 | #df1 <- do.call("rbind", lapply(tweets, as.data.frame))
15 | #dim(df)
16 |
17 | # build a corpus, which is a collection of text documents
18 | # VectorSource specifies that the source is character vectors.
19 | myCorpus <- Corpus(VectorSource(df$text))
20 |
21 | #After that, the corpus needs a couple of transformations, including changing letters to lower case, removing punctuations/numbers and removing stop words. The general English stop-word list is tailored by adding "available" and "via" and removing "r".
22 | myCorpus <- tm_map(myCorpus, tolower)
23 | # remove punctuation
24 | myCorpus <- tm_map(myCorpus, removePunctuation)
25 | # remove numbers
26 | myCorpus <- tm_map(myCorpus, removeNumbers)
27 | # remove stopwords
28 | # keep "r" by removing it from stopwords
29 | myStopwords <- c(stopwords('english'), "available", "via")
30 | idx <- which(myStopwords == "r")
31 | myStopwords <- myStopwords[-idx]
32 | myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
33 |
34 | #Stemming Words
35 | dictCorpus <- myCorpus
36 | # stem words in a text document with the snowball stemmers,
37 | # which requires packages Snowball, RWeka, rJava, RWekajars
38 | myCorpus <- tm_map(myCorpus, stemDocument)
39 | # inspect the first three ``documents"
40 | #inspect(myCorpus[1:3])
41 |
42 | # stem completion
43 | myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus)
44 |
45 | #inspect(myCorpus[1:3])
46 |
47 | #Building a Document-Term Matrix
48 | myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
49 | inspect(myDtm[266:270,31:40])
50 |
51 | #Frequent Terms and Associations
52 | findFreqTerms(myDtm, lowfreq=10)
53 |
54 | # which words are associated with "r"?
55 | #findAssocs(myDtm, 'r', 0.30)
56 |
57 | # which words are associated with "mining"?
58 | # Here "miners" is used instead of "mining",
59 | # because the latter is stemmed and then completed to "miners". :-(
60 | #findAssocs(myDtm, 'miners', 0.30)
61 |
62 |
63 | library(wordcloud)
64 | m <- as.matrix(myDtm)
65 | # calculate the frequency of words
66 | v <- sort(rowSums(m), decreasing=TRUE)
67 | myNames <- names(v)
68 | #k <- which(names(v)=="miners")
69 | #myNames[k] <- "mining"
70 | d <- data.frame(word=myNames, freq=v)
71 | wordcloud(d$word, d$freq, min.freq=3)
72 |
--------------------------------------------------------------------------------
/learning/DataParser.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(RJSONIO)
8 | library(stringr)
9 |
10 | #Set content type for output
11 | setContentType("text/html")
12 | #Query Engine - dfp datafilepath
13 | dfp <- NULL
14 | udir <- NULL
15 |
16 | #Convert to a ROW JSON
17 | getRowWiseJson <- function (jsonDT) {
18 | row.json <- apply(jsonDT, 1, toJSON)
19 | json.st <- paste('[', paste(row.json, collapse=', '), ']')
20 | return (json.st)
21 | }
22 |
23 |
24 | # Read SachinTestRecords.csv
25 | readDataSet <- function(file.name) {
26 | return(
27 | read.csv(file.name, na.strings="-", as.is=TRUE, header=TRUE,
28 | stringsAsFactors=FALSE, strip.white=TRUE)
29 | )
30 | }
31 | readRequest <- function() {
32 | #GetString <- str(GET)
33 | #cat(GetString)
34 | dfp <<- GET$dfp
35 | udir <<- GET$udir
36 | #cat(d, "," ,m, "," ,f, "," ,qds, sep=" ")
37 | #cat("
")
38 | }
39 | writeResponse <- function(df, datafile, dsID) {
40 | sJSON = paste('{"upad":"', datafile,'","dsID":"', dsID,'","colTypes":',toJSON(unlist(lapply(df, class), use.names=FALSE)),', "rows":', getRowWiseJson(head(df,50)), '}', sep="")
41 | #Write response back
42 | cat(sJSON)
43 | }
44 | #Step 1 - Read Request, query string
45 | readRequest()
46 |
47 | #Step 2 - load environment
48 | df <- readDataSet(dfp)
49 | #Remove white space in col names
50 | colnames(df) <- str_replace(colnames(df), ' ', '')
51 |
52 | #Get Unique ID
53 | dsID <- paste("upad", substr(as.character(unclass(Sys.time())), 12,16), sep="")
54 | out.file.csv <- paste(udir,"analyzed/", dsID, ".csv", sep="")
55 | #out.file.JSON <- paste("pads/meta/",dsID, ".JSON", sep="")
56 | out.file.R.dataframes <- paste(udir, dsID, ".RData", sep="")
57 | write.csv(df, out.file.csv)
58 |
59 | # Write JSON
60 | #jsonSt <- paste('{
61 | # "name":"', file.name, '",
62 | # "id":"', dsID, '",', sep="");
63 | #dlist <- paste()
64 | # "dList": {
65 | # "Player":', toJSON(levels(factor(ipl$Player))), '
66 | # },
67 | # "mList": {
68 | # "Mat":"",
69 | # "Sixes":""
70 | # },
71 | # "dListType": {
72 | # },
73 | # "source":"CricInfo"
74 | # }', sep="")
75 | #file.out <- file(out.file.JSON, 'wt')
76 | #cat(jsonSt, file=file.out, fill = TRUE)
77 | #close(file.out)
78 | #Save transformed dataset as data.frame for later reading
79 | assign(dsID, df)
80 | names <- c(eval(dsID))
81 | save(list=names, file=out.file.R.dataframes)
82 |
83 | #Step 4 - Write Response
84 | writeResponse(df, out.file.R.dataframes, dsID)
85 |
--------------------------------------------------------------------------------
/learning/MiscR.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | responseDF <- switch(fnType,
8 | sum = getSum(df, d, m),
9 | count = getCount(df, d, m),
10 | 'default'
11 | )
12 | if(!is.null(g) && g == "t")
13 | responseDF <- geoCode(responseDF)
14 | return(responseDF)
15 |
16 | # Example of the Google-Maps row format being targeted, e.g.:
17 | # { "name": "Australia",
18 | #   "center": new google.maps.LatLng(41.878113, -87.629798),
19 | #   "data": 2842518
20 | # },
21 |
22 | JSON <- with(head(mbDS), paste('{"d":"',mbDS$Country,
23 | '", "center": new google.maps.LatLng(',
24 | mbDS$Country.Lat, ',', mbDS$Country.Lng,
25 | '), "data":', mbDS$Test.Runs,
26 | '},', sep=""))
27 |
28 | m <- "Country"
29 | lat <- paste(m,".Lat", sep="")
30 | sumDS <- ddply(mbDS, .(Country), summarise, Test.Runs=sum(Test.Runs, na.rm=TRUE))
31 | JSON <- with(sumDS, paste('{"d":"',sumDS[m],
32 | '", "center": new google.maps.LatLng(',
33 | mbDS[lat][sumDS[m]==mbDS[m]], ',', mbDS$Country.Lng[mbDS$Country=="India"],
34 | '), "data":', sumDS$Test.Runs,
35 | '},', sep=""))
36 |
37 |
38 | lookupLat <- function(m, i="India")
39 | {
40 | lat <- paste(m,".Lat",sep="")
41 | return(mbDS[lat][mbDS[m]==i][1])
42 | }
43 | lookupLat("Country", "India")
44 |
45 |
46 | with(sumDS, data.frame(Country, Test.Runs, laply(Country, function(x){getGeoCode(x, x)})))
47 |
48 | ddply(mbDS, "Opposition", function(x) c(Lat=unique(x$Opposition.Lat, Lng=unique(x$Opposition.Lng))))
49 |
50 | with(sumDS, data.frame(Country, Test.Runs, laply(Country, function(x){lookupGeoCode(x)})))
51 |
52 |
53 | with(responseDF, paste('{"d":"',responseDF[m],
54 | '", "center": new google.maps.LatLng(',
55 | df[latName][df[m]==responseDF[m]][1], ',', df[lngName][df[m]==responseDF[m]][1],
56 | '), "data":', mbDS$Test.Runs,
57 | '},', sep=""))
58 |
59 |
60 | sumDS <- ddply(mbDS, .(Opposition), summarise, Test.Runs=sum(Test.Runs, na.rm=TRUE))
61 |
62 | d <- "Opposition"
63 | latName <- paste(d, ".Lat", sep="")
64 | lngName <- paste(d, ".Lng", sep="")
65 | m <- "Test.Runs"
66 | responseDF <- sumDS
67 | df <- mbDS
68 | with(responseDF, paste('{"d":"',responseDF[d],
69 | '", "center": new google.maps.LatLng(',
70 | df[latName][df[d]==responseDF[d]][1],
71 | ',', df[lngName][df[d]==responseDF[d]][1],
72 | '), "data":', responseDF[m],
73 | '},', sep=""))
74 |
--------------------------------------------------------------------------------
/learning/TransformDataSet.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga/Alto")
9 | library("chron")
10 | library("plyr")
11 | library("RJSONIO")
12 | library("data.table")
13 | library("ggplot2")
14 | in.file.name <- "datasets/BollywoodCinema-1940-2008.csv"
15 | out.file.name <- "datasets/masterblaster/TransformedSachinTestRecords.csv"
16 | out.file.JSON <- "datasets/masterblaster/SachinTestRecords.JSON"
17 | out.file.R.dataframes <- "datasets/masterblaster/MB.RData"
18 | dt <- data.table(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE,
19 | stringsAsFactors=FALSE, strip.white=TRUE))
20 |
21 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December")
22 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
23 |
24 |
25 | # Line plot
26 | ggplot(totmidc, aes(variable, value)) + geom_line() + xlab("") + ylab("")
27 |
28 | # Bar plot
29 | # Note that the parameter stat="identity" passed to geom_bar()
30 | ggplot(totmidc, aes(x=variable, y=value)) + geom_bar(stat="identity") + xlab("") + ylab("")
31 |
32 |
33 | d <- dt[,list(Deols=sum(grepl("deol|Dharmendra",Cast, ignore.case=TRUE)),
34 | Kapoors=sum(grepl("kapoor",Cast, ignore.case=TRUE)),
35 | Khans=sum(grepl("khan",Cast, ignore.case=TRUE)),
36 | Bachchans=sum(grepl("bachchan",Cast, ignore.case=TRUE))),
37 | by=year]
38 | d1 <- melt(d, id.var="year")
39 | ggplot(d1)+geom_bar(aes(x=year,y=value,fill=variable),stat='identity')
40 |
41 | ggplot(d, aes(year))+ geom_bar() +facet_wrap(~n)
42 | ggplot(d, aes(x=factor(year), y=c(Deols, Kapoors))) + geom_bar(stat="identity") + xlab("") + ylab("Deols")
43 | n <- c("Deols", "Kapoors")
44 | ggplot(d, aes(x=year)) + geom_bar(stat="identity") + facet_wrap(n)
45 |
46 |
47 |
48 | dat<-data.frame(num=1:3,usage=c(4,2,5),cap=c(10,20,10),diff=c(6,18,5))
49 | dat.melt<-melt(dat,id.var=c('num','cap'))
50 | ggplot(dat.melt)+geom_bar(aes(x=num,y=value,fill=variable),stat='identity')
51 |
52 | #IPL DS
53 | in.file.name <- "pads/data/pad56638.csv" pad89706
54 | dt <- data.table(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE,
55 | stringsAsFactors=FALSE, strip.white=TRUE))
56 | dt[,Sixes, by=Player][order(-Sixes)]
57 |
58 | getCount <- function(dat, expr, gby) {
59 | e <- substitute(expr)
60 | b <- substitute(gby)
61 | print(dat[,eval(e),by=b])
62 | }
63 | getCount(dt, sum(Sixes), Player)
64 |
65 | q <- quote(Sixes)
66 | q1 <- quote(Player)
67 | dt[,eval(q), by=q1][order(-eval(q))]
68 | dt[,q,by=q1]
69 |
70 | #IPL DS
71 | in.file.name <- "pads/data/pad89706.csv"
72 | mbdt <- data.table(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE,
73 | stringsAsFactors=FALSE, strip.white=TRUE))
74 | mbdt[,list(sum(as.integer(Test_Runs), na.rm=TRUE)), by=Year][order(Year)]
75 |
--------------------------------------------------------------------------------
/learning/geoCode.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library("plyr")
8 |
9 | mb.df <- "MB.RData"
10 | load(mb.df)
11 |
12 | getGeoCode <- function(str)
13 | {
14 | print(str)
15 | library("RJSONIO")
16 | str <- gsub(' ','%20',str)
17 | connectStr <- paste('http://maps.google.com/maps/api/geocode/json?sensor=false&address=',str, sep="")
18 | print(connectStr)
19 | #data.json <- fromJSON(paste(readLines(url("http://maps.google.com/maps/api/geocode/json?sensor=false&address=South%20Africa")), collapse=""))
20 | con <- url(connectStr); data.json <- fromJSON(paste(readLines(con), collapse=""))
21 | close(con)
22 | data.json <- unlist(data.json)
23 | #print(data.json["results.geometry.location.lat"])
24 | #return (lat = data.json["results.geometry.location.lat"])
25 | return (c(lat=data.json["results.geometry.location.lat"], lng=data.json["results.geometry.location.lng"]))
26 | }
27 | printVal <- function(val){
28 | print(val)
29 | return ("YO")
30 | }
31 | shortDS <- with(head(mbDS, 20), data.frame(Opposition, Ground, Toss))
32 | shortDS <- mutate(shortDS, Opposition, Ground, Toss, lat=printVal(Opposition))
33 |
34 | cn <- levels(mbDS$Opposition)
35 | llply(cn, function(x) {getGeoCode(x)})
36 | ds <- transform(mbDS, lat=getGeoCode(Opposition))
37 | d <- data.frame(Country=cn)
38 | d$lng <- apply(d, 1, function(x) {getGeoCode(x)})
39 |
40 |
41 | printVal <- function(val){
42 | print(val)
43 | return ("YO")
44 | }
45 | ddply(shortDS, "Opposition", function(x){ getGeoCode(x$Opposition[1])})
46 |
47 |
48 |
49 | #************* Working Version
50 | df <- NULL
51 | #code <- with(head(mbDS, 4), getGeoCode(Opposition))
52 | #m <- ddply(mbDS, "Opposition", function(x){c("lat", "lag")})
53 | #t <- getGeoCode("Bangalore, India")
54 | cn <- levels(mbDS$Opposition)
55 | for(i in 1:length(cn)) {
56 | t <- getGeoCode(cn[i])
57 | r <- c(cn[i], t)
58 | df <- rbind(df, r)
59 | }
60 | #************* Working Version
61 |
62 | getGeoCode <- function(str)
63 | {
64 | library("RJSONIO")
65 | str <- gsub(' ','%20',str)
66 | connectStr <- paste('http://maps.google.com/maps/api/geocode/json?sensor=false&address=',str, sep="")
67 | #print(connectStr)
68 | con <- url(connectStr)
69 | data.json <- fromJSON(paste(readLines(con), collapse=""))
70 | close(con)
71 | data.json <- unlist(data.json)
72 | #print(data.json["results.geometry.location.lat"])
73 | #return (lat = data.json["results.geometry.location.lat"])
74 | return (c(lat=data.json["results.geometry.location.lat"], lng=data.json["results.geometry.location.lng"]))
75 | }
76 |
77 | shortDS <- with(head(mbDS, 20), data.frame(Opposition, Ground.Country, Toss))
78 | #v <- shortDS$Ground.Country
79 | #g <- laply(v, function(x){getGeoCode(x)})
80 | shortDS <- with(shortDS, data.frame(Opposition, Ground.Country, Toss,
81 | laply(Ground.Country, function(val){getGeoCode(val)})))
82 | closeAllConnections()
83 |
84 |
--------------------------------------------------------------------------------
/GenerateFredPADS.R:
--------------------------------------------------------------------------------
1 | # Generate FRED PADS
2 | # Author: Jitender Aswani, Co-Founder @datadolph.in
3 | # Date: 3/15/2013
4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
6 | # All rights reserved.
7 | #
8 |
9 | #Generate PADS from FRED data
10 | setwd("/Users/homemac/R")
11 | source("CreatePADS.R")
12 |
13 | padifyFRED <- function(){
14 | folder.path = "./raw-data/FRED/"
15 | file.name = "FRED_series.csv"
16 | fred <- read.csv(paste(folder.path, file.name, sep=""), stringsAsFactors=F)
17 | #pads <- data.frame(pad_id=character(), title=character(), desc=character(), records=numeric(), columns=numeric(),
18 | # category_id=character(), subcategory_id=character(), analyzed=numeric(), stories_created=numeric(),
19 | # stories_published=numeric(), "source"=character(), source_data_file=character(),
20 | # created_on=character(), last_updated=character(), tags=character())
21 | pads <- getSystemFREDPads()
22 | solr.index.docs <- list()
23 | series <- list()
24 | series["source"] <- "FRED"
25 | #for(i in 1:nrow(fred)){
26 | for(i in 1:2){
27 | #series["title"] <- ifelse(nchar(fred$Title[i]) > 58,
28 | # paste(substr(fred$Title[i], 1, 58), "...", sep=""), fred$Title[i])
29 | series["title"] <- fred$Title[i]
30 | series["desc"] <- paste(fred$Title[i], " (Units:", fred$Units[i],
31 | ", Frequency:", fred$Frequency[i], ", Seasonal Adjustmet: ",
32 | fred$Seasonal.Adjustment[i], ")", sep="")
33 | series["category"] <- fred$category_name[i]
34 | series["subcategory"] <- fred$subcategory_name[i]
35 | series["category_id"]<- fred$category_id[i]
36 | series["subcategory_id"]<- fred$subcategory_id[i]
37 |
38 | series.data <- read.csv(paste(folder.path, "data/", gsub(" ", "", gsub("\\\\", "/", fred$File[i])), sep=""),stringsAsFactors=F)
39 | print(paste("Starting file #", i, " name: " , fred$File[i], sep=""))
40 | #print(series.data)
41 | #print(series)
42 | #Create pad and get pad meta data
43 | pmd <- createPAD(series.data, series$title,
44 | series$desc, series$category, series$subcategory,
45 | series$source, paste(series$name, series$country, sep="."),
46 | tolower(paste(series$category, series$subcategory, series$source, sep=",")))
47 | #l <- list(pmd$id, pmd$title, pmd$desc, pmd$records, pmd$columns, series["category_id"],
48 | # series["subcategory_id"], pmd$analyzed, pmd$stories_created, pmd$stories_published,
49 | # pmd$src, pmd$src_file, "", "", pmd$tags)
50 |
51 | pads <- rbind(pads,
52 | data.frame(pad_id=pmd$id[1], title=pmd$title[1], desc=pmd$desc[1],
53 | records=pmd$records[1], columns=pmd$columns[1],
54 | category_id=series$category_id, subcategory_id=series$subcategory_id,
55 | analyzed=pmd$analyzed[1], stories_created=pmd$stories_created[1],
56 | stories_published=pmd$stories_published[1],
57 | "source"=pmd$src[1], source_data_file=pmd$src_file[1], created_on="", last_updated="",
58 | tags=pmd$tags[1]))
59 | #
60 | #generate solr doc
61 | #
62 | doc <- list("type"="pad", "id" = pmd$id, "title" = pmd$title, "desc"=pmd$desc, "category"=pmd$category,
63 | "subcategory"=pmd$subcategory, "tags"=pmd$tags, "author"="system")
64 | solr.index.docs[[length(solr.index.docs)+1]] <- doc
65 | }
66 | # now save solr doc
67 | generateSolrDocFromAllDocs(solr.index.docs)
68 |
69 | # Save pads to a csv file
70 | saveData(pads, "./pads/system_pads.csv")
71 |
72 | #now save pads to db
73 | #scon <- getDBCon()
74 | #dbRemoveTable(scon, "system_pads")
75 | #dbWriteTable(scon, name="system_pads", value=pads, row.names = F, overwrite = T)
76 | #disconnectDB(scon)
77 |
78 | }
79 |
80 | padifyFRED()
--------------------------------------------------------------------------------
/learning/TransformFailedBanks.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga/Alto")
9 | library("chron")
10 | library("plyr")
11 | library("RJSONIO")
12 | #Get Unique ID
13 | dsID <- paste("pad", substr(as.character(unclass(Sys.time())), 12,16), sep="")
14 | out.file.csv <- paste("pads/data/",dsID, ".csv", sep="")
15 | out.file.JSON <- paste("pads/meta/",dsID, ".JSON", sep="")
16 | out.file.R.dataframes <- paste("pads/cache/", dsID, ".RData", sep="")
17 | in.file.name <- "datasets/failedbanks/failedbanksdata.csv"
18 |
19 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December")
20 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
21 |
22 | # Read *.csv
23 | readDataSet <- function(file.name) {
24 | return(
25 | read.csv(file.name, na.strings="-", as.is=TRUE, header=TRUE,
26 | stringsAsFactors=FALSE, strip.white=TRUE)
27 | )
28 | }
29 |
30 | # Transform raw dataset
31 | transformDataSet <- function(df) {
32 | tdf <- mutate(df,
33 | Date=as.Date(Date, "%m/%d/%Y"),
34 | Day=factor(weekdays(Date), levels=lDays, ordered=TRUE),
35 | Month=factor(months(Date), levels=lMonths, ordered=TRUE),
36 | Year=years(Date),
37 | Bank.Name=Bank.Name,
38 | City=as.factor(City),
39 | State=as.factor(State),
40 | Assets=as.numeric(gsub('[\\$,]', '', Assets)),
41 | Deposits=as.numeric(gsub('[\\$,]', '', Deposits)),
42 | Branches=as.numeric(Branches),
43 | FDIC.Cost=as.numeric(gsub('[\\$,]', '', FDIC.Cost)),
44 | Lat=as.numeric(Lat),
45 | Lng=as.numeric(Lng)
46 | )
47 | return(with(tdf, data.frame(Date, Day, Month, Year,
48 | Bank.Name, City, State, Assets, Deposits, Branches, FDIC.Cost,
49 | Lat, Lng
50 | ))
51 | )
52 | }
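# --- Added illustration, not part of the original script ---
# transformDataSet() above strips "$" and "," from the currency columns before
# coercing them to numeric; a tiny self-contained sketch with made-up values:
as.numeric(gsub('[\\$,]', '', c("$1,234", "$567,890.12")))  # 1234.00 567890.12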
53 |
54 | writeTransformedDS <- function(df, file.name) {
55 | write.csv(df, file.name)
56 | }
57 |
58 | writeJSON <- function(df, file.JSON, id) {
59 |
60 | jsonSt <- paste('{
61 | "datasetName": "FailedBanks",
62 |     "dsID": "', id, '",
63 | "dList": {
64 | "Year":', toJSON(levels(df$Year)), ',
65 | "Month":', toJSON(levels(df$Month)), ',
66 | "Day":', toJSON(levels(df$Day)), ',
67 | "Bank.Name":', toJSON(df$Bank.Name), ',
68 | "City":', toJSON(levels(df$City)), ',
69 | "State":', toJSON(levels(df$State)),
70 | '},
71 | "mList": {
72 | "Assets":"",
73 | "Deposits":"",
74 | "Branches":"",
75 | "FDIC.Cost":""
76 | }
77 | }')
78 | file.out <- file(file.JSON, 'wt')
79 | cat(jsonSt, file=file.out, fill = TRUE)
80 | close(file.out)
81 | }
82 |
83 | # Read Raw File and Transform it
84 | fbDS <- transformDataSet(readDataSet(in.file.name))
85 |
86 | #Write Transformed data set to a CSV file
87 | writeTransformedDS(fbDS, out.file.csv)
88 |
89 | #Generate JSON with m and d
90 | writeJSON(fbDS, out.file.JSON, dsID)
91 |
92 | #Save transformed dataset as data.frame for later reading
93 | #save(fbDS, file=out.file.R.dataframes)
94 | #Save transformed dataset as data.frame for later reading
95 | assign(dsID, fbDS)
96 | names <- c(eval(dsID))
97 | save(list=names, file=out.file.R.dataframes)
98 | pads <- rbind(pads, data.table(id=dsID, title="Failed Banks Analysis in US (2008-2011)",
99 | subtitle="This data set explores US Banks that perished during the 2008 downturn.",
100 | records=nrow(fbDS), analyzed=0, stories=0, json=out.file.JSON, data=out.file.csv,
101 | cache=out.file.R.dataframes, source="Federal Reserve")
102 | )
--------------------------------------------------------------------------------
/learning/SQLQueryDataFrame.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga/Alto")
9 |
10 | #load pre-saved dataframe
11 | out.file.R.dataframes <- "datasets/masterblaster/mb.RData"
12 | load(out.file.R.dataframes)
13 |
14 | library(plyr)
15 |
16 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
17 | ## and aggregate the columns listed in "on"; on can also be a single field
18 | getSum <- function(df, by, on) {
19 | #print(sum(df[,on], na.rm=TRUE))
20 | return(ddply(df, by, colwise(function(col) {sum(col, na.rm=TRUE)}, on)))
21 | }
22 | ## Try it out
23 | crpopy <- getSum(mbDS, .(Opposition, Year), .(Test.Century, Test.Runs))
24 |
25 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
26 | ## and aggregate the columns listed in "on"; on can also be a single field
27 | getCount <- function(df, by, on) {
28 | return(ddply(df, by, colwise(function(col) {length(unique(col))}, on)))
29 | }
30 | ## Try it out
31 | sx <- getCount(mbDS, "Year", c("TestNo", "Ground"))
32 |
33 | #For counting a column that holds 0/1 values :: very similar to getSum - the answer should be the same
34 | getBinaryCount <- function(df, by, on) {
35 | return(ddply(df, by, colwise(function(x) {length(which(x==1))}, on)))
36 | #length(which(mbDS[,"Test.Century"] ==1))
37 | }
38 | ## Try it out
39 | cpo <- getBinaryCount(mbDS, "Opposition", c("Ing1.Century", "Ing2.Century", "Test.Century"))
40 |
41 | ###################### Samples ####################
42 |
43 | #Runs by Opposition
44 | #rpo <- ddply(mbDS, "Opposition", function(x) c(TotalRuns=sum(x$Test_Runs, na.rm=TRUE)))
45 | #rpo <- ddply(mbDS, .(Opposition), summarise, TotalRuns=sum(Test_Runs, na.rm=TRUE))
46 | rpo <- getSum(mbDS, "Opposition", "Test.Runs")
47 |
48 | #Runs by Opposition by Inning
49 | rpopi <- getSum(mbDS, "Opposition", c("Ing1.Runs", "Ing2.Runs", "Test.Runs"))
50 |
51 | #Runs by Year
52 | #rpy <- ddply(mbDS, .(Year), summarise, TotalRuns = sum(Test.Runs, na.rm=TRUE))
53 | rpy <- getSum(mbDS, "Year", "Test.Runs")
54 |
55 | #Tests Per Opposition
56 | tpo <- getCount(mbDS, "Opposition", "TestNo")
57 |
58 | #Tests Per Year
59 | tpy <- getCount(mbDS, "Year", "TestNo")
60 |
61 | #Runs by Opposition by Year - useful for clicking on an opposition and drilling down by year
62 | #rpopy <- ddply(mbDS, .(Year, Opposition), summarise, TotalRuns = sum(Test_Runs, na.rm=TRUE))
63 | rpopy <- getSum(mbDS, c("Opposition", "Year"), "Test.Runs")
64 |
65 | #Tests Per Opposition By Year
66 | tpopy <- getCount(mbDS, c("Opposition", "Year"), "TestNo")
67 |
68 | #Runs and Tests Per Opposition
69 | #rtpo <- getSum1Count2(mbDS, "Opposition", c("Test.Runs", "TestNo")) ### Doesn't work - getSum1Count2 is not defined
70 |
71 | #Total Runs by Opposition by Inning
72 | #rpopi <- ddply(mbDS, .(Opposition), summarise,
73 | # 1stIngRuns = sum(Ing1.Runs, na.rm=TRUE),
74 | # 2ndIngRuns = sum(Ing2.Runs, na.rm=TRUE),
75 | # TotalRuns = sum(Test_Runs, na.rm=TRUE))
76 |
77 | #Total Runs by Year by Opposition - useful for clicking on a year to drill down
78 | #rpypo <- ddply(mbDS, .(Year, Opposition), summarise, TotalRuns = sum(Test_Runs, na.rm=TRUE))
79 |
80 |
81 | #Total Centuries by Opposition
82 | #CPO <- ddply(mbDS, .(Opposition), summarise, TestCenturies = sum(Test.Century==1, na.rm=TRUE))
83 | cpo <- getBinaryCount(mbDS, "Opposition", "Test.Century")
84 | #cpo1 <- getSum(mbDS, "Opposition", "Test.Century")
85 |
86 | cpo <- getBinaryCount(mbDS, "Opposition", c("Ing1.Century", "Ing2.Century", "Test.Century"))
87 |
88 |
89 | #Centuries Per Year
90 | cpy <- getBinaryCount(mbDS, "Year", "Test.Century")
91 |
92 | #Centuries Per Opposition By Year
93 | cpopy <- getBinaryCount(mbDS, c("Opposition", "Year"), "Test.Century")
94 |
95 | #Merge Tests (T), Runs (R) and Centuries (C) by Year
96 | trcpy <- merge(x=merge(tpy,rpy, by.x="Year", by.y="Year"), cpy, by.x="Year", by.y="Year")
97 | colnames(trcpy) <- c("Year", "Tests", "Runs", "Centuries")
98 |
99 |
100 | # Passing multiple arguments ?colwise example
101 | #sum <- function(x){sum(x, na.rm=TRUE)}
102 | #new <- ddply(tips, c("sex", "smoker"), colwise(function(x) {sum(x, na.rm=TRUE)}, c("tip", "total_bill")))
103 | #new <- ddply(tips, c("sex", "smoker"), colwise(sum, c("tip", "total_bill")))
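# --- Added illustration, not part of the original script ---
# The helpers above only depend on plyr, so they work on any data.frame.
# A self-contained check against the built-in iris data instead of mbDS:
getSum(iris, "Species", c("Sepal.Length", "Petal.Length"))  # per-species column sums
getCount(iris, "Species", "Petal.Width")                    # distinct Petal.Width values per species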
--------------------------------------------------------------------------------
/learning/DGQueryEngine.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | library(plyr)
8 | library(RJSONIO)
9 | #setwd("/Users/homemac/Toga/Alto")
10 | #Set content type for output
11 | setContentType("text/html")
12 | #Query Engine - qds - query this database, d=dimension, m=measure, f=function
13 | qds <- d <- m <- f <- g <- NULL
14 | #d <- ""
15 | #m <- ""
16 | #f <- ""
17 |
18 | #load pre-analyzed and saved dataframe
19 | loadDS <- function(qds) {
20 | if(exists(qds)) {
21 | #lpads <- get("pads", envir=.GlobalEnv)
22 |     #cat(exists(pads$sysID[pads$id==qds], envir=.GlobalEnv), "\n")
23 |     #cat(qds)
24 |     return(get(qds)) #get a local copy in this environment
25 | } else {
26 |     cat(paste('{"response_code": "error", "error_message": "The pad ', qds, ' does not exist. Check that the start-up R script for RApache ran correctly."}', sep=""))
27 | }
28 | }
29 |
30 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
31 | ## and aggregate the columns listed in "on"; on can also be a single field
32 | getSum <- function(df, by, on) {
33 | #print(sum(df[,on], na.rm=TRUE))
34 | #cat(paste(by, on, sep=", "))
35 | return(ddply(df, by, colwise(function(col) {sum(col, na.rm=TRUE)}, on)))
36 | }
37 |
38 | getMean <- function(df, by, on) {
39 | #print(sum(df[,on], na.rm=TRUE))
40 | #cat(paste(by, on, sep=", "))
41 | return(ddply(df, by, colwise(function(col) {mean(col, na.rm=TRUE)}, on)))
42 | }
43 |
44 | #ddply(mbDS, "Opposition", function(tDF){data.frame(Lat=tDF$Opposition.Lat[1], Lng=tDF$Opposition.Lng[1], Test.Runs=sum(tDF$Test.Runs, na.rm=TRUE))})
45 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
46 | ## and aggregate the columns listed in "on"; on can also be a single field
47 | #getCount <- function(df, by, on) {
48 | # return(ddply(df, by, colwise(function(col) {length(unique(col))}, on)))
49 | #}
50 |
51 | getCount<- function(df, by, on) {
52 | lds <- ddply(df, by, "nrow")
53 | colnames(lds) <- c(by,on)
54 | return(lds)
55 | }
56 |
57 | #For counting a column that holds 0/1 values :: very similar to getSum - the answer should be the same
58 | getBinaryCount <- function(df, by, on) {
59 | return(ddply(df, by, colwise(function(x) {length(which(x==1))}, on)))
60 | #length(which(mbDS[,"Test.Century"] ==1))
61 | }
62 |
63 | readRequest <- function() {
64 | #GetString <- str(GET)
65 | #cat(GetString)
66 | qds <<- GET$qds
67 | d <<- GET$d
68 | m <<- GET$m
69 | f <<- GET$f
70 | g <<- GET$g
71 | #cat(d, "," ,m, "," ,f, "," ,qds, sep=" ")
72 | #cat("
")
73 | }
74 |
75 | processRequest <- function(fnType, df){
76 | return(switch(fnType,
77 | s = getSum(df, d, m),
78 | c = getCount(df, d, m),
79 | m = getMean(df,d,m),
80 | d = getValue(df,d,m),
81 | 'default'
82 | )
83 | )
84 | }
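# --- Added note, not part of the original script ---
# Hypothetical request mapping: a query string such as
#   ?qds=pad123&f=s&d=Opposition&m=Test.Runs
# makes readRequest() fill qds/d/m/f, and processRequest("s", df) then
# evaluates getSum(df, "Opposition", "Test.Runs").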
85 |
86 | writeResponse <- function(responseDF, df) {
87 | sJSON <- paste('{"response_code": "ok","', m, '":', toJSON(responseDF[,m]), sep="") #JSON String is not closed.
88 | if(!is.null(g) && g == "t") {
89 | latName <- paste(d, ".Lat", sep="")
90 | lngName <- paste(d, ".Lng", sep="")
91 | sJSON <- paste(sJSON, ', "mapsJSON": [', sep="")
92 | for(i in 1:nrow(responseDF)) {
93 | key <- responseDF[,d][i]
94 |       #cat(paste("\n", key, ", what?"))
95 | j <- match(key, df[,d])
96 | sJSON <- paste(sJSON, '{"d":"', key,
97 | '", "lat":',
98 | df[latName][j,],
99 | ', "lng":', df[lngName][j,],
100 | ', "data":', responseDF[i,2],
101 | '}', sep="")
102 | if(i!=nrow(responseDF))
103 | sJSON <- paste(sJSON, ",", sep="")
104 | }
105 | sJSON <- paste(sJSON, ']', sep="")
106 | }
107 | sJSON = paste(sJSON, "}", sep="")
108 | #Write response back
109 | cat(sJSON)
110 | #cat(ls())
111 | #rm(list=ls()) #Clean up everything - There has to be a better way than this.
112 | #cat(ls())
113 | }
114 |
115 | #Step 1 - Read Request, query string
116 | readRequest()
117 |
118 | #Step 2 - load environment
119 | df <- loadDS(qds)
120 |
121 | if(!is.null(df)) {
122 | #Step 3 - Process Request
123 | responseDF <- processRequest(f, df)
124 | #Step 3.a - Apply Filter
125 |
126 | #Step 4 - Write Response
127 | writeResponse(responseDF, df)
128 | }
--------------------------------------------------------------------------------
/learning/DGQEWebSockets.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 | library(plyr)
7 | library(RJSONIO)
8 | library(data.table)
9 | #Set content type for output
10 | setContentType("text/html")
11 | #Query Engine - qds - query this database, d=dimension, m=measure, f=function
12 | qds <- d <- m <- f <- g <- fv <- NULL
13 |
14 | #Convert to a ROW JSON
15 | getRowWiseJson <- function (jsonDT) {
16 | row.json <- apply(jsonDT, 1, toJSON)
17 | json.st <- paste('[', paste(row.json, collapse=', '), ']')
18 | return (json.st)
19 | }
20 |
21 | #load pre-analyzed and saved dataframe
22 | loadDS <- function(qds) {
23 | if(exists(qds, envir=.GlobalEnv)) {
24 | #lpads <- get("pads", envir=.GlobalEnv)
25 |     #cat(exists(pads$sysID[pads$id==qds], envir=.GlobalEnv), "\n")
26 |     return(get(qds, envir=.GlobalEnv)) #get a local copy in this environment
27 | } else {
28 | #print("PAD didn't exist. Check to see if you start up R script in RApache ran correctly.")
29 | cat(paste('{"response_code": "error", "error_message": "The pad ', qds, ' does not exist. Check to see if you start up R script in RApache ran correctly."}', sep=""))
30 | }
31 | }
32 |
33 | getData <- function(dat, expr, gby) {
34 | e <- substitute(expr)
35 | b <- substitute(gby)
36 | #cat("
", gby)
37 | #cat(b)
38 | #cat(dat[,eval(e), by=gby])
39 | return(dat[,eval(e),by=b])
40 | }
41 | #getData(padDT, sum(eval(m), na.rm=TRUE), d)
42 |
43 | readRequest <- function() {
44 | #GetString <- str(GET)
45 | #cat(GetString)
46 | qds <<- GET$qds
47 | d <<- GET$d
48 | m <<- GET$m
49 | f <<- GET$f
50 | g <<- GET$g
51 | fv <<- GET$fv
52 | #cat(d, "," ,m, "," ,f, "," ,qds, sep=" ")
53 | #cat("
")
54 | }
55 | #Process Request
56 | processRequest <- function(fnType, padDT, mea, dim, fvalue){
57 | rs <- NULL
58 | if(is.null(fvalue)) {
59 | switch(fnType,
60 |       d={rs <- padDT[,list(m=get(mea)),by=dim]}, #default; by accepts a character column name
61 | s= { rs <- padDT[, list(m=sum(get(mea), na.rm=TRUE)), by=dim]}, #sum
62 | c= {rs <- padDT[, list(m=length(na.omit(get(mea)))), by=dim]}, #count
63 | m= {rs <- padDT[,list(m=mean(get(mea), na.rm=TRUE)),by=eval(dim)]}, #Average, eval for dim also works
64 | 'default'
65 | )
66 | } else {
67 | switch(fnType,
68 |       d={rs <- padDT[,list(m=get(mea)),by=dim][get(dim)==fvalue]}, #default; by accepts a character column name
69 | s= { rs <- padDT[, list(m=sum(get(mea), na.rm=TRUE)), by=dim][get(dim)==fvalue]}, #sum
70 | c= {rs <- padDT[, list(m=length(na.omit(get(mea)))), by=dim][get(dim)==fvalue]}, #count
71 | m= {rs <- padDT[,list(m=mean(get(mea), na.rm=TRUE)),by=eval(dim)][get(dim)==fvalue]}, #Average, eval for dim also works
72 | 'default'
73 | )
74 | }
75 | return (rs)
76 | }
77 | #test this out
78 | #dt <- data.table(iris)
79 | #m <- "Sepal.Length"
80 | #d <- "Species"
81 | #fv <- NULL
82 | #processRequest("s", padDT=dt, mea=m, dim=d, fvalue=fv)
83 |
84 | writeResponse <- function(responseDF, padDT) {
85 | if(is.null(d)) {
86 | responseDF$d = ""
87 | } else #Change the name to d
88 | setnames(responseDF, 1, "d")
89 | #sJSON <- paste('{"response_code": "ok","', m, '":', toJSON(responseDF[,m]), sep="") #JSON String is not closed.
90 | sJSON <- paste('{"response_code": "ok","chartData":', getRowWiseJson(responseDF), sep="") #JSON String is not closed.
91 | if(!is.null(g) && g == "t") {
92 | latName <- paste(d, ".Lat", sep="")
93 | lngName <- paste(d, ".Lng", sep="")
94 | sJSON <- paste(sJSON, ', "mapsJSON": [', sep="")
95 | for(i in 1:nrow(responseDF)) {
96 | key <- responseDF[,d][i]
97 |       #cat(paste("\n", key, ", what?"))
98 | j <- match(key, padDT[,d])
99 | sJSON <- paste(sJSON, '{"d":"', key,
100 | '", "lat":',
101 | padDT[latName][j,],
102 | ', "lng":', padDT[lngName][j,],
103 | ', "data":', responseDF[i,2],
104 | '}', sep="")
105 | if(i!=nrow(responseDF))
106 | sJSON <- paste(sJSON, ",", sep="")
107 | }
108 | sJSON <- paste(sJSON, ']', sep="")
109 | }
110 | sJSON = paste(sJSON, "}", sep="")
111 | #Write response back
112 | cat(sJSON)
113 | #cat(ls())
114 | rm(list=ls()) #Clean up everything - There has to be a better way than this.
115 | #cat(ls())
116 | }
117 |
118 | #Step 1 - Read Request, query string
119 | readRequest()
120 |
121 | #Step 2 - load environment
122 | padDT <- data.table(loadDS(qds))
123 |
124 | #Step 3 - Process Request
125 | responseDF <- processRequest(f, padDT, mea=m, dim=d, fvalue=fv)
126 |
127 | #Step 4 - Write Response
128 | writeResponse(responseDF, padDT)
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/learning/DKOMSentimentAnalysis.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga/Alto")
9 | # Revised Sentiment Analysis using Hu & Liu's library of 6,800 negative and positive words
10 | library("twitteR")
11 | library("plyr")
12 | library("stringr")
13 | library("doBy")
14 |
15 | #Populate the list of sentiment words from Hu and Liu (http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html)
16 | huliu.pwords <- scan('opinion-lexicon/positive-words.txt', what='character', comment.char=';')
17 | huliu.nwords <- scan('opinion-lexicon/negative-words.txt', what='character', comment.char=';')
18 |
19 | # Add some words
20 | huliu.nwords <- c(huliu.nwords,'wtf','wait','waiting','epicfail', 'crash', 'bug', 'bugy', 'bugs', 'slow', 'lie')
21 | #Remove some words
22 | huliu.nwords <- huliu.nwords[!huliu.nwords=='sap']
23 | huliu.nwords <- huliu.nwords[!huliu.nwords=='cloud']
24 | #which('sap' %in% huliu.nwords)
25 |
26 | cList <- list(twitterTag=c("#DKOM"))
27 |
28 | getTweets <- function(tag) {
29 | # Get 1500 tweets - an individual is only allowed to get 1500 tweets
30 | return(searchTwitter(tag, n=1500))
31 | #since=as.character(Sys.Date()-3 - If you want to restrict this to just last few days tweets
32 | }
33 |
34 |
35 | getSentimentScore <- function(tweets)
36 | {
37 | scores <- laply(tweets, function(singleTweet) {
38 | # clean up tweets with R's regex-driven global substitute, gsub()
39 | singleTweet <- gsub('[[:punct:]]', '', singleTweet)
40 | singleTweet <- gsub('[[:cntrl:]]', '', singleTweet)
41 | singleTweet <- gsub('\\d+', '', singleTweet)
42 |     #Convert to lower case for comparison, split the tweet into single words and flatten the list
43 | tweetWords <- unlist(str_split(tolower(singleTweet), '\\s+'))
44 | # compare our words to the dictionaries of positive & negative terms
45 | # match() returns the position of the matched term or NA, apply is.na to convert to boolean
46 | pos.matches <- !is.na(match(tweetWords, huliu.pwords))
47 | neg.matches <- !is.na(match(tweetWords, huliu.nwords))
48 | # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
49 | score <- sum(pos.matches) - sum(neg.matches)
50 | return(score)
51 | })
52 | return(data.frame(SentimentScore=scores, Tweet=tweets))
53 | }
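# --- Added illustration, not part of the original script ---
# The scorer counts positive minus negative lexicon hits per tweet. Commented out
# because it needs the opinion-lexicon files loaded above; the strings are made up:
# getSentimentScore(c("great keynote, love the demos", "app is slow and buggy"))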
54 |
55 | #Step #1
56 |
57 | sentimentScoreDF <- data.frame()
58 | sentimentScoreDF <- ldply(cList$twitterTag, function(twitterTag) {
59 | tweets <- getTweets(twitterTag)
60 | tweets.text <- laply(tweets,function(t)t$getText())
61 | sentimentScore <- getSentimentScore(tweets.text)
62 | sentimentScore$TwitterTag <- twitterTag
63 | rbind(sentimentScoreDF, sentimentScore)
64 | })
65 |
66 | # Get rid of tweets that have zero score and separate +ve from -ve tweets
67 | sentimentScoreDF$posTweets <- as.numeric(sentimentScoreDF$SentimentScore >=1)
68 | sentimentScoreDF$negTweets <- as.numeric(sentimentScoreDF$SentimentScore <=-1)
69 |
70 | #Summarize findings
71 | summaryDF <- ddply(sentimentScoreDF,"TwitterTag", summarise,
72 | TotalTweetsFetched=length(SentimentScore),
73 | PositiveTweets=sum(posTweets), NegativeTweets=sum(negTweets),
74 | AverageScore=round(mean(SentimentScore),3))
75 |
76 | summaryDF$TotalTweets <- summaryDF$PositiveTweets + summaryDF$NegativeTweets
77 |
78 | #Get Sentiment Score
79 | summaryDF$Sentiment <- round(summaryDF$PositiveTweets/summaryDF$TotalTweets, 2)
80 |
81 | #Order
82 | orderBy(~-Sentiment,summaryDF)
83 |
84 | write.csv(summaryDF, "DKOM-Results.csv")
85 |
86 |
87 |
88 |
89 | #summary <- ddply(sentimentScoreDF, "TwitterTag", function(df) { MeanScore=round(mean(df$SentimentScore),3)})
90 | summary <- ddply(sentimentScoreDF, "TwitterTag", summarise,
91 | MeanScore=round(mean(SentimentScore),3), TweetCount=length(SentimentScore))
92 |
93 | period <- seq(as.Date("2010-01-01"), by="month", length.out=24)
94 | tweets <- llply(period, function(since){
95 | until <- seq(since, by="month", length.out=2)[2]
96 | print(since)
97 | print(until)
98 | tt <- searchTwitter("#SALESforce", n=1500, since=since, until=until )
99 | print(length(tt))
100 | return(length(tt))
101 | })
102 |
103 | getAuthorized <- function() {
104 | cred <- OAuthFactory$new(consumerKey="g8sky1SyttNWPsgZOGAA",
105 | consumerSecret="zuJvHtN63yLSYh9Bk2Qc0kd5kGIV9mtzb7r6XPVqq8",
106 | requestURL="https://api.twitter.com/oauth/request_token",
107 | accessURL="https://api.twitter.com/oauth/access_token",
108 | authURL="https://api.twitter.com/oauth/authorize")
109 | cred$handshake()
110 | registerTwitterOAuth(cred)
111 | #3739000
112 | save(cred, file="TwitterAanalyticsForAll.Cred")
113 | }
114 |
115 | loadCred <- function(){
116 |   load("TwitterAanalyticsForAll.Cred")  # load() restores the saved 'cred' object; it does not return it
117 | cred$handshake()
118 | registerTwitterOAuth(cred)
119 | }
120 |
--------------------------------------------------------------------------------
/learning/TwiterSentimentAnalysis.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga/Alto")
9 | # Revised Sentiment Analysis using Hu & Liu's library of 6,800 negative and positive words
10 | library("twitteR")
11 | library("plyr")
12 | library("stringr")
13 | library("doBy")
14 |
15 | #Populate the list of sentiment words from Hu and Liu (http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html)
16 | huliu.pwords <- scan('opinion-lexicon/positive-words.txt', what='character', comment.char=';')
17 | huliu.nwords <- scan('opinion-lexicon/negative-words.txt', what='character', comment.char=';')
18 |
19 | # Add some words
20 | huliu.nwords <- c(huliu.nwords,'wtf','wait','waiting','epicfail', 'crash', 'bug', 'bugy', 'bugs', 'slow', 'lie')
21 | #Remove some words
22 | huliu.nwords <- huliu.nwords[!huliu.nwords=='sap']
23 | huliu.nwords <- huliu.nwords[!huliu.nwords=='cloud']
24 | #which('sap' %in% huliu.nwords)
25 |
26 | cList <- list(twitterTag=c("@Netflix"))
27 |
28 | getTweets <- function(tag) {
29 | # Get 1500 tweets - an individual is only allowed to get 1500 tweets
30 | return(searchTwitter(tag, n=1500))
31 | #since=as.character(Sys.Date()-3 - If you want to restrict this to just last few days tweets
32 | }
33 |
34 |
35 | getSentimentScore <- function(tweets)
36 | {
37 | scores <- laply(tweets, function(singleTweet) {
38 | # clean up tweets with R's regex-driven global substitute, gsub()
39 | singleTweet <- gsub('[[:punct:]]', '', singleTweet)
40 | singleTweet <- gsub('[[:cntrl:]]', '', singleTweet)
41 | singleTweet <- gsub('\\d+', '', singleTweet)
42 |     #Convert to lower case for comparison, split the tweet into single words and flatten the list
43 | tweetWords <- unlist(str_split(tolower(singleTweet), '\\s+'))
44 | # compare our words to the dictionaries of positive & negative terms
45 | # match() returns the position of the matched term or NA, apply is.na to convert to boolean
46 | pos.matches <- !is.na(match(tweetWords, huliu.pwords))
47 | neg.matches <- !is.na(match(tweetWords, huliu.nwords))
48 | # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
49 | score <- sum(pos.matches) - sum(neg.matches)
50 | return(score)
51 | })
52 | return(data.frame(SentimentScore=scores, Tweet=tweets))
53 | }
54 |
55 | #Step #1
56 |
57 | sentimentScoreDF <- data.frame()
58 | sentimentScoreDF <- ldply(cList$twitterTag, function(twitterTag) {
59 | tweets <- getTweets(twitterTag)
60 | tweets.text <- laply(tweets,function(t)t$getText())
61 | sentimentScore <- getSentimentScore(tweets.text)
62 | sentimentScore$TwitterTag <- twitterTag
63 | rbind(sentimentScoreDF, sentimentScore)
64 | })
65 |
66 | # Get rid of tweets that have zero score and separate +ve from -ve tweets
67 | sentimentScoreDF$posTweets <- as.numeric(sentimentScoreDF$SentimentScore >=1)
68 | sentimentScoreDF$negTweets <- as.numeric(sentimentScoreDF$SentimentScore <=-1)
69 |
70 | #Summarize findings
71 | summaryDF <- ddply(sentimentScoreDF,"TwitterTag", summarise,
72 | TotalTweetsFetched=length(SentimentScore),
73 | PositiveTweets=sum(posTweets), NegativeTweets=sum(negTweets),
74 | AverageScore=round(mean(SentimentScore),3))
75 |
76 | summaryDF$TotalTweets <- summaryDF$PositiveTweets + summaryDF$NegativeTweets
77 |
78 | #Get Sentiment Score
79 | summaryDF$Sentiment <- round(summaryDF$PositiveTweets/summaryDF$TotalTweets, 2)
80 |
81 | #Order
82 | orderBy(~-Sentiment,summaryDF)
83 |
84 | write.csv(summaryDF, "Netflix-Results.csv")
85 |
86 |
87 |
88 |
89 | #summary <- ddply(sentimentScoreDF, "TwitterTag", function(df) { MeanScore=round(mean(df$SentimentScore),3)})
90 | summary <- ddply(sentimentScoreDF, "TwitterTag", summarise,
91 | MeanScore=round(mean(SentimentScore),3), TweetCount=length(SentimentScore))
92 |
93 | period <- seq(as.Date("2010-01-01"), by="month", length.out=24)
94 | tweets <- llply(period, function(since){
95 | until <- seq(since, by="month", length.out=2)[2]
96 | print(since)
97 | print(until)
98 | tt <- searchTwitter("#SALESforce", n=1500, since=since, until=until )
99 | print(length(tt))
100 | return(length(tt))
101 | })
102 |
103 | getAuthorized <- function() {
104 | cred <- OAuthFactory$new(consumerKey="g8sky1SyttNWPsgZOGAA",
105 | consumerSecret="zuJvHtN63yLSYh9Bk2Qc0kd5kGIV9mtzb7r6XPVqq8",
106 | requestURL="https://api.twitter.com/oauth/request_token",
107 | accessURL="https://api.twitter.com/oauth/access_token",
108 | authURL="https://api.twitter.com/oauth/authorize")
109 | cred$handshake()
110 | registerTwitterOAuth(cred)
111 | #3739000
112 | save(cred, file="TwitterAanalyticsForAll.Cred")
113 | }
114 |
115 | loadCred <- function(){
116 |   load("TwitterAanalyticsForAll.Cred")  # load() restores the saved 'cred' object; it does not return it
117 | cred$handshake()
118 | registerTwitterOAuth(cred)
119 | }
120 |
--------------------------------------------------------------------------------
/learning/Olympics.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga")
9 | library(XML)
10 | library("chron")
11 | library(plyr)
12 | library(stringr)
13 | library("RJSONIO")
14 |
15 | #############################
16 | # Olympic Tables
17 | #library(XML)
18 | #theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
19 | #tables <- readHTMLTable(theurl)
20 | #n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
21 | #the picked table is the longest one on the page
22 | #braz <- tables[[which.max(n.rows)]]
23 | #print(braz)
24 | ############################
25 | fixit <- function(x) { # Fix rowspan issue where a single rank is assigned to multiple rows
26 | if(nchar(x[1]) > 3) { # other than rank
27 | for(i in length(x):2) { # Shift items by 1
28 | x[i] = x[i-1]
29 | }
30 | x[1] = -1 #Assign rank column to -1
31 | }
32 | return(x)
33 | }
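# --- Added illustration, not part of the original script ---
# fixit() shifts a row's values one slot to the right (the last value is dropped)
# and writes -1 into the missing rank cell; a made-up vector shows the effect:
fixit(c("United States (USA)", "10", "5", "3", "18"))  # "-1" "United States (USA)" "10" "5" "3"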
34 | #tl <- rbind(n, ddply(medals, .(V1), temp))
35 |
36 | readWikiOT <- function(d) { #Read Wiki olympic medal tables
37 | whichTable <- 1
38 | print(d$year)
39 | if(!(d$year %in% c(1916, 1940, 1944))) { #Code breaks here due to wiki issues
40 | #whichTable <- 3
41 | if(d$year %in% c(1928, 1948, 1972, 1980,1996))
42 | whichTable <- 2
43 | if(d$year %in% c(1960, 2000, 2004, 2008, 2012))
44 | whichTable <- 3
45 | print(whichTable)
46 | medals <- NULL
47 | if(d$year == 1992) { # Remove subscript and superscript text
48 | overview <- htmlParse(d$url,encoding="UTF-8")
49 | temp<-getNodeSet(overview, "/*//sup")
50 | removeNodes(temp)
51 | medals <- try(readHTMLTable(overview, header=FALSE, skip.rows=1, which=whichTable, stringsAsFactors = FALSE))
52 | } else
53 | medals <- try(readHTMLTable(d$url, header=FALSE, skip.rows=1, which=whichTable, stringsAsFactors = FALSE))
54 |
55 | medals <- medals[-nrow(medals),]
56 | if(d$year < 1992)
57 | medals <- ddply(medals, .(V1), fixit)
58 | medals <- medals[, -c(1)]
59 | #colnames(medals) <- c('Nation', 'Gold', 'Silver', 'Bronze', 'Total')
60 | medals <- tm <- with(medals, data.frame(
61 | #Nation=str_trim(sapply(strsplit(str_trim(medals$V2), " \\("), '[', 1)),
62 | #NationCode=sapply(strsplit(str_trim(sapply(strsplit(str_trim(medals$V2), " \\)"), '[', 2)), "\\)"), '[',1),
63 | Country=substr(medals$V2, 1, nchar(medals$V2)-6),
64 | CountryCode=substr(medals$V2, nchar(medals$V2)-3, nchar(medals$V2)-1),
65 | Gold=as.numeric(medals$V3), Silver=as.numeric(medals$V4), Bronze=as.numeric(medals$V5),
66 | Total=as.numeric(medals$V6)))
67 | #medals$Rank <- rank(medals$V3)
68 | #medals <- medals[!c("Rank")]
69 | return (medals)
70 | }
71 | }
72 |
73 | year <- seq(1896, 2012, by=4)
74 | url <- paste("http://en.wikipedia.org/wiki/", year, "_Summer_Olympics_medal_table", sep="")
75 | om <- data.frame(year=year, url=url, stringsAsFactors = FALSE) #Summer Olympics medal-table pages
76 | omDS <- ddply(om, .(year), readWikiOT)
77 | write.csv(omDS, "SummerOlympic-Medals-1896-2012.csv",row.names=FALSE)
78 |
79 | ### Read Bollywood.csv
80 | in.file.name <- "datasets/BollywoodCinema-1940-2008.csv"
81 | dt <- data.frame(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE,
82 | stringsAsFactors=FALSE, strip.white=TRUE))
83 | #Get Unique ID
84 | dsID <- paste("pad", substr(as.character(unclass(Sys.time())), 12,16), sep="")
85 | out.file.csv <- paste("pads/data/",dsID, ".csv", sep="")
86 | out.file.JSON <- paste("pads/meta/",dsID, ".JSON", sep="")
87 | out.file.R.dataframes <- paste("pads/cache/", dsID, ".RData", sep="")
88 | write.csv(dt, out.file.csv)
89 | jsonSt <- paste('{
90 | "datasetName": "Bollywood Cinema (1940-2008)",
91 | "id":"', dsID, '",
92 | "dList": {
93 | "Year":', toJSON(levels(dt$Year)), ',
94 | "Director":', toJSON(levels(dt$Director)), '
95 | },
96 | "mList": {
97 | "Genre1":"",
98 | "Genre2": ""
99 | },
100 | "dListType": {
101 | },
102 | "source":"Wikipedia"
103 | }', sep="")
104 | file.out <- file(out.file.JSON, 'wt')
105 | cat(jsonSt, file=file.out, fill = TRUE)
106 | close(file.out)
107 | #Save transformed dataset as data.frame for later reading
108 | assign(dsID, dt)
109 | names <- c(eval(dsID))
110 | save(list=names, file=out.file.R.dataframes)
111 | pads <- rbind(pads, data.table(id=dsID, title="Bollywood Cinema (1940-2008)",
112 | subtitle="Bollywood Movies from 1940 including director, cast and genre.",
113 | records=nrow(dt), analyzed=0, stories=0, json=out.file.JSON, data=out.file.csv,
114 | cache=out.file.R.dataframes, source="Wiki")
115 | )
116 | #url <- paste("http://www.votesmart.org/candidate/evaluations/", 1:50 , sep = "")
117 | #res <- llply(url, function(i) readHTMLtable(i))
--------------------------------------------------------------------------------
/GenerateWDI.R:
--------------------------------------------------------------------------------
1 | # Script to read wdi data and create tables in WDI database
2 | # Make sure to convert file WDI_Description.csv to WDI_Description.txt - this file is not needed in the DB
3 | # Take back up if necessary
4 | # Author: Jitender Aswani, Co-Founder @datadolph.in
5 | # Date: 3/15/2013
6 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
7 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
8 | # All rights reserved.
9 |
10 | setwd("/Users/homemac/R")
11 |
12 | require("RMySQL")
13 | require("plyr")
14 |
15 | source("Util.R")
16 | #
17 | #initiate a connection to wdi database
18 | #
19 |
20 | getWDIDBCon <- function(multi.statements=FALSE) {
21 | if(multi.statements)
22 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost",
23 | client.flag=CLIENT_MULTI_STATEMENTS))
24 | else
25 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost"))
26 | }
27 |
28 | #
29 | #disconnect a database
30 | #
31 | disconnectDB <- function(con) {
32 | #Close Connection
33 | dbDisconnect(con)
34 | }
35 |
36 |
37 | #
38 | #write a data.frame as a table in WDI database
39 | # this function will overwrite the existing table
40 | #
41 |
42 | writeTable <- function(table.name ,table.data) {
43 | #List tables and fields in a table:
44 | con <- getWDIDBCon()
45 | on.exit(disconnectDB(con)) # On function exit, close connection
46 | if(dbExistsTable(con, table.name)){
47 | dbRemoveTable(con, table.name)
48 | dbWriteTable(con, name=table.name, value=table.data, row.names = F, overwrite = T)
49 | } else {
50 | dbWriteTable(con, name=table.name, value=table.data, row.names = F, overwrite = T)
51 | }
52 |
53 | }
54 |
55 |
56 | saveAllWDIData <- function() {
57 | folder.path = "./raw-data/wdi"
58 | filenames <- list.files(path=folder.path, pattern="*.csv")
59 | for(i in filenames) {
60 | filePath <- file.path(folder.path,i)
61 | wdi.name <- unlist(strsplit(i, "\\."))[1] #Temp - use the file name as the title
62 | wdi.df <- readFile(filePath)
63 | #clean up
64 | wdi.df <- wdi.df[rowSums(is.na(wdi.df)) != ncol(wdi.df),] # remove empty rows
65 | wdi.df <- wdi.df[rowSums(is.na(wdi.df)) != ncol(wdi.df)-1,] # remove rows with just one cell filled out
66 | wdi.df <- wdi.df[,colSums(is.na(wdi.df)) != nrow(wdi.df)] # remove empty cols
67 |
68 |     #some cells have newline characters, remove them
69 | #char.vars <- setdiff(char.vars, "W") # remove a column
70 | char.vars <- names(wdi.df)[sapply(wdi.df, is.character)] #select character variables
71 |     for(j in seq_along(char.vars)) {   # separate index so the outer file-loop variable is not shadowed
72 |       wdi.df[char.vars[j]] = sapply(wdi.df[char.vars[j]], function(x) gsub("\\n", " ", x))
73 |     }
74 |
75 | #clean up column names
76 | colnames(wdi.df) <- replaceMetaChars(colnames(wdi.df))
77 | writeTable(wdi.name, wdi.df)
78 | }
79 | }
80 |
81 | saveAllWDIData()
82 |
83 |
84 | testFunctionDoNoRun <- function(){
85 | wdi.df <- read.csv("./raw-data/wdi/WDI_Series.csv", stringsAsFactors=F)
86 | wdi.df <- wdi.df[rowSums(is.na(wdi.df)) != ncol(wdi.df),] # remove empty rows
87 | wdi.df <- wdi.df[,colSums(is.na(wdi.df)) != nrow(wdi.df)] # remove empty cols
88 |
89 | char.vars <- names(wdi.df)[sapply(wdi.df, is.character)] #select character variables
90 | for(i in 1:length(char.vars)) {
91 | wdi.df[char.vars[i]] = sapply(wdi.df[char.vars[i]], function(x) gsub("\\n", " ", x))
92 | }
93 | writeTable("wdi_series", wdi.df)
94 |
95 |
96 | df1 <- as.data.frame(cbind(sapply(df,function(x) {
97 | if(is.character(x)) {
98 |       x <- gsub("\\n", " ", x)  # keep the cleaned value (the result was previously discarded)
99 | }
100 | return(x)
101 | })))
102 | }
103 |
104 | generateCatsFromWDI <- function() {
105 | wdi.cats <- read.csv("./raw-data/wdi/WDI_DD_Categories.csv", stringsAsFactors=F)
106 | #colnames(wdi.cats)
107 | #unique(wdi.cats$Category)
108 | #unique(wdi.cats$Subcategory)
109 | wdi.cats$Category[wdi.cats$Category=="Economic Policy & Debt"] = "Economics"
110 | wdi.cats$Category[wdi.cats$Category=="Labor & Social Protection"] = "Social"
111 | wdi.cats$Category[wdi.cats$Category=="Public Sector"] = "Government"
112 | wdi.cats$Category[wdi.cats$Category=="Private Sector & Trade"] = "Private Sector"
113 |
114 | #write out the cat and subcats for entering into story pads data
115 | dd.cats <- unique(wdi.cats[,c("Category", "Subcategory")])
116 | colnames(dd.cats) <- tolower(colnames(dd.cats))
117 | write.csv(dd.cats, "./pads/PadsCategory.csv", row.names=F)
118 | #insertCatSubcat()
119 | con <- getDBCon()
120 | dd.cats <- dbReadTable(con, "pads_categories")
121 | dd.subcats <- dbReadTable(con, "pads_subcategories")
122 | disconnectDB(con)
123 | #Assign category id
124 | wdi.cats$category_id = 0
125 | for(i in 1:nrow(dd.cats)){
126 | wdi.cats$category_id[which(wdi.cats$Category==dd.cats$category_name[i])] = dd.cats$category_id[i]
127 | }
128 | #trim white spaces from front and end
129 | #wdi.cats$Subcategory <- gsub("^\\s|\\s$","", wdi.cats$Subcategory)
130 | #assign subcat_id
131 | wdi.cats$subcategory_id = 0
132 | for(i in 1:nrow(dd.subcats)){
133 | wdi.cats$subcategory_id[which(wdi.cats$Subcategory==dd.subcats$subcategory_name[i])] = dd.subcats$subcategory_id[i]
134 | }
135 | colnames(wdi.cats) = c("SeriesCode", "Topic", "category_name", "subcategory_name", "category_id", "subcategory_id")
136 | writeTable("wdi_categories", wdi.cats)
137 | }
138 |
--------------------------------------------------------------------------------
/learning/birbal.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | #This version will use data.table in place of plyr.
8 | #library(plyr)
9 | library(RJSONIO)
10 | library(data.table)
11 | #setwd("/Users/homemac/Toga/Alto")
12 |
13 | #Set content type for output
14 | setContentType("text/html")
15 | #Query Engine - qds - query this database, d=dimension, m=measure, g=geo, f=function
16 | qds <- d <- m <- f <- g <- NULL
17 | #d <- ""
18 | #m <- ""
19 | #f <- ""
20 |
21 | #load pre-analyzed and saved dataframe
22 | loadDS <- function(qds) {
23 | if(exists("pads", envir=.GlobalEnv)) {
24 | #lpads <- get("pads", envir=.GlobalEnv)
25 |     #cat(exists(pads$sysID[pads$id==qds], envir=.GlobalEnv), "\n")
26 |     return(get(pads$sysID[pads$id==qds], envir=.GlobalEnv)) #get a local copy in this environment
27 | } else {
28 | print("PADS didn't exist. Check to see if you start up R script in RApache ran correctly.")
29 | }
30 | }
31 |
32 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
33 | ## and aggregate the columns listed in "on"; on can also be a single field
34 |
35 | getResults <- function(dt, byVar, onVar) {
36 | e <- substitute(onVar)
37 |   print(e)  # cat() cannot display a language object
38 | return(dt[,eval(e),by=byVar])
39 | }
40 |
41 |
42 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
43 | ## and aggregate the columns listed in "on"; on can also be a single field
44 | #getCount <- function(df, by, on) {
45 | # return(ddply(df, by, colwise(function(col) {length(unique(col))}, on)))
46 | #}
47 |
48 | getCount<- function(dt, by, on) {
49 |   lds <- dt[, .N, by=by]  # count rows per group (data.table replacement for ddply(..., "nrow"))
50 | colnames(lds) <- c(by,on)
51 | return(lds)
52 | }
53 |
54 | #For counting a column that holds 0/1 values :: very similar to getSum - the answer should be the same
55 | getBinaryCount <- function(df, by, on) {
56 | return(ddply(df, by, colwise(function(x) {length(which(x==1))}, on)))
57 | #length(which(mbDS[,"Test.Century"] ==1))
58 | }
59 |
60 | readRequest <- function() {
61 | #GetString <- str(GET)
62 | #cat(GetString)
63 | qds <<- GET$qds
64 | d <<- GET$d
65 | m <<- GET$m
66 | f <<- GET$f
67 | g <<- GET$g
68 | cat(d, "," ,m, "," ,f, "," ,qds, sep=" ")
69 | #cat("
")
70 | }
71 |
72 | processRequest <- function(fnType, dt){
73 | return(switch(fnType,
74 |                 sum = dt[, sum(get(m), na.rm=TRUE), by=d],
75 |                 count = getCount(dt, d, m),
76 | 'default'
77 | )
78 | )
79 | }
80 |
81 |
82 | library(data.table)
83 |
84 | mbDT <- data.table(mbDS)
85 |
86 | myfunction = function(dt, expr) {
87 | e = substitute(expr)
88 | dt[,eval(e),by=Species]
89 | }
90 |
91 | d <- quote(list(Opposition, Toss))
92 |
93 | setkey(mbDT,Opposition)
94 | #m <- quote(Test.Runs)
95 | #mbDT[Opposition=="England" ,eval(m)]
96 | #m <- quote(sum(Test.Runs))
97 | #mbDT[Opposition=="England" ,eval(m)]
98 |
99 | #m <- quote(list(T=sum(Test.Runs)))
100 | #mbDT[Opposition=="England" ,eval(m)]
101 |
102 | #m <- quote(list(Test.Runs=sum(Test.Runs)))
103 | #mbDT[Opposition=="England" ,eval(m)]
104 |
105 | #m <- quote(list(Test.Runs=sum(Test.Runs)))
106 | #mbDT[,eval(m), by=d]
107 |
108 |
109 | ## Operate on a data.frame, split by "by"; by can be a vector or a single field,
110 | ## and aggregate the columns listed in "on"; on can also be a single field
111 | getSum <- function(dt, onVar, byVar) {
112 | e <- substitute(onVar)
113 | j <- eval(e)
114 | print(j)
115 | dt[,eval(j),by=byVar]
116 | }
117 |
118 | m <- "Test.Runs"
119 | d<- "Opposition"
120 | #expr <- substitute(list(m=sum(m, na.rm=TRUE)))
121 | expr <- paste("list(",m,"=sum(",m,",na.rm=TRUE))", sep="")
122 | getSum(mbDT, paste("sum(",m,")", sep=""), d)
123 |
124 |
125 | writeResponse <- function(responseDF, df) {
126 | sJSON <- paste('{"', m, '":', toJSON(responseDF[,m]), sep="") #JSON String is not closed.
127 |
128 | if(!is.null(g) && g == "t") {
129 | latName <- paste(m, ".Lat", sep="")
130 | lngName <- paste(m, ".Lng", sep="")
131 | sJSON <- paste(sJSON, ', "responseJSON": [', sep="")
132 |     JSON <- with(head(df), paste('{"d":"', responseDF[m],
133 |                                  '", "center": new google.maps.LatLng(',
134 |                                  df[latName][df[m]==responseDF[m]][1], ',', df[lngName][df[m]==responseDF[m]][1],
135 | '), "data":', mbDS$Test.Runs,
136 | '},', sep=""))
137 |
138 | } else
139 | sJSON = paste(sJSON, "}", sep="")
140 | #Write response back
141 | cat(sJSON)
142 | #cat(ls())
143 | rm(list=ls()) #Clean up everything - There has to be a better way than this.
144 | #cat(ls())
145 | }
146 |
147 | writeError <- function() {
148 |   cat(paste('{ "Error":"An Error Occurred :-( "}', sep=""))  # write the error response back
149 | }
150 |
151 | #Step 1 - Read Request, query string
152 | readRequest()
153 |
154 | #Step 2 - load environment
155 | df <- loadDS(qds)
156 |
157 | if(!is.null(df))
158 | {
159 | dt <- data.table(df[order(df[,d]),])
160 | #Step 3 - Process Request
161 |   responseDF <- processRequest(f, dt)
162 | if(!is.null(responseDF)) {
163 | #Step 4 - Write Response
164 |     writeResponse(responseDF, df)
165 | }
166 | else {
167 | writeError();
168 | }
169 | } else {
170 | writeError();
171 | }
172 |
173 |
174 |
--------------------------------------------------------------------------------
/learning/TransformMasterBlasterData.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | rm (list = ls())
8 | setwd("~/Toga")
9 | library("chron")
10 | library("plyr")
11 | library("RJSONIO")
12 | #Get Unique ID
13 | dsID <- paste("pad", substr(as.character(unclass(Sys.time())), 12,16), sep="")
14 | out.file.csv <- paste("Alto/pads/data/",dsID, ".csv", sep="")
15 | out.file.JSON <- paste("Alto/pads/meta/",dsID, ".JSON", sep="")
16 | out.file.R.dataframes <- paste("Alto/pads/cache/", dsID, ".RData", sep="")
17 | in.file.name <- "Alto/datasets/masterblaster/SachinTestRecords.csv"
18 |
19 |
20 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December")
21 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
22 |
23 | # Use Google's Geocoding service and geo code the data
24 | getGeoCode <- function(gcStr, cnPre)
25 | {
26 | library("RJSONIO")
27 | gcStr <- gsub(' ','%20',gcStr)
28 | connectStr <- paste('http://maps.google.com/maps/api/geocode/json?sensor=false&address=',gcStr, sep="")
29 | #print(connectStr)
30 | con <- url(connectStr)
31 | data.json <- fromJSON(paste(readLines(con), collapse=""))
32 | close(con)
33 | data.json <- unlist(data.json)
34 | lat <- data.json["results.geometry.location.lat"]
35 | lng <- data.json["results.geometry.location.lng"]
36 | gcodes <- c(lat, lng)
37 | names(gcodes) <- c(paste(cnPre,".Lat",sep=""),paste(cnPre, ".Lng", sep=""))
38 | return (gcodes)
39 | }
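# --- Added illustration, not part of the original script ---
# getGeoCode() returns a named character vector "<prefix>.Lat" / "<prefix>.Lng".
# Commented out because it calls Google's geocoding web service; the address is made up:
# getGeoCode("Mumbai, India", "Ground")   # c(Ground.Lat="...", Ground.Lng="...")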
40 |
41 | # Read SachinTestRecords.csv
42 | readDataSet <- function(file.name) {
43 | return(
44 | read.csv(file.name, na.strings="-", as.is=TRUE, header=TRUE,
45 | stringsAsFactors=FALSE, strip.white=TRUE)
46 | )
47 | }
48 |
49 | # Transform raw dataset
50 | transformDataSet <- function(df) {
51 | tdf <- mutate(df,
52 | StartDate=as.Date(StartDate, "%m/%d/%Y"),
53 | Day=factor(weekdays(StartDate), levels=lDays, ordered=TRUE),
54 | Month=factor(months(StartDate), levels=lMonths, ordered=TRUE),
55 | Year=years(StartDate),
56 | Ing1_Runs=as.numeric(gsub('\\*', '', Bat1)),
57 | Ing1_NotOut=ifelse(is.na(Bat1), 0, ifelse(grepl("\\*", Bat1),1,0)),
58 | Ing2_Runs=as.numeric(gsub('\\*', '', Bat2)),
59 | Ing2_NotOut=as.factor(ifelse(grepl("\\*", Bat2),1,0)),
60 | Test_Runs=as.numeric(Runs),
61 | Opposition=as.factor(Opposition),
62 | Ground=as.factor(Ground),
63 | Result=as.factor(Result),
64 | Toss=as.factor(Toss),
65 | Country=as.factor(Country),
66 | Ing1_Century=ifelse(is.na(Ing1_Runs), 0, ifelse(Ing1_Runs>=100, 1, 0)),
67 | Ing2_Century=ifelse(is.na(Ing2_Runs), 0, ifelse(Ing2_Runs>=100, 1, 0)),
68 | Test_Century=ifelse((Ing1_Century==1)|(Ing2_Century==1), 1, 0),
69 | Test_DCentury=ifelse((!is.na(Ing1_Runs) & Ing1_Runs >=200)|(!is.na(Ing2_Runs) & Ing2_Runs>=200), 1, 0),
70 | Home_Away=ifelse(!is.na(Country) & Country=="India", "Home", "Away"),
71 | Ground_Country=as.factor(paste(Ground,Country, sep=","))
72 | )
73 | head(tdf)
74 | return(with(tdf, data.frame(TestNo, StartDate, Day, Month,
75 | Year, Ing1_Runs, Ing1_NotOut, Ing2_Runs, Ing2_NotOut, Test_Runs,
76 | Opposition, Ground, Toss, Result,
77 | Ing1_Century, Ing2_Century,
78 | Test_Century, Test_DCentury, Home_Away, Country,
79 | Ground_Country,
80 | laply(Opposition, function(x){getGeoCode(x, "Opposition")}),
81 | laply(Ground_Country, function(x){getGeoCode(x, "Ground")}))
82 | ))
83 | }
84 |
85 | writeTransformedDS <- function(df, file.name) {
86 | write.csv(df, file.name)
87 | }
88 |
89 | writeJSON <- function(df, file.JSON, id) {
90 |
91 | jsonSt <- paste('{
92 | "datasetName": "SachinTestRecords",
93 | "id": "', id, '",
94 | "dList": {
95 | "Year":', toJSON(levels(df$Year)), ',
96 | "Month":', toJSON(levels(df$Month)), ',
97 | "Day":', toJSON(levels(df$Day)), ',
98 | "Opposition":', toJSON(levels(df$Opposition)), ',
99 | "Ground":', toJSON(levels(df$Ground)), ',
100 | "Result":', toJSON(levels(df$Result)), ',
101 | "Toss":', toJSON(levels(df$Toss)),',
102 | "HomeOrAway":', toJSON(levels(df$Home_Away)),
103 | '},
104 | "mList": {
105 | "Ing1_Runs":"",
106 | "Ing2_Runs":"",
107 | "Test_Runs":"",
108 | "Ing1_Century":"",
109 | "Ing2_Century":"",
110 | "Test_Century":""
111 | },
112 | "dListType": {
113 | "Opposition": "Geo",
114 | "Ground": "Geo"
115 | },
116 | "source":"CricInfo"
117 | }')
118 | file.out <- file(file.JSON, 'wt')
119 | cat(jsonSt, file=file.out, fill = TRUE)
120 | close(file.out)
121 | }
122 |
123 | # Read Master Blaster Raw File and Transform it
124 | mbDS <- transformDataSet(readDataSet(in.file.name))
125 |
126 | #Write Transformed data set to a CSV file
127 | writeTransformedDS(mbDS, out.file.csv)
128 |
129 | #Generate JSON with m and d
130 | writeJSON(mbDS, out.file.JSON, dsID)
131 |
132 | #Save transformed dataset as data.frame for later reading
133 | assign(dsID, mbDS)
134 | names <- c(eval(dsID))
135 | save(list=names, file=out.file.R.dataframes)
136 | pads <- rbind(pads, data.frame(id=dsID, title="Master Blaster - Sachin Tendulkar(1940-2008)",
137 | subtitle="This data set explores Sachin Tendulkar's glorious test cricket record.",
138 | records=nrow(mbDS), analyzed=0, stories=0, json=out.file.JSON, data=out.file.csv,
139 | cache=out.file.R.dataframes, source="CricInfo")
140 | )
141 | #save(mbDS, file=out.file.R.dataframes)
142 |
--------------------------------------------------------------------------------
/MongoFns.R:
--------------------------------------------------------------------------------
1 | # Interact with MongoDB
2 | # Author: Jitender Aswani, Co-Founder @datadolph.in
3 | # Date: 3/15/2013
4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
6 | # Packages Used: rmongodb
7 | # All rights reserved.
8 |
9 | require("rmongodb")
10 |
11 | mongo.db <- list(user="", pass="", name="ddfin_dev", system.pads="ddfin_dev.system_pads", host="localhost")
12 |
13 | #
14 | #initiate a connection to admin database
15 | #
16 | getDefaultMongoDBCon <- function() {
17 | mongo <- mongo.create()
18 | if (!mongo.is.connected(mongo)) {
19 | print("Unable to connect. Error code:")
20 | print(mongo.get.err(mongo))
21 | } else
22 | return(mongo)
23 | }
24 |
25 | #
26 | #initiate a connection to a db
27 | # This will create a db if it doesn't exist
28 | #
29 | getMongoDBCon <- function(db.name) {
30 | mongo <- mongo.create(db=db.name)
31 | if (!mongo.is.connected(mongo)) {
32 | cat("Unable to connect. Error code:", "\n")
33 | cat(mongo.get.err(mongo), "\n")
34 | } else
35 | return(mongo)
36 | }
37 |
38 | #
39 | #disconnect
40 | #
41 | disconnectMongoDB <- function(mongo) {
42 | #Close Connection
43 | if (mongo.is.connected(mongo)) {
44 | mongo.destroy(mongo)
45 | }
46 | }
47 |
48 | #
49 | #list all databases in Mongo
50 | #
51 | listAllDBs <- function() {
52 | #List dbs
53 | mongo <- getDefaultMongoDBCon()
54 | on.exit(disconnectMongoDB(mongo)) # On function exit, close connection
55 | cat(mongo.get.databases(mongo), "\n")
56 | }
57 |
58 | #
59 | # drop a db
60 | #
61 | dropDB <- function(db.name) {
62 | mongo <- getDefaultMongoDBCon()
63 | on.exit(disconnectMongoDB(mongo)) # On function exit, close connection
64 | cat(mongo.drop.database(mongo, db.name))
65 | }
66 |
67 | #
68 | #List collections for a given db
69 | #
70 | listAllCollections <- function(db.name) {
71 | mongo <- getDefaultMongoDBCon()
72 | on.exit(disconnectMongoDB(mongo)) # On function exit, close connection
73 | cat(mongo.get.database.collections(mongo, db.name), "\n")
74 | }
75 | #listAllCollections(mongo.db$name)
76 |
77 | #
78 | # drop a collection
79 | #
80 | dropCollection <- function(db.collection){
81 | mongo <- getDefaultMongoDBCon()
82 | on.exit(disconnectMongoDB(mongo))
83 |
84 | cat(mongo.drop(mongo, db.collection), "\n")
85 | }
86 | #dropCollection(mongo.db$system.pads)
87 |
88 | #
89 | # empty a collection
90 | #
91 | emptyCollection <- function(db.collection){
92 | mongo <- getDefaultMongoDBCon()
93 | on.exit(disconnectMongoDB(mongo))
94 |
95 | cat(mongo.remove(mongo, db.collection), "\n")
96 | }
97 | #emptyCollection(mongo.db$system.pads)
98 |
99 |
100 | #
101 | # insert systemPads
102 | #
103 | insertPadDoc <- function(doc){
104 |
105 | if(!is.null(mongo)){
106 | status <- mongo.insert(mongo, mongo.db$system.pads, doc)
107 | if(status){
108 | if(verbose) cat("pad successfully inserted", "\n")
109 | } else {
110 | err <- mongo.get.last.err(mongo, mongo.db$name)
111 | cat(mongo.get.server.err(mongo), "\n")
112 | cat(mongo.get.server.err.string(mongo), "\n")
113 | }
114 | } else {
115 | cat("Connection is null", "\n")
116 | }
117 | mongo.bson.destroy(doc)
118 | }
119 |
120 | #
121 | # update systemPads
122 | #
123 | updatePadDoc <- function(pad.id, doc){
124 | if(!is.null(mongo)){
125 | buf <- mongo.bson.buffer.create()
126 | mongo.bson.buffer.append(buf, "_id", pad.id)
127 | criteria <- mongo.bson.from.buffer(buf)
128 |
129 | status <- mongo.update(mongo, mongo.db$system.pads, criteria, doc)
130 | if(status){
131 | if(verbose) cat("pad successfully updated", "\n")
132 | } else {
133 | err <- mongo.get.last.err(mongo, mongo.db$name)
134 | cat(mongo.get.server.err(mongo), "\n")
135 | cat(mongo.get.server.err.string(mongo), "\n")
136 | }
137 | } else {
138 | cat("Connection is null", "\n")
139 | }
140 | mongo.bson.destroy(doc)
141 | }
142 |
143 | #
144 | #insert a pad into system_pads
145 | # connection object is supplied
146 | #
147 | insertPadToMongo <- function(pad.id, pad){
148 | # Insert document to collection 'system_pads'
149 | doc <- mongo.bson.from.list(list(
150 | "_id"=pad.id,
151 | pad=pad))
152 |
153 | if(verbose) cat("calling insert for pad ", pad.id, "\n")
154 | insertPadDoc(doc)
155 | }
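# --- Added usage sketch, not part of the original script ---
# insertPadToMongo() wraps a pad list in a BSON document keyed by "_id" and inserts it.
# Hypothetical example; assumes the global `mongo` connection and `verbose` flag that
# insertPadDoc() expects, plus a running MongoDB instance:
# mongo <- getMongoDBCon(mongo.db$name); verbose <- TRUE
# insertPadToMongo("pad-demo-001", list(title="Demo pad", records=10))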
156 |
157 | #
158 | # get a single system pad
159 | #
160 | getSingleSystemPad <- function(padId) {
161 |
162 | mongo <- getDefaultMongoDBCon()
163 | on.exit(disconnectMongoDB(mongo))
164 |
165 | buf <- mongo.bson.buffer.create()
166 | mongo.bson.buffer.append(buf, "_id", padId)
167 | query <- mongo.bson.from.buffer(buf)
168 |
169 |   # Find the pad matching this id in collection system_pads
170 | cursor <- mongo.find(mongo, mongo.db$system.pads, query)
171 |
172 |   # Step through the matching records
173 | while (mongo.cursor.next(cursor))
174 | l<- mongo.bson.to.list(mongo.cursor.value(cursor))
175 | #assign("l", l, envir=.GlobalEnv)
176 | #l$pad$default$type = "timeseries"
177 | #updatePadDoc(l$'_id', mongo.bson.from.list(l))
178 | mongo.cursor.destroy(cursor)
179 | }
180 | #getSingleSystemPad("pad136971841842064")
181 |
182 | #
183 | #get pads from system_pads
184 | #
185 | getSystemPads <- function() {
186 |
187 | mongo <- getDefaultMongoDBCon()
188 | on.exit(disconnectMongoDB(mongo))
189 |
190 | # Find the first 100 records in collection system_pads
191 | cursor <- mongo.find(mongo, mongo.db$system.pads, limit=100L)
192 |
193 |   # Step through the matching records and display them
194 | while (mongo.cursor.next(cursor))
195 | print(mongo.cursor.value(cursor))
196 | mongo.cursor.destroy(cursor)
197 | }
198 | #getSystemPads()
199 |
200 | #
201 | #get pads count from system_pads
202 | #
203 | getSystemPadsCount <- function(){
204 | mongo <- getDefaultMongoDBCon()
205 | on.exit(disconnectMongoDB(mongo))
206 |
207 |   # Count the number of pad documents
208 |   # stored in collection system_pads
209 | pads.count <- mongo.count(mongo, mongo.db$system.pads)
210 | cat("total pads: ", pads.count, "\n")
211 | }
212 | #getSystemPadsCount()
213 |
--------------------------------------------------------------------------------
/r_solr_integration.R:
--------------------------------------------------------------------------------
1 | # Author: Rajani Aswani Co-Founder @datadolph.in
2 | # Author: Jitender Aswani
3 | # Date: 4/15/2013
4 | # Description: R & Solr Integration Using Solr's REST APIs
5 | # Packages Used: RCurl, RJSONIO
6 | # Blog Reference: http://www.r-bloggers.com/updated-sentiment-analysis-and-a-word-cloud-for-netflix-the-r-way/
7 | # Download
8 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
9 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
10 | # All rights reserved.
11 |
12 | #
13 | # Load Library
14 | #
15 | require(RCurl)
16 | require(RJSONIO)
17 |
18 | # define solr server address
19 | solrServer <- "http://localhost:8080/solr-dev/"
20 |
21 | #
22 | # get row-wise JSON
23 | #
24 | getRowWiseJson <- function (jsonDT) {
25 | require("RJSONIO")
26 | row.json <- apply(jsonDT, 1, toJSON)
27 | json.st <- paste('[', paste(row.json, collapse=', '), ']')
28 | return (json.st)
29 | }
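#
# Example (sketch): getRowWiseJson() emits one JSON object per data.frame row,
# the shape Solr's /update/json handler expects. Toy data, illustrative only.
#
if(F){
  df <- data.frame(id=c("pad1", "pad2"),
                   title=c("GDP, India", "GDP, Brazil"),
                   stringsAsFactors=FALSE)
  cat(getRowWiseJson(df), "\n")
  # roughly: [ {"id": "pad1", "title": "GDP, India"}, {"id": "pad2", "title": "GDP, Brazil"} ]
}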
30 |
31 | getSolrServer <- function() {
32 | return(solrServer)
33 | }
34 |
35 | getUpdateURL <- function(){
36 | return(paste(getSolrServer(), "update/json?commit=true&wt=json", sep=""))
37 | }
38 |
39 | getCommitURL <- function() {
40 | return(paste(getSolrServer(),"update?commit=true&wt=json", sep=""))
41 | }
42 |
43 | getQueryURL <- function() {
44 | return(paste(getSolrServer(),"select?fl=id&wt=json&indent=true&fq=type:pad&q=", sep=""))
45 | }
46 |
47 | #
48 | # query a field for the text and return docs
49 | #
50 | querySolr <- function(queryText, queryfield="all") {
51 | response <- fromJSON(getURL(paste(getQueryURL(), queryfield, ":", queryText, sep="")))
52 | if(!response$responseHeader$status) #if 0
53 | return(response$response$docs)
54 | }
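#
# Example (sketch): the GET request querySolr("cricket", "tags") builds, given
# the URL helpers above ("cricket" is an illustrative search term only).
#
if(F){
  paste(getQueryURL(), "tags", ":", "cricket", sep="")
  # "http://localhost:8080/solr-dev/select?fl=id&wt=json&indent=true&fq=type:pad&q=tags:cricket"
}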
55 |
56 | #
57 | # delete all indexes from solr server
58 | #
59 | deleteAllIndexes <-function() {
60 | response <- postForm(getUpdateURL(),
61 | .opts = list(postfields = '{"delete": {"query":"*:*"}}',
62 | httpheader = c('Content-Type' = 'application/json',
63 | Accept = 'application/json'),
64 | ssl.verifypeer=FALSE
65 | )
66 | ) #end of PostForm
67 | return(fromJSON(response)$responseHeader[1])
68 | }
69 |
70 | #
71 | # delete all indexes for a document type from solr server
72 | # in this example : type = sports
73 | #
74 | deleteSportsIndexes <-function() {
75 | response <- postForm(getUpdateURL(),
76 | .opts = list(postfields = '{"delete": {"query":"type:sports"}}',
77 | httpheader = c('Content-Type' = 'application/json',
78 | Accept = 'application/json'),
79 | ssl.verifypeer=FALSE
80 | )
81 | ) #end of PostForm
82 | return(fromJSON(response)$responseHeader[1])
83 | }
84 |
85 | #
86 | # delete indexes for a given category within the sports type from solr server
87 | # in this example : type = sports and category: basketball
88 | #
89 | deleteSportsIndexesForCat <-function(category) {
90 | response <- postForm(getUpdateURL(),
91 | .opts = list(postfields =
92 | paste('{"delete": {"query":"type:sports AND category:', category, '"}}', sep=""),
93 | httpheader = c('Content-Type' = 'application/json',
94 | Accept = 'application/json'),
95 | ssl.verifypeer=FALSE
96 | )
97 | ) #end of PostForm
98 | return(fromJSON(response)$responseHeader[1])
99 | }
100 | #deleteSportsIndexesForCat("basketball")
101 |
102 |
103 | #
104 | #Post a new document to Solr
105 | #
106 | postDoc <- function(doc) {
107 | solr_update_url <- getUpdateURL()
108 | jsonst <- toJSON(list(doc))
109 |
110 | response <- postForm(solr_update_url,
111 | .opts = list(postfields = jsonst,
112 | httpheader = c('Content-Type' = 'application/json',
113 | Accept = 'application/json'),
114 | ssl.verifypeer=FALSE
115 | )) #end of PostForm
116 | return(fromJSON(response)$responseHeader[1])
117 | ########## Commit - only if it doesn't work the other way ###############
118 | #return(fromJSON(getURL(getCommitURL())))
119 | }
120 |
121 | #
122 | #Post JSON document to Solr
123 | #
124 | postJSON <- function(jsonst) { #pad.meta.data (i.e. pmd is sent)
125 | solr_update_url <- getUpdateURL()
126 |
127 | response <- try(postForm(solr_update_url,
128 | .opts = list(postfields = jsonst,
129 | httpheader = c('Content-Type' = 'application/json',
130 | Accept = 'application/json'),
131 | ssl.verifypeer=FALSE
132 | )), silent=T) #end of PostForm
133 | if(class(response) %in% c("try-error")){
134 | print(response)
135 | } else {
136 | print(response)
137 | return(fromJSON(response)$responseHeader[1])
138 | }
139 | ########## Commit - only if it doesn't work the other way ###############
140 | #return(fromJSON(getURL(getCommitURL())))
141 | }
142 |
143 | #
144 | #test it out
145 | #
146 | TestAll <- function() {
147 | status <- postJSON(toJSON(fromJSON(paste(readLines("./pads/solr-docs-IPL-T20.json"), collapse="\n"))))
148 |
149 | if(status) print("An error occurred creating indexes.")
150 |
151 | #delete test
152 | status <- deleteAllIndexes() # Status of 0 means that the operation was successfully completed.
153 | if(status) print("An error occurred while deleting all indexes.")
154 |
155 | #add doc test
156 | status <- postDoc(doc) # Status of 0 means that the operation was successfully completed.
157 | if(status) print("An error occurred while adding a document.")
158 |
159 |
160 |   #Query Solr
161 | docs <- querySolr("baseball", "tags")
162 | print(docs)
163 |
164 |   docs <- querySolr("baseball") #if no field is defined, the default field all_text will be searched
165 | print(docs)
166 | }
167 |
--------------------------------------------------------------------------------
/GenerateWDIPADS.R:
--------------------------------------------------------------------------------
1 | # Generate PADS from WDI data
2 | # Get series list and the associated meta data from the database
3 | #
4 | # Author: Jitender Aswani, Co-Founder @datadolph.in
5 | # Date: 3/15/2013
6 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
7 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
8 | # All rights reserved.
9 |
10 | setwd("/Users/homemac/R")
11 | source("CreatePADS.R")
12 | #
13 | #initiate a connection to wdi database
14 | #
15 | getWDIDBCon <- function(multi.statements=FALSE) {
16 | if(multi.statements)
17 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost",
18 | client.flag=CLIENT_MULTI_STATEMENTS))
19 | else
20 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost"))
21 | }
22 |
23 | #
24 | # get data for a given series and country
25 | #
26 |
27 | getSeriesData <- function(series.name, series.country){
28 | con <- getWDIDBCon()
29 | on.exit(disconnectDB(con)) # On function exit, close connection
30 |
31 | sql <- sprintf("select * from wdi.wdi_data where Indicator_Code = '%s' and Country_Name ='%s';", series.name, series.country)
32 | print(sql)
33 | res <- dbSendQuery(con, sql)
34 | series.data <- fetch(res, n = -1)
35 | dbClearResult(res)
36 | if(!is.null(series.data) && ncol(series.data) > 4)
37 | return(series.data[,5:ncol(series.data)])
38 | else
39 | return(NULL)
40 | }
41 | #series.data <- getSeriesData('AG.AGR.TRAC.NO', 'C')
42 |
43 |
44 | padifyWDI <- function(){
45 | pads <- getSystemPads()
46 | pads.cat <- getCategories()
47 | pads.subcat <- getSubcategories()
48 | solr.index.docs <- list()
49 | con <- getWDIDBCon()
50 | #read wdi data
51 | wdi.data <- dbReadTable(con, "wdi_data")
52 |   #read series meta data - first four columns only
53 | wdi.series <- dbReadTable(con, "wdi_series")[,1:4]
54 |   #read category meta data - maps each series to its category and subcategory
55 | wdi.cats <- dbReadTable(con, "wdi_categories")
56 |
57 | series <- list()
58 | series["source"] <- "World Bank"
59 |
60 | for(i in 1:nrow(wdi.series)){
61 | if(i > 39 && i < 46){
62 | series["name"] <- wdi.series$SeriesCode[i]
63 | series["title"] <- wdi.series$Indicator_Name[i]
64 | series["longdef"] <- wdi.series$Long_definition[i]
65 | series["shortdef"] <- wdi.series$Short_definition[i]
66 | wdi.cats.matched <- wdi.cats[wdi.cats$SeriesCode==series$name,]
67 | series["category"] <- wdi.cats.matched$category_name
68 | series["subcategory"] <- wdi.cats.matched$subcategory_name
69 | series["category_id"]<- wdi.cats.matched$category_id
70 | series["subcategory_id"]<- wdi.cats.matched$subcategory_id
71 | #
72 | logMessage(paste("Starting loading series", i, series$name, sep=":"))
73 | print(paste("Starting loading series", i, series$name, sep=":"))
74 |
75 |       #skip a row when NAs make up 90% or more of its columns
76 | all.series.data <- wdi.data[wdi.data$Indicator_Code==series$name,]
77 | for(j in 1:nrow(all.series.data)){
78 | series.data <- all.series.data[j,]
79 | if(!is.null(series.data) &&
80 | ncol(series.data) > 4 &&
81 | sum(is.na(series.data)) < .9 * ncol(series.data)) {
82 |
83 | series["country"] <- series.data$Country_Name
84 |
85 | logMessage(paste("Starting country", series$country, sep=":"))
86 | print(paste("Starting country", series$country, sep=":"))
87 |
88 |
89 | series.data <- t(series.data[,5:ncol(series.data)])
90 | #generate data.frame
91 | series.data <- data.frame(gsub("X", "", rownames(series.data)),
92 | series.data[,1],
93 | row.names=seq(1:nrow(series.data)), stringsAsFactors=FALSE)
94 | #change colnames
95 | colnames(series.data) <- c("date", paste(series$name, series$country, sep="."))
96 |
97 | #Remove NA from series column
98 | series.data <- series.data[!is.na(series.data[2]),]
99 | #Create pad and get pad meta data
100 | pmd <- createPAD(series.data, paste(series$title, series$country, sep=", "),
101 | series$longdef, series$category, series$subcategory,
102 | series$source, paste(series$name, series$country, sep="."),
103 | tolower(paste(series$category, series$subcategory, series$source, series$country, sep=",")))
104 |
105 |
106 | pads <- rbind(pads, list(pmd$id, pmd$title, pmd$desc, pmd$records, pmd$columns, series$category_id,
107 | series$subcategory_id, pmd$analyzed, pmd$stories, pmd$src, pmd$src_file, "", "", pmd$tags))
108 | #
109 | #generate solr doc
110 | #
111 | doc <- list("type"="pad", "id" = pmd$id, "title" = pmd$title, "desc"=pmd$desc, "category"=pmd$category,
112 | "subcategory"=pmd$subcategory, "tags"=pmd$tags)
113 | solr.index.docs[[length(solr.index.docs)+1]] <- doc
114 |
115 | } else {
116 | logMessage(paste(i, series$name, "was empty for ", series$country, sep=":"))
117 | print("This series was empty.")
118 | }
119 | }# inner for loop ends here
120 | }
121 | }# for loop for series ends here
122 |
123 | # now save solr doc
124 | generateSolrDocFromAllDocs(solr.index.docs)
125 |
126 | # Save pads to a csv file
127 | saveData(pads, "./pads/system_pads.csv")
128 |
129 | #now save pads to db
130 | scon <- getDBCon()
131 | dbRemoveTable(scon, "system_pads")
132 | dbWriteTable(scon, name="system_pads", value=pads, row.names = F, overwrite = T)
133 | disconnectDB(scon)
134 |
135 | dumpLogs()
136 | }
137 |
138 | padifyWDI()
139 |
140 |
141 | #use extreme caution when running this function - it removes all meta, cache and data files and empties the system_pads table
142 | deleteEverything <- function(){
143 | unlink("./pads/meta/*.*")
144 | unlink("./pads/cache/*.*")
145 | unlink("./pads/data/*.*")
146 | emptySystemPads()
147 | unlink("./pads/solr-docs.json")
148 | }
149 | #read wdi data
150 | #wdi.data <- dbReadTable(getWDIDBCon(), "wdi_data")[1:10,]
151 | #wdi.data.i <- t(wdi.data)
152 |
--------------------------------------------------------------------------------
/PresedentialElectionsResults.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 2012-9-9
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # Description: Financial Markets and President Elect: Do Financial Markets Favor A Republican Over A Democrat?
6 | # Packages Used: RCurl, RJSONIO
7 | # Blog Reference: http://allthingsbusinessanalytics.blogspot.com/2012/11/financial-markets-and-president-elect.html
8 | # All rights reserved.
9 |
10 | require(XML)
11 | require(plyr)
12 | require(quantmod)
13 |
14 | setwd("C:/mydir")
15 |
16 | #######################################################################
17 | # Get SPY data for US election dates
18 | #######################################################################
19 |
20 | #Presidential elections fall on the first Tuesday after the first Monday of November, every 4 years
21 | election.years <- seq(1952,2012, by=4)
22 | election.months <- seq(as.Date("1952/11/1"), by="4 year", length.out=16)
23 | election.dates <- election.months + ((8-as.POSIXlt(election.months)$wday) %% 7) + 1 # handles years where Nov 1 is itself a Tuesday
24 | election.dates.1dl <- election.dates + 1
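#
# Quick check (sketch) of the offset arithmetic: in 1988, Nov 1 was itself a
# Tuesday, so election day fell a full week later, on Nov 8.
#
if(F){
  nov1 <- as.Date("1988/11/1")
  nov1 + ((8 - as.POSIXlt(nov1)$wday) %% 7) + 1  # "1988-11-08"
}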
25 |
26 | getSymbols("^GSPC",src="yahoo", from="1950-1-1")
27 | colnames(GSPC) <- c("Open", "High", "LOW", "Close", "Volume", "Adjusted.Close")
28 | write.zoo(GSPC, file="pads/data/financials/SP500_historical_OHLC_data.csv", sep=",")
29 |
30 | #Generate Monthly Returns for 2008 and save them as a file
31 | write.zoo(monthlyReturn(GSPC["2008"]), file="pads/data/financials/SP500_2008_monthly_returns.csv", sep=",")
32 |
33 |
34 | # only concerned about close
35 | SPY.close <- Ad(GSPC)
36 |
37 | #system("cat pads/data/financials/SP500_historical_OHLC_data.csv")
38 | spy.daily.returns <- dailyReturn(SPY.close)
39 | #one.day.return <- subset(spy.daily.returns, index(spy.daily.returns) %in% election.dates.1daylater)
40 | #one.day.return$year <- format(index(one.day.return), "%Y")
41 |
42 | getQuote <- function(year, SPY)
43 | {
44 | #Determine election date
45 | election.month <- as.Date(paste(year, "11/1", sep="/"))
46 |   election.date <- election.month + ((8-as.POSIXlt(election.month)$wday) %% 7) + 1
47 | #election.date <- election.dates[which(format(election.dates, "%Y")==year)]
48 |
49 |   #Get dates 1 day, 1 week, 4 weeks, 12 weeks, 52 weeks and one full term after the election day
50 | quote.dates <- c(election.date, election.date+1, election.date+7,
51 | election.date+(4*7), election.date+(12*7), election.date+(52*7), election.date+(50*7*4))
52 | if(year == 2008)
53 | quote.dates[length(quote.dates)] <- Sys.Date()
54 |
55 | # Check to see if the dates exist in SPY index, if not advance or lag by 1 day
56 | quote.dates <- as.Date(sapply(quote.dates, function(x)
57 | if(length(SPY.close[x])){return(as.Date(x))}else{ return(as.Date(x)-1)}))
58 |
59 | #Get the quotes for all these dates
60 | quotes <- SPY[quote.dates]
61 | returns <- sapply(quote.dates,
62 | function(x) (as.numeric(quotes[x])/as.numeric(quotes[quote.dates[1]]) - 1)*100)
63 |   #return the vector of post-election returns for this election year
64 |   return(returns)
65 | }
66 |
67 | #Remove 2012 from this vector
68 | election.years <- election.years[!election.years==2012]
69 | returns <- do.call(cbind, llply(election.years, function(x) getQuote(x, SPY.close)))
70 | colnames(returns) <- election.years
71 | #Remove first row
72 | returns <- returns[-1,]
73 | t.returns <- t(returns)
74 | old.rownames <- rownames(t.returns)
75 | obama.1d.return <- as.numeric(spy.daily.returns["2012-11-07", 1]) * 100
76 | t.returns <- rbind(t.returns, c(obama.1d.return, NA, NA, NA, NA, NA))
77 | rownames(t.returns) <- c(old.rownames, 2012)
78 | colnames(t.returns) <- c("1_day_return", "1_week_return", "4_weeks_return", "12_weeks_return",
79 | "52_weeks_return", "term_return")
80 | write.csv(t.returns, "pads/data/history/SP500 returns after US president elections_1.csv", row.names = TRUE)
81 |
82 | #######################################################################
83 | # Get Presidential Names
84 | #######################################################################
85 |
86 | wiki <- "http://en.wikipedia.org/wiki/United_States_presidential_election"
87 | tables <- readHTMLTable(wiki, stringsAsFactors=FALSE)
88 | #print(names(tables))
89 | wiki.table <- tables[[4]]
90 | #Create a new column by removing all the special chars from the winner column
91 | wiki.table$sub <- gsub("\\*", "", wiki.table$Winner)
92 | #Replace all meta chars
93 | #data$sub <- gsub("[^A-Za-z0-9 ]+", ",", data$sub)
94 | wiki.table$sub <- gsub("\\(|\\)", ",", wiki.table$sub)
95 | #Now split and create a new dataframe
96 | us.prez.elect <- data.frame(do.call("rbind", strsplit(wiki.table$sub, ",", fixed=TRUE)))
97 | drops <- c("X3","X4")
98 | us.prez.elect <- us.prez.elect[,!(names(us.prez.elect) %in% drops)]
99 | election_years <- gsub("[^A-Za-z0-9]+", "", wiki.table$"Election year")
100 | election_years[1] <- 1788
101 | election_years[20] <- 1864
102 | colnames(us.prez.elect) <- c("president", "party")
103 | rownames(us.prez.elect) <- election_years
104 | election.months <- seq(as.Date("1788/11/1"), by="4 year", length.out=(2016-1788)/4)
105 | election.dates <- election.months + ((8-as.POSIXlt(election.months)$wday) %% 7) + 1
106 | us.prez.elect$election_date <- as.Date(election.dates)
107 | write.csv(us.prez.elect, "pads/data/history/US president elects.csv", row.names = TRUE)
108 | us.prez.elect.since.1950 <- subset(us.prez.elect, rownames(us.prez.elect) %in% election.years)
109 |
110 | #
111 | # Merge the two data frames
112 | #
113 | new.col.names <- c('election_year', colnames(us.prez.elect.since.1950), colnames(t.returns))
114 | prez.data <- merge(us.prez.elect.since.1950, t.returns, by=0)
115 | colnames(prez.data) <- new.col.names
116 | write.csv(prez.data, "pads/data/history/US president elects and SP500 returns.csv", row.names = FALSE)
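#
# Note (sketch): merge(x, y, by=0) joins two data frames on their row names; the
# key comes back as a "Row.names" column, renamed to election_year above.
#
if(F){
  a <- data.frame(president=c("Eisenhower", "Kennedy"), row.names=c("1952", "1960"))
  b <- data.frame(return_1d=c(0.4, -0.2), row.names=c("1952", "1960"))
  merge(a, b, by=0)
}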
117 |
118 |
119 | #all.dates <- seq(as.Date("1952-1-1"), Sys.Date(), by="day")
120 | #election.year.dates <- subset(all.dates, years(all.dates) %in% election.years)
121 | #election.months <- subset(election.year.dates, months(election.year.dates)=="November")
122 | #elections.months.days <- subset(election.months, weekdays(election.months) %in% c("Monday", "Tuesday"))
123 | #df <- data.frame(date=elections.months.days, day=weekdays(elections.months.days))
124 | #election.years <- seq(as.Date("1952/1/1"), by="4 year", length.out=16)
125 | #http://statistics.berkeley.edu/classes/s133/dates.html
126 | #sapply(quote.dates, function(x) if(length(SPY.close[x])){cat("True\n")}else{ cat("False\n")})
127 | #x <- SPY.close[paste(election.date, election.date+(52*7), sep="/")]
128 | #d <- dailyReturn(x)
129 | #w <- weeklyReturn(x)
130 | #keeps <- c("y","a")
131 | #DF[keeps]
132 |
--------------------------------------------------------------------------------
/SentimentAnalysis.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 2012-30-1
3 | # Description: Extracts tweets from Twitter and runs sentiment analysis using the list of sentiment words from Hu and Liu
4 | # Packages Used: RCurl, XML, twitteR, RJSONIO, tm, plyr, stringr
5 | # Blog Reference: http://www.r-bloggers.com/updated-sentiment-analysis-and-a-word-cloud-for-netflix-the-r-way/
6 | # Download
7 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
8 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
9 | # All rights reserved.
10 | # Revised Sentiment Analysis using Hu & Liu's library of 6,800 negative and positive words
11 | require(tm); require(plyr); require(stringr); require(twitteR)  # packages used by the functions below
12 | #Populate the list of sentiment words from Hu and Liu (http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html)
13 | huliu.pwords <- scan(paste(SRC_DIR, 'opinion-lexicon/positive-words.txt', sep=""), what='character', comment.char=';')
14 | huliu.nwords <- scan(paste(SRC_DIR,'opinion-lexicon/negative-words.txt', sep=""), what='character', comment.char=';')
15 | # Add some extra words
16 | huliu.nwords <- c(huliu.nwords,'wtf','wait','waiting','epicfail', 'crash', 'bug', 'bugy', 'bugs', 'slow', 'lie')
17 | #Remove some words for example sap and Cloud
18 | huliu.nwords <- huliu.nwords[!huliu.nwords=='sap']
19 | huliu.nwords <- huliu.nwords[!huliu.nwords=='cloud']
20 |
21 | #
22 | # clean up a large character string
23 | #
24 | cleanText <- function(x) {
25 | # tolower
26 | x = tolower(x)
27 | # remove rt
28 | #x = gsub("rt", "", x)
29 | # remove at
30 | x = gsub("@\\w+", "", x)
31 | # remove punctuation
32 | x = gsub("[[:punct:]]", "", x)
33 | # remove control characters
34 | x <- gsub('[[:cntrl:]]', '', x)
35 | # remove numbers
36 | x = gsub("[[:digit:]]", "", x)
37 | # remove links http
38 | x = gsub("http\\w+", "", x)
39 | # remove tabs
40 | x = gsub("[ |\t]{2,}", " ", x)
41 | # remove new lines
42 | x = gsub("[ |\n]{1,}", " ", x)
43 | # remove blank spaces at the beginning
44 | x = gsub("^ ", "", x)
45 | # remove blank spaces at the end
46 | x = gsub(" $", "", x)
47 | return(x)
48 | }
49 |
50 | #
51 | # clean up tweets or any other document for corpus and sentiment analysis
52 | #
53 | cleanContent <- function(content){
54 |   # clean out non-ASCII characters, remove numbers, punctuation, stop words
55 | content <- sapply(content, function(x) iconv(x, "latin1", "ASCII", sub=""))
56 | content <- cleanText(content) # clean up
57 | # remove stop-words
58 | content <- removeWords(content,
59 | c(stopwords("english"), "twitter", "wikipedia"))
60 | return(content)
61 | }
62 | #
63 | # build a generic tag cloud
64 | #
65 | buildTagCloud <- function (content, word.threshold=2){
66 | #cleanup
67 | content <- cleanContent(content)
68 | # make corpus for text mining
69 | content.corpus <- Corpus(VectorSource(content))
70 |
71 | #build a term document
72 | #content.dtm <- TermDocumentMatrix(content.corpus, control = list(stopwords = TRUE, minWordLength = 5))
73 | content.dtm <- TermDocumentMatrix(content.corpus, control = list(minWordLength = 5))
74 | # get a matrix
75 | content.m = as.matrix(content.dtm)
76 | # get word counts in decreasing order
77 | content.words <- sort(rowSums(content.m), decreasing=TRUE)
78 | # create a data frame with words and their frequencies
79 | content.df = data.frame(text=names(content.words), size=content.words)
80 | #write.csv(content.words, "company-word-tag.csv", row.names=F)
81 | return( content.df[content.df$size > word.threshold,])
82 | }
83 |
84 | #
85 | # get sentiment score for each tweet
86 | #
87 | getSentimentScore <- function(tweets) {
88 | scores <- laply(tweets, function(singleTweet) {
89 | tweetWords <- unlist(str_split(tolower(singleTweet), '\\s+'))
90 | # compare our words to the dictionaries of positive & negative terms
91 | # match() returns the position of the matched term or NA, apply is.na to convert to boolean
92 | pos.matches <- !is.na(match(tweetWords, huliu.pwords))
93 | neg.matches <- !is.na(match(tweetWords, huliu.nwords))
94 | # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
95 | score <- sum(pos.matches) - sum(neg.matches)
96 | return(score)
97 | })
98 | return(data.frame(SentimentScore=scores, Tweet=tweets))
99 | }
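#
# Example (sketch): score two toy strings. Assumes the Hu & Liu word lists above
# are loaded and plyr/stringr are attached; the score is simply
# (# positive matches) - (# negative matches) per tweet.
#
if(F){
  getSentimentScore(c("great phone, love the camera",
                      "terrible battery and a bad screen"))
  # the first string should score +2 and the second -2 with the stock word lists
}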
100 |
101 | #
102 | #perform sentiment analysis
103 | #
104 | performSentimentAnalysis <- function(tweets){
105 | #
106 | # perform twitter sentiment analysis for each tweet
107 | #
108 | tweets <- cleanContent(tweets)
109 | # call getSentiment score
110 | ss <- getSentimentScore(tweets)
111 |
112 |   # Flag positive and negative tweets; tweets with a zero score are counted in neither
113 | ss$posTweets <- as.numeric(ss$SentimentScore >=1)
114 | ss$negTweets <- as.numeric(ss$SentimentScore <=-1)
115 |
116 | # Let's summarize now
117 | summary <- list(TweetsFetched=length(ss$SentimentScore),
118 | PositiveTweets=sum(ss$posTweets), NegativeTweets=sum(ss$negTweets),
119 | AverageScore=round(mean(ss$SentimentScore),3))
120 |
121 | # some tweets have no score - positive offsets negative - so the next line is necessary
122 | summary$TweetsWithScore <- summary$PositiveTweets + summary$NegativeTweets
123 |
124 | #Get Sentiment Score
125 | summary$SentimentScore <- round(summary$PositiveTweets/summary$TweetsWithScore, 2)
126 | return(summary)
127 | }
128 |
129 | #
130 | #search twitter using hash tags
131 | #
132 | searchTwitterHashtag <- function(tw.hashtag, certificate.path, how.many=300, what.lang="en") {
133 | tweets <- try(searchTwitter(tw.hashtag, lang=what.lang, n=how.many, cainfo=certificate.path), silent=T)
134 | if("try-error" %in% class(tweets))
135 | return(data.frame(error="Oops an error occurred"))
136 | return(tweets)
137 | }
138 |
139 | #search twitter using handle
140 | searchTwitterHandle <- function(tw.handle, certificate.path, how.many=300) {
141 | tweets <- try(userTimeline(tw.handle, n=how.many, cainfo=certificate.path), silent=T)
142 | if("try-error" %in% class(tweets))
143 | return(list(error="Oops an error occurred"))
144 | return(tweets)
145 | }
146 |
147 | #download ca cert file
148 | downloadCACertFile <- function(){
149 | download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")
150 | }
151 |
152 | # Certain words on the Hu and Liu list may not be relevant for your scenario
153 | # Remove them (via the filter.words vector, defined elsewhere) before scoring
154 | # Filter words
155 | filterWords <- function(words){
156 | if(length(which(words %in% filter.words)) > 0)
157 | words <- words[-which(words %in% filter.words)]
158 | return(words)
159 | }
160 |
161 | # example
162 | #get tweets and perform sentiment analysis
163 | tweets <- searchTwitterHandle(twitter.handle, CERTIFICATE_PATH, 300)
164 | summary <- performSentimentAnalysis(sapply(tweets, function(t) t$getText()))
165 |
166 |
--------------------------------------------------------------------------------
/GenerateCrimePADS.R:
--------------------------------------------------------------------------------
1 | # Generate crime PADS
2 | # Author: Jitender Aswani, Co-Founder @datadolph.in
3 | # Date: 3/15/2013
4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
6 | # All rights reserved.
7 |
8 | source("CreatePADS.R")
9 | verbose <- T
10 | #
11 | # generate
12 | #
13 | generate <- function() {
14 | #initialize system
15 | initializeSystem()
16 | assign("crime.folder.path", "./pads/raw-data/crime", envir=.GlobalEnv)
17 | assign("dataset", "US-Crime", envir=.GlobalEnv)
18 |
19 | #prepare pad meta data
20 | series <- list()
21 | series["source"] <- " BJS, FBI, Uniform Crime Reports"
22 | series["category"] <- "Social"
23 | series["subcategory"] <- "US Crime Data"
24 | series["tags"] <- tolower(paste(series$category, series$subcategory, series$source, "usdoj, Supplementary Homicide Report, USA", sep=","))
25 | series["desc"] <- "Homicide Trends in United States: Data are based on annual estimates of homicide from previously published versions of Crime in the United States. Data for 1989 to 2008 reflect updated homicide estimates from Crime in the United States, 2008. Data for 2009 and 2010 reflect updated homicide estimates from Crime in the United States, 2010."
26 | series["pagetag"] <- "crunchbase"
27 |
28 | series.data <- read.csv(paste(crime.folder.path, "1.csv", sep="/"), stringsAsFactors=F)
29 |
30 | series["title"] <- "Homicide Rate per 100,000 Population (1950-2008)"
31 | padify(series, series.data[c(1,2)])
32 |
33 | series["title"] <- "Homicide Incidents per 100,000 Population (1950-2008)"
34 | padify(series, series.data[c(1,3)])
35 |
36 | series["title"] <- "Homicide Rate & Incidents per 100,000 Population (1950-2008)"
37 | padify(series, series.data)
38 |
39 | series.data <- read.csv(paste(crime.folder.path, "2.csv", sep="/"), stringsAsFactors=F)
40 | series.data[2:4] <- data.frame(apply(series.data[2:4], 2, function(x) as.numeric(gsub("%", "", x, fixed=T))))
41 | series["title"] <- "Victims and Offenders by Age in USA, Rate per 100,000 (1980-2008)"
42 | padify(series, series.data)
43 |
44 | series.data <- read.csv(paste(crime.folder.path, "3.csv", sep="/"), stringsAsFactors=F)
45 | series.data[2:4] <- data.frame(apply(series.data[2:4], 2, function(x) as.numeric(gsub("%", "", x, fixed=T))))
46 | series["title"] <- "Victims and Offenders by Gender in USA (1980-2008)"
47 | padify(series, series.data)
48 |
49 | series.data <- read.csv(paste(crime.folder.path, "4.csv", sep="/"), stringsAsFactors=F)
50 | series.data[2:4] <- data.frame(apply(series.data[2:4], 2, function(x) as.numeric(gsub("%", "", x, fixed=T))))
51 | series["title"] <- "Victims and Offenders by Race in USA (1980-2008)"
52 | padify(series, series.data)
53 |
54 | series.data <- read.csv(paste(crime.folder.path, "2-1.csv", sep="/"), stringsAsFactors=F)
55 | series["title"] <- "Victims and Offenders by Age in USA, Rate per 100,000, (1980-2008)"
56 | padify(series, series.data)
57 |
58 | series.data <- read.csv(paste(crime.folder.path, "3-1.csv", sep="/"), stringsAsFactors=F)
59 | series["title"] <- "Victims and Offenders by Gender in USA, Rate per 100,000 (1980-2008)"
60 | padify(series, series.data)
61 |
62 | series.data <- read.csv(paste(crime.folder.path, "4-1.csv", sep="/"), stringsAsFactors=F)
63 | series["title"] <- "Victims and Offenders by Race in USA, Rate per 100,000 (1980-2008)"
64 | padify(series, series.data)
65 |
66 | # Homicide victimization rates by age, (1980-2008)
67 | series.data <- read.csv(paste(crime.folder.path, "5.csv", sep="/"), stringsAsFactors=F)
68 | title <- "Homicide Victimization Rates By Age"
69 | period <- "(1980-2008)"
70 | colnames(series.data) <- gsub("X", "ages", colnames(series.data))
71 | col.names <- colnames(series.data)
72 | for(i in 2:ncol(series.data))
73 | {
74 | series["title"] <- paste(title, ", ", col.names[i], period, sep="")
75 | print(i)
76 | padify(series, series.data[c(1,i)])
77 | }
78 | series["title"] <- "Homicide Victimization Rates for All Ages (1980-2008)"
79 | padify(series, series.data)
80 |
81 |
82 | # Homicide offending rates by age, 1980-2008
83 | series.data <- read.csv(paste(crime.folder.path, "5-1.csv", sep="/"), stringsAsFactors=F)
84 | title <- "Homicide Offending Rates By Age"
85 | period <- "(1980-2008)"
86 | colnames(series.data) <- gsub("X", "ages", colnames(series.data))
87 | col.names <- colnames(series.data)
88 | for(i in 2:ncol(series.data))
89 | {
90 | series["title"] <- paste(title, ", ", col.names[i], period, sep="")
91 | print(i)
92 | padify(series, series.data[c(1,i)])
93 | }
94 | series["title"] <- "Homicide Offending Rates for All Ages (1980-2008)"
95 | padify(series, series.data)
96 |
97 | #Average age of homicide victims and offenders, 1980-2008
98 | series.data <- read.csv(paste(crime.folder.path, "5-2.csv", sep="/"), stringsAsFactors=F)
99 | title <- "Average Age of Homicide"
100 | period <- "(1980-2008)"
101 | colnames(series.data) <- gsub("X", "ages", colnames(series.data))
102 | col.names <- colnames(series.data)
103 | for(i in 2:ncol(series.data))
104 | {
105 | series["title"] <- paste(title, col.names[i], period, sep=" ")
106 | print(i)
107 | padify(series, series.data[c(1,i)])
108 | }
109 | series["title"] <- "Average Age of Homicide Victims and Offenders (1980-2008)"
110 | padify(series, series.data)
111 |
112 | #Figure 6. Percent of homicides in which offender was known to victim by age of victim, 1980-2008
113 | series.data <- read.csv(paste(crime.folder.path, "11.csv", sep="/"), stringsAsFactors=F)
114 |   series["title"] <- "Percent of Homicides in Which Offender was Known to Victim (1980-2008)"
115 | padify(series, series.data)
116 |
117 | #Figure 7. Number of homicides of children under age 5, by race of victim, 1980-2008
118 | #*Other race includes American Indians, Alaska Natives, Asians, Hawaiians, and other Pacific Islanders.
119 | series.data <- read.csv(paste(crime.folder.path, "12.csv", sep="/"), stringsAsFactors=F)
120 | title <- "Number of homicides of children under age 5, by race of victim"
121 | period <- "(1980-2008)"
122 | col.names <- colnames(series.data)
123 | for(i in 2:ncol(series.data))
124 | {
125 | series["title"] <- paste(title, "-", col.names[i], period, sep=" ")
126 | padify(series, series.data[c(1,i)])
127 | }
128 | series["title"] <- "Number of homicides of children under age 5, by race of victim (1980-2008)"
129 | padify(series, series.data)
130 |
131 | #Figure 8. Homicide victimization rates for children under age 5 by race of victim, 1980-2008
132 | #Rate per 100,000 population Rate per 100,000 population
133 | series.data <- read.csv(paste(crime.folder.path, "12-1.csv", sep="/"), stringsAsFactors=F)
134 | title <- "Rate of homicides of children under age 5, by race of victim"
135 | period <- "(1980-2008)"
136 | col.names <- colnames(series.data)
137 | for(i in 2:ncol(series.data))
138 | {
139 | series["title"] <- paste(title, "-", col.names[i], "per 100,000 population", period, sep=" ")
140 | padify(series, series.data[c(1,i)])
141 | }
142 | series["title"] <- "Rate of homicides of children under age 5, by race of victim, per 100,000 population (1980-2008)"
143 | padify(series, series.data)
144 |
145 | #clean up
146 | cleaupSystem()
147 | updateCatPadCount()
148 | }
149 | generate()
150 |
--------------------------------------------------------------------------------
/FetchData/FetchIPLData.R:
--------------------------------------------------------------------------------
1 | # Author: Jitender Aswani, Co-Founder @datadolph.in
2 | # Date: 3/15/2013
3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
5 | # All rights reserved.
6 |
7 | #http://www.wisdenrecords.com/Records/India/Test/Bowling/Most_Wickets_in_Career.html
8 | #http://www.wisdenrecords.com/Records/India/Test/Batting/Most_Career_Runs.html
9 | #http://www.wisdenrecords.com/Records/Pakistan/Test/Batting/Most_Career_Runs.html
10 | #http://www.wisdenrecords.com/Records/India/Test/Batting/Most_Career_Runs.html
11 | require("RJSONIO")
12 | require("plyr")
13 | library("RCurl")
14 |
15 | init <- function() {
16 | assign("folder.path", "./pads/raw-data/cricket/", envir=.GlobalEnv)
17 | assign("years", 2008:2013, envir=.GlobalEnv)
18 | assign("ipl.base.url", 'http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl', envir=.GlobalEnv)
19 | assign("ipl.end.url.runs", '/stats/player/full/mostRuns.js', envir=.GlobalEnv)
20 | assign("ipl.end.url.wickets", "/stats/player/full/mostWickets.js", envir=.GlobalEnv)
21 | assign("ipl.end.url.standings", "/groupStandings.js", envir=.GlobalEnv)
22 | }
23 |
24 | # Save the JSON
25 | saveStatsData <- function(out.file, jsonSt){
26 | file.out <- file(paste(folder.path, out.file, ".JSON", sep=""), 'wt')
27 | cat(jsonSt, file=file.out, fill = TRUE)
28 | close(file.out)
29 | }
30 |
31 | #write a csv file
32 | saveStatsDataAsCSV <- function(data, out.file){
33 | file.location <- paste(folder.path, out.file, ".csv", sep="")
34 | write.csv(data, file.location, row.names=F)
35 | }
36 |
37 | #
38 | # batting
39 | #
40 | battingStastByPlayer <- function(l){
41 | #return (data.frame(matrix(unlist(l), nrow=1, byrow=T)))
42 | return (data.frame(l$team$fullName, l$player$fullName, l$player$nationality, l$player$dateOfBirth,
43 | matrix(unlist(l$battingStats), nrow=1, byrow=T), stringsAsFactors = F))
44 | }
45 |
46 | getBattingStats <- function(ipl.json, year){
47 | ipl.batting.stats <- ldply(ipl.json$mostRuns[[1]]$topPlayers, battingStastByPlayer)
48 | colnames(ipl.batting.stats) <- c("team", "player", "country", "dob", names(ipl.json$mostRuns[[1]]$topPlayers[[1]]$battingStats))
49 | ipl.batting.stats$hs <- sapply(ipl.batting.stats$hs, function(x) sub("\\*", "", x))
50 | ipl.batting.stats$year <- year
51 | # get age
52 | ipl.batting.stats$age_in_months <- sapply(ipl.batting.stats$dob, function(x) getAge(x, paste(year, "/04/01", sep=""), "m"))
53 | return(ipl.batting.stats)
54 | }
55 |
56 | battingStats <- function(year){
57 | ipl.url <- paste(ipl.base.url, year, ipl.end.url.runs, sep="")
58 | print(ipl.url)
59 | content <- try(getURL(ipl.url), silent=F)
60 | if(class(content) %in% c("try-error")) stop("Can't continue...")
61 | content <- sub("\\);", "", sub(paste("onMostRuns", "\\(", sep=""), "", content))
62 | saveStatsData(paste("ipl", year, "batting-stats", sep="-"), content)
63 | ipl.json <- fromJSON(content)
64 | return(getBattingStats(ipl.json, year))
65 | }
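#
# Example (sketch): the feed above is JSONP - JSON wrapped in onMostRuns(...); -
# so stripping the wrapper leaves plain JSON that fromJSON() can parse.
#
if(F){
  jsonp <- 'onMostRuns({"mostRuns": []});'
  fromJSON(sub("\\);", "", sub("onMostRuns\\(", "", jsonp)))
}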
66 |
67 | battingStatsFromFiles <- function(year){
68 | file.name <- paste(folder.path, paste("ipl", year, "batting-stats", sep="-"), ".json", sep="")
69 | file.out <- file(file.name)
70 | ipl.json <- fromJSON(paste(readLines(file.out), collapse = "\n"))
71 | close(file.out)
72 | #assign("ipl", ipl.json, envir=.GlobalEnv)
73 | return(getBattingStats(ipl.json, year))
74 | }
75 |
76 | #
77 | # Bowling stats
78 | #
79 | bowlingStastByPlayer <- function(l){
80 | #return (data.frame(matrix(unlist(l), nrow=1, byrow=T)))
81 | return (data.frame(l$team$fullName, l$player$fullName, l$player$nationality, l$player$dateOfBirth,
82 | matrix(unlist(l$bowlingStats), nrow=1, byrow=T), stringsAsFactors = F))
83 | }
84 |
85 | getBowlingStats <- function(ipl.json, year){
86 | ipl.bowling.stats <- ldply(ipl.json$mostWickets[[1]]$topPlayers, bowlingStastByPlayer)
87 | colnames(ipl.bowling.stats) <- c("team", "player", "country", "dob", names(ipl.json$mostWickets[[1]]$topPlayers[[1]]$bowlingStats))
88 | ipl.bowling.stats$year <- year
89 | # get age
90 | ipl.bowling.stats$age_in_months <- sapply(ipl.bowling.stats$dob, function(x) getAge(x, paste(year, "/04/01", sep=""), "m"))
91 |
92 | return(ipl.bowling.stats)
93 | }
94 |
95 | bowlingStats <- function(year){
96 | ipl.url <- paste(ipl.base.url, year, ipl.end.url.wickets, sep="")
97 | print(ipl.url)
98 | content <- try(getURL(ipl.url), silent=T)
99 | if(class(content) %in% c("try-error")) stop("Can't continue...")
100 | content <- sub("\\);", "", sub(paste("onMostWickets", "\\(", sep=""), "", content))
101 | saveStatsData(paste("ipl", year, "bowling-stats", sep="-"), content)
102 | ipl.json <- fromJSON(content)
103 | return(getBowlingStats(ipl.json, year))
104 | }
105 |
106 | bowlingStatsFromFiles <- function(year){
107 | file.name <- paste(folder.path, paste("ipl", year, "bowling-stats", sep="-"), ".json", sep="")
108 | file.out <- file(file.name)
109 | ipl.json <- fromJSON(paste(readLines(file.out), collapse = "\n"))
110 | close(file.out)
111 | return(getBowlingStats(ipl.json, year))
112 | }
113 |
114 | #
115 | # standings
116 | #
117 | standingStatsBySeason <- function(l){
118 | return (data.frame(l$position, l$team$fullName, l$played, l$won, l$lost, l$tied, l$noResult, l$points, l$netRunRate))
119 | }
120 |
121 | getStandingStats <- function(ipl.json, year){
122 | ipl.standing.stats <- ldply(ipl.json$groups[[1]]$standings, standingStatsBySeason)
123 | colnames(ipl.standing.stats) <- c("position", "team", "played", "won", "lost", "tied", "noResult", "points", "netrunrate")
124 | ipl.standing.stats$year <- year
125 | return(ipl.standing.stats)
126 | }
127 |
128 | standingStats <- function(year){
129 | ipl.url <- paste(ipl.base.url, year, ipl.end.url.standings, sep="")
130 | print(ipl.url)
131 | content <- try(getURL(ipl.url), silent=F)
132 | if(class(content) %in% c("try-error")) stop("Can't continue...")
133 | content <- sub("\\);", "", sub(paste("onGroupStandings", "\\(", sep=""), "", content))
134 | saveStatsData(paste("ipl", year, "standings-stats", sep="-"), content)
135 | ipl.json <- fromJSON(content)
136 | #assign("ipl", ipl.json, envir=.GlobalEnv)
137 | return(getStandingStats(ipl.json, year))
138 | }
139 |
140 | standingStatsFromFiles <- function(year){
141 | file.name <- paste(folder.path, paste("ipl", year, "standings-stats", sep="-"), ".json", sep="")
142 | file.out <- file(file.name)
143 | ipl.json <- fromJSON(paste(readLines(file.out), collapse = "\n"))
144 | close(file.out)
145 | #assign("ipl", ipl.json, envir=.GlobalEnv)
146 | return(getStandingStats(ipl.json, year))
147 | }
148 |
149 |
150 | #
151 | # get IPL stats
152 | #
153 | getStats <- function(){
154 | init()
155 | #
156 | #get batting stats
157 | #
158 | batting.stats <- adply(years, 1, battingStatsFromFiles)
159 | colnames(batting.stats) <- c("season", colnames(batting.stats)[2:9], "fours", "sixes",
160 | colnames(batting.stats)[12:13], "fiftys", "hundreds",
161 | colnames(batting.stats)[16:ncol(batting.stats)])
162 |
163 | saveStatsDataAsCSV(batting.stats, "IPL-T20-Batting-Stats-For-All-Seasons")
164 |
165 | #
166 | #get bowling stats
167 | #
168 | bowling.stats <- adply(years, 1, bowlingStatsFromFiles)
169 | colnames(bowling.stats) <- c("season", colnames(bowling.stats)[2:10], "fours",
170 | "sixes", colnames(bowling.stats)[13:21], "four_wickets",
171 | "five_wickets", "ten_wickets",
172 | colnames(bowling.stats)[25:ncol(bowling.stats)])
173 |
174 | saveStatsDataAsCSV(bowling.stats, "IPL-T20-Bowling-Stats-For-All-Seasons")
175 |
176 | #
177 | # get group stats
178 | #
179 | group.standings <- adply(years, 1, standingStatsFromFiles)
180 | colnames(group.standings) <- c("season", colnames(group.standings)[2:ncol(group.standings)])
181 | saveStatsDataAsCSV(group.standings, "IPL-T20-Standing-Stats-For-All-Seasons")
182 | }
183 |
184 | if(F){
185 |
186 | #http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2013/groupStandings.js
187 | #http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2012/groupStandings.js
188 |
189 | ipl.json <- fromJSON(paste(readLines(paste(folder.path, "ipl-2013-sample-bat.json", sep="")), collapse = "\n"))
190 | ipl <- read.csv(paste(folder.path, "IPLstats.csv", sep=""), stringsAsFactors=F)
191 |
192 | #http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2012/stats/player/full/mostWickets.js
193 | for(y in years){
194 | aaply(ipl, 1, getStat, y)
195 | }
196 |
197 | ipl.url <- "http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2012/stats/player/full/mostRuns.js"
198 | content <- getURL(ipl.url)
199 | content <- sub("\\);", "", sub("onMostRuns\\(", "", content))
200 |
201 | theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team"
202 | tables <- readHTMLTable(theurl)
203 | n.rows <- unlist(lapply(tables, function(t) dim(t)[1]))
204 | tables[[which.max(n.rows)]]
205 | colnames(ipl.batting.stats) <- sapply(names(unlist(ipl.json$mostRuns[[1]]$topPlayers[[1]])), function(x) {gsub("(.*)\\.", "",x)})
206 | gsub("(.*)\\.", "", "team.fullName")
207 | }
208 |
209 |
210 |
--------------------------------------------------------------------------------
/FetchData/FetchImmigrationData.R:
--------------------------------------------------------------------------------
1 | require(RCurl)
2 | require(data.table)
3 | require(plyr)
4 | setwd("/Users/homemac/dolphy/R/pads/raw-data/immigration")
5 |
6 |
7 | fetchData <- function(url,yr) {
8 | #url <- "http://www.flcdatacenter.com/download/H1B_efile_FY02_text.zip"
9 | #yr <- 2
10 | #temp <- tempfile()
11 | #tempdir()
12 | temp <- paste("temp", yr, ".zip", sep="_")
13 | download.file(url,temp)
14 | #data <- read.csv(unz(temp, read.file))
15 | #write.csv(data,csv.file, row.names=F)
16 | #l.f <- unzip("H1B_efile_FY03_text.zip") #unzips the file
17 | unzip(temp) #unzips the file
18 | unlink(temp)
19 | #"EFILE_FY2006.txt"
20 | #H1B_efile_FY02.txt
21 | }
22 |
23 | fy <- 5:6
24 | u <- paste("http://www.flcdatacenter.com/download/H1B_efile_FY0", fy, "_text.zip", sep="")
25 | sapply(seq_along(u), function(i) fetchData(u[i], fy[i]))
26 |
27 | readTextData <- function() {
28 | filenames <- list.files(path=".", pattern="*.txt")
29 |   for(i in filenames) {
30 | print(i)
31 | #yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(\\./)(.*)_(.*)_(.*)(\\.txt)", "\\4", i)))
32 | yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(.*)_(.*)_(.*)(\\.txt)", "\\3", i)))
33 | filename <- gsub("(.*)(\\.txt)", "\\1", i)
34 | print(yr)
35 | print(filename)
36 | pad <- data.table(read.csv(i, stringsAsFactors=F))
37 | col.names <- colnames(pad)
38 | setnames(pad, col.names, tolower(col.names))
39 | #colnames(pad) <- tolower(colnames(pad))
40 | write.csv(pad,paste(filename, ".csv", sep=""), row.names=F)
41 | }
42 | }
43 |
44 | readCSVData <- function() {
45 | filenames <- list.files(path=".", pattern="*.csv")
46 |
47 | for(i in filenames) {
48 | print(i)
49 | #yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(\\./)(.*)_(.*)_(.*)(\\.txt)", "\\4", i)))
50 | yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(.*)_(.*)_(.*)(\\.csv)", "\\3", i)))
51 | pad <- data.table(read.csv(i, stringsAsFactors=F))
52 | col.names <- colnames(pad)
53 | setnames(pad, col.names, tolower(col.names))
54 | generateStats(pad, yr)
55 | }
56 | }
57 |
58 | generateStats <- function(dt, yr) {
59 | #
60 | # by total
61 | #
62 | dt.total <- dt[,list(year=yr, total=length(approval_status),
63 | approved=length(which(approval_status=="Certified")),
64 | denied=length(which(approval_status=="Denied")),
65 | hold=length(which(approval_status=="Hold")),
66 | pending=length(which(approval_status=="Pending")))]
67 |
68 | # save this to csv
69 | write.csv(dt.total, paste("US_H1B_breakdown_by_status_", yr, ".csv", sep=""), row.names=F)
70 | # bind this to csv
71 | H1B.total <<- rbind(H1B.total, dt.total)
72 |
73 | #
74 | # by company
75 | #
76 | dt.by.company <- dt[, list(year=yr, total=length(approval_status),
77 | approved=length(which(approval_status=="Certified")),
78 | denied=length(which(approval_status=="Denied")),
79 | hold=length(which(approval_status=="Hold")),
80 | pending=length(which(approval_status=="Pending"))),
81 | by=list(employer=name)][order(-total)]
82 | # save this to csv
83 | write.csv(dt.by.company[1:15,], paste("US_H1B_top_companies_", yr, ".csv", sep=""), row.names=F)
84 | # bind this to csv
85 | H1B.by.company <<- rbind(H1B.by.company, dt.by.company)
86 |
87 | #
88 | # by state
89 | #
90 | dt.by.state <- dt[,list(year=yr, total=length(approval_status),
91 | approved=length(which(approval_status=="Certified")),
92 | denied=length(which(approval_status=="Denied")),
93 | hold=length(which(approval_status=="Hold")),
94 | pending=length(which(approval_status=="Pending"))),
95 | by=state][order(-total)]
96 | # save this to csv
97 | write.csv(dt.by.state[1:15,], paste("US_H1B_top_states_", yr, ".csv", sep=""), row.names=F)
98 | # bind this to csv
99 | H1B.by.state <<- rbind(H1B.by.state, dt.by.state)
100 |
101 | #
102 | # by city
103 | #
104 |
105 | dt.by.city <- dt[,list(year=yr, total=length(approval_status),
106 | approved=length(which(approval_status=="Certified")),
107 | denied=length(which(approval_status=="Denied")),
108 | hold=length(which(approval_status=="Hold")),
109 | pending=length(which(approval_status=="Pending"))),
110 | by=city][order(-total)]
111 | # save this to csv
112 | write.csv(dt.by.city[1:15,], paste("US_H1B_top_cites_", yr, ".csv", sep=""), row.names=F)
113 | # bind this to csv
114 | H1B.by.city <<- rbind(H1B.by.city, dt.by.city)
115 |
116 | #
117 | # by title
118 | #
119 | dt.by.title <- dt[,list(year=yr, total=length(approval_status),
120 | approved=length(which(approval_status=="Certified")),
121 | denied=length(which(approval_status=="Denied")),
122 | hold=length(which(approval_status=="Hold")),
123 | pending=length(which(approval_status=="Pending"))),
124 | by=job_title][order(-total)]
125 | # save this to csv
126 | write.csv(dt.by.title[1:15,], paste("US_H1B_top_titles_", yr, ".csv", sep=""), row.names=F)
127 | # bind this to csv
128 | H1B.by.title <<- rbind(H1B.by.title, dt.by.title)
129 |
130 |
131 |
132 | #contribs <- read.csv("path/to/file", colClasses=c(CTRIB_AMT="Currency"))
133 |
134 | #
135 | # by wage
136 | #
137 | dt$wage_offered <- as.numeric(gsub("[\\$,]","",dt$wage_rate_1))
138 | dt$prevailing_wages <- as.numeric(gsub("[\\$,]","",dt$prevailing_wage_1))
139 | dt.by.wage <- dt[,list(year=yr, wage_offered,
140 | prevailing_wages, job_title, employer=name),
141 | by=list(rate_per_1=="Year",
142 | approval_status=="Certified")][,rate_per_1:=NULL][,approval_status:=NULL][order(-wage_offered)][wage_offered < 200000]
143 | # save this to csv
144 | write.csv(dt.by.wage, paste("US_H1B_top_wages_", yr, ".csv", sep=""), row.names=F)
145 | # bind this to csv
146 | H1B.by.wages <<- rbind(H1B.by.wages, dt.by.wage)
147 | }
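#
# Example (sketch): the grouped-count pattern used above, shown on a toy table.
# Only data.table is assumed; column names are illustrative.
#
if(F){
  toy <- data.table(employer=c("A", "A", "B"),
                    approval_status=c("Certified", "Denied", "Certified"))
  toy[, list(total=.N,
             approved=sum(approval_status == "Certified"),
             denied=sum(approval_status == "Denied")),
      by=employer][order(-total)]
}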
148 |
149 | H1B.total <- data.table()
150 | H1B.by.company <- data.table()
151 | H1B.by.state <- data.table()
152 | H1B.by.city <- data.table()
153 | H1B.by.title <- data.table()
154 | H1B.by.wages <- data.table()
155 |
156 | readCSVData()
157 | write.csv(H1B.total, "US_H1B_breakdown_by_status_2002-2007.csv", row.names=F)
158 | write.csv(H1B.by.company, "US_H1B_top_companies_2002-2007.csv", row.names=F)
159 | write.csv(H1B.by.state, "US_H1B_top_states_2002-2007.csv", row.names=F)
160 | write.csv(H1B.by.city, "US_H1B_top_cities_2002-2007.csv", row.names=F)
161 | write.csv(H1B.by.title, "US_H1B_top_titles_2002-2007.csv", row.names=F)
162 | write.csv(H1B.by.wages, "US_H1B_top_wages_2002-2007.csv", row.names=F)
163 |
164 |
165 | #lt <- count(ldf2, "approval_status")
166 |
167 | # 'x' %in% colnames(df)
168 | lt2 <- as.data.frame.matrix(with(ldf2, table(name, approval_status)))
169 | lt2$company <- rownames(lt2)
170 | rownames(lt2) <- seq(1:nrow(lt2))
171 |
172 |
173 |
174 | u <- "http://www.omegahat.org/RCurl/data.gz"
175 |
176 |
177 |
178 |
179 | setClass("Currency")
180 | setAs("character", "Currency",
181 | function(from) as.numeric(sub("$","",from, fixed=TRUE)))
182 |
183 | setClass("CurrencyCommas")
184 | setAs("character", "CurrencyCommas",
185 | function(from) as.numeric(gsub(",", "", from) ) )
186 | gsub("[\\$,]","","$1,075.20 ")
187 | if(url.exists(u)) {
188 | content <- getBinaryURL(u) #read in the https as a binary
189 | dump.file <- file("temp.zip", open = "wb")
190 | writeBin(content, dump.file)
191 | close(dump.file) #use this to close the connection
192 | l <- unzip("temp.zip", list=TRUE) #unzips the file
193 | #dir() #Look at files in the directory. We want "SRC2010.mdb"
194 | unlink("temp.zip") #delete the zip file
195 | # shell.exec("SRC2010.mdb")
196 | }
197 | library(RODBC)
198 | channel <- odbcConnectAccess("http://www.foreignlaborcert.doleta.gov/pdf/quarter_4_2011/H-1b_iCert_LCA_FY2011_Q4.mdb")
199 | # note: odbcConnectAccess() expects a local file path - download the .mdb first
200 | demographics <- sqlFetch(channel, sqtable="Demographic Factors", colnames = FALSE, rownames = FALSE)
201 | ave.class.sz <- sqlFetch(channel, sqtable="Average Class Size", colnames = FALSE, rownames = FALSE)
202 | att.susp <- sqlFetch(channel, sqtable="Attendance and Suspensions", colnames = FALSE, rownames = FALSE)
203 | drop.out <- sqlFetch(channel, sqtable="High School Completers", colnames = FALSE, rownames = FALSE)
204 | staff <- sqlFetch(channel, sqtable="Staff", colnames = FALSE, rownames = FALSE)
205 | close(channel)
206 |
207 |
208 | ## method 2 using curl
209 | CAINFO = paste(system.file(package="RCurl"), "/CurlSSL/ca-bundle.crt", sep = "")
210 |
211 | cookie = 'cookiefile.txt'
212 | curlH = getCurlHandle(
213 | cookiefile = cookie,
214 | useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en - US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6",
215 | header = FALSE,
216 | verbose = TRUE,
217 | netrc = TRUE,
218 | maxredirs = as.integer(20),
219 | followlocation = TRUE,
220 | ssl.verifypeer = TRUE)
221 |
222 |
223 | destfile = "log2.csv"
224 | content = getBinaryURL(url, curl = curlH, cainfo = CAINFO)
225 | ## write to file
226 | writeBin(content, destfile)
227 | ## read from binary object
228 |
229 | ## read from binary object
230 | csv.data2 <- read.csv(textConnection(rawToChar(content)))
231 | head(csv.data2)
232 | csv.data2 == csv.data
--------------------------------------------------------------------------------
/CreatePADS.R:
--------------------------------------------------------------------------------
1 | # Read a CSV file
2 | # Classify the column types
3 | # Determine measures and dimensions
4 | # Generate meta data and store JSON
5 | # Generate a Solr index document and store the information in Solr
6 | # Save the data as cache
7 |
8 | # Author: Jitender Aswani, Co-Founder @datadolph.in
9 | # Date: 3/15/2013
10 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
11 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
12 | # All rights reserved.
13 |
14 | require("RJSONIO")
15 | require("plyr")
16 | require("data.table")
17 | require("stringr")
18 | require("lubridate")
19 | require("zoo")
20 |
21 | source("UtilPADS.R")
22 | source("ClassifyData.R")
23 | source("MySQLFunctions.R")
24 | source("MongoFns.R")
25 | source("DefaultChartForPAD.R")
26 |
27 | # initializeSystem <- function(stack=0){
28 | #
29 | # # turn off scientific formatting - getOption("scipen")
30 | # options(scipen=999)
31 | #
32 | # #initialize solr docs
33 | # #assign("solr.index.docs", list(), envir=.GlobalEnv)
34 | #
35 | # #mongo
36 | # mongo <- getDefaultMongoDBCon()
37 | # assign("mongo", mongo, envir=.GlobalEnv)
38 | #
39 | # #initialize stack
40 | # mysql.db <- list(user="ddfin_dev", pass="BfNdW87Ym9FmcYj7", name="ddfin_dev", host="localhost")
41 | # if(stack == 1) {
42 | # #prod
43 | # mysql.db <- list(user="ddfin_prod", pass="8MQ5CRDzQufHKXTx", name="ddfin_prod", host="localhost")
44 | # }
45 | # assign("mysql.db", mysql.db, envir=.GlobalEnv)
46 | #
47 | # #mysql
48 | # mysql <- getMSDBCon()
49 | # assign("mysql", mysql, envir=.GlobalEnv)
50 | #
51 | # #error counter
52 | # assign("error.count", 0, envir=.GlobalEnv)
53 | #
54 | # #initialize log file
55 | # assign("vec.log", vector(), envir=.GlobalEnv)
56 | #
57 | # #verbose mode
58 | # assign("verbose", F, envir=.GlobalEnv)
59 | #
60 | # #min log
61 | # assign("log.all", F, envir=.GlobalEnv)
62 | # }
63 |
64 | #
65 | # loadStates
66 | #
67 | loadStates <- function(){
68 | states <- data.table(readFile("./pads/raw-data/states_names.csv"))
69 | setkey(states, state_code)
70 | assign("states", states, envir=.GlobalEnv)
71 | }
72 |
73 |
74 |
75 | #
76 | # Initialize Padification Process
77 | #
78 | initializeSystem <- function(stack=0){
79 |
80 | # turn off scientific formatting - getOption("scipen")
81 | options(scipen=999)
82 |
83 |   # day and month factors
84 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December")
85 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
86 | assign("lMonths", lMonths, envir=.GlobalEnv)
87 | assign("lDays", lDays, envir=.GlobalEnv)
88 |
89 | #loadStates
90 | loadStates()
91 |
92 | #initialize solr docs
93 | #assign("solr.index.docs", list(), envir=.GlobalEnv)
94 |
95 | #mongo
96 | mongo <- getDefaultMongoDBCon()
97 | assign("mongo", mongo, envir=.GlobalEnv)
98 |
99 | #initialize stack
100 | mysql.db <- list(user="ddfin_dev", pass="b31nd1$$", name="ddfin_dev", host="localhost")
101 | if(stack == 1) {
102 | #prod
103 | mysql.db <- list(user="ddfin_prod", pass="g0f0r1t", name="ddfin_prod", host="localhost")
104 | }
105 | assign("mysql.db", mysql.db, envir=.GlobalEnv)
106 |
107 | #mysql
108 | mysql <- getMSDBCon()
109 | assign("mysql", mysql, envir=.GlobalEnv)
110 |
111 | #error counter
112 | assign("error.count", 0, envir=.GlobalEnv)
113 |
114 | #initialize log file
115 | assign("vec.log", vector(), envir=.GlobalEnv)
116 |
117 | #verbose mode
118 | assign("verbose", F, envir=.GlobalEnv)
119 |
120 | #min log
121 | assign("log.all", F, envir=.GlobalEnv)
122 | }
123 |
124 | #
125 | #cleanup
126 | #
127 | cleaupSystem <- function() {
128 |
129 | #disconnect Mongo
130 | disconnectMongoDB(mongo)
131 |
132 | # disconnect msdb
133 | disconnectMSDB(mysql)
134 |
135 | #save the logs
136 | dumpLogs()
137 |
138 | # save the solr index to FS for later editing - this is now done through mysql
139 | #persistSolrIndex(solr.index.docs, dataset)
140 | }
141 |
142 | #
143 | # create pad - the padification process starts here
144 | #
145 | createPAD <- function(pad, title, desc, category, subcategory, data.source, source.file,
146 | tags, pagetag) {
147 |
148 | if(ncol(pad) == 0) stop('dolphy can not padify a pad that has zero columns')
149 |
150 |   #only continue if there is one or more columns
151 | if(verbose) print(paste("Starting the padification process for source file", source.file, sep=":"))
152 | logMessage(paste("Starting the padification process for source file", source.file, sep=":"))
153 |
154 | #Very first task, classify pad
155 | pad.classified <- classifyData(pad)
156 |
157 | #if(verbose) print(pad.classified)
158 |
159 | # Generate extra date columns
160 | #if(length(which(col.classes == "Date")) > 0) {
161 | # pad.classified <- generateExtraDateCols(pad.classified, which(col.classes == "Date"))
162 | # col.classes <- laply(pad.classified, getClass)
163 | #}
164 |
165 | meta.data <- list(id="", "title"=title, "desc"=desc,
166 | "records"=as.integer(0),
167 | "columns"=as.integer(0),
168 | "category"=category,
169 | "subcategory"=subcategory,
170 | "src"=data.source,
171 | "src_file"= source.file,
172 | "cache_location"="",
173 | "dList"=list(), "mList"=list(),
174 | "tags"=tags,
175 | "pagetag"=pagetag
176 | )
177 |
178 | #Start filling up the meta data
179 | meta.data$records <- nrow(pad.classified)
180 | meta.data$columns <- ncol(pad.classified)
181 |
182 | #Replace all meta chars from column names including spaces
183 | colnames(pad.classified) <- replaceMetaChars(colnames(pad.classified))
184 |
185 | col.classes <- laply(pad.classified, getClass)
186 | col.names <- colnames(pad.classified)
187 |
188 | #Create a dict (name=value pair)
189 | cols <- list()
190 | cols[col.names]=col.classes
191 | meta.data$dList <- cols[which(unlist(cols)!="numeric")]
192 | meta.data$mList <- cols[which(unlist(cols)=="numeric")]
193 | #meta.data$mList <- names(which(sapply(pad.classified, is.numeric)))
194 | #meta.data$dList <- setdiff(colnames(pad.classified), meta.data$mList)
195 |
196 | # Get unique id for pad
197 | padID <- getPadUID()
198 |   #print(paste("Assigning pad id of ", padID, sep="::"))
199 | logMessage(paste("Assigning pad id of ", padID, sep="::"))
200 | meta.data$id <- padID
201 | #meta.data$meta_location <- getMetaDataFileURL(padID)
202 | meta.data$cache_location <- getCacheDataFileURL(padID)
203 | #meta.data$data_location <- getDataFileURL(padID)
204 |
205 | #Save transformed dataset as data.frame for later reading
206 | assign(padID, pad.classified)
207 | names <- c(eval(padID))
208 |
209 | #get default chart info....
210 | meta.data <- getDefaultChartInfo(pad.classified, meta.data)
211 |
212 |   #Save the cache
213 | if(verbose) print("saving cache")
214 | if(log.all) logMessage("Saving cache...")
215 | save(list=names, file=meta.data$cache_location)
216 |
217 | #Save meta data
218 | #jsonSt <- toJSON(meta.data)
219 | #saveMetaData(meta.data$meta_location, jsonSt)
220 |
221 | #Save underlying data into csv
222 | #saveData(pad.classified, meta.data$data_location)
223 |
224 | if(verbose) print("Padification Completed!")
225 | logMessage("Padification Completed!")
226 | return(meta.data)
227 | }
228 |
229 | #
230 | # padify
231 | #
232 | padify <- function(series, series.data, x.plot.band=NULL, y.plot.band=NULL){
233 | #Remove rows that have NAs
234 | series.data <- series.data[rowSums(is.na(series.data)) != ncol(series.data),]
235 |
236 | # Remove rows that have at least 1 NA
237 | series.data <- series.data[complete.cases(series.data),]
238 |
239 | if("data.table" %in% class(series.data))
240 |     series.data <- as.data.frame(series.data, stringsAsFactors=F)
241 | #create pad
242 | pmd <- try(createPAD(series.data, series$title,
243 | series$desc, series$category, series$subcategory,
244 | series$source, paste(series$name, series$country, sep="."),
245 | series$tags, series$pagetag), silent=T)
246 |
247 | #assign("pmd", pmd, envir=.GlobalEnv)
248 | #
249 | # Add this pad to system pads table in the database, to mongo db and save the cache
250 | #
251 | #check to see if the padification went through
252 | if(class(pmd) %in% c("try-error")) {
253 |     if(verbose) print("couldn't complete the padification process.")
254 | logMessage("couldn't padify the pad...")
255 | assign("error.count", error.count+1, envir=.GlobalEnv)
256 | } else {
257 | if(!is.null(x.plot.band))
258 | pmd$default$xaxis$plotbands <- x.plot.band
259 |
260 | if(!is.null(y.plot.band))
261 | pmd$default$yaxis$plotbands <- y.plot.band
262 |
263 | if(verbose) print("adding the pad to mysql")
264 | if(log.all) logMessage("Saving PAD meta data in the database...")
265 | insertPadToMySQL(pmd)
266 |
267 | # insert into mongo
268 | if(verbose) print("adding the pad to mongo db")
269 |     if(log.all) logMessage("adding the pad to mongo db")
270 | pad <- fromJSON(toJSON(pmd))
271 |
272 | insertPadToMongo(pmd$id, pad)
273 | return (pmd$id)
274 | # #generate solr doc
275 | # if(verbose) print("adding the pad to solr vector")
276 | # if(log.all) print("adding the pad to mongo")
277 | # doc <- list("type"="pad", "id" = pmd$id, "title" = pmd$title, "desc"=pmd$desc, "category"=pmd$category,
278 | # "subcategory"=pmd$subcategory, "tags"=pmd$tags, "author"="system")
279 | # solr.index.docs[[length(solr.index.docs)+1]] <<- doc
280 | }
281 | }
--------------------------------------------------------------------------------
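A minimal usage sketch of the entry points above, assuming this file and the database/utility helpers it calls (getMSDBCon, insertPadToMySQL, insertPadToMongo, getDefaultChartInfo, the Util.R functions) are sourced and reachable; the series descriptor fields mirror what padify()/createPAD() read, and every example value is hypothetical:

# illustrative sketch only -- all values below are hypothetical
initializeSystem(stack=0)   # dev stack: connect mongo/mysql, reset error counter, log vector, flags

# hypothetical series descriptor; the field names are the ones padify()/createPAD() read
series <- list(title="US unemployment rate", desc="Annual unemployment rate, percent",
               category="Economy", subcategory="Labor", source="FRED",
               name="UNRATE", country="US", tags="unemployment,labor", pagetag="economy")
series.data <- data.frame(year=c("2010", "2011", "2012"),
                          rate=c(9.6, 8.9, 8.1), stringsAsFactors=FALSE)

pad.id <- padify(series, series.data)   # classify columns, save the cache, insert into mysql and mongo
cleaupSystem()                          # disconnect both databases and dump the logs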
/ClassifyData.R:
--------------------------------------------------------------------------------
1 | # Classify Data
2 | # Author: Jitender Aswani, Co-Founder @datadolph.in
3 | # Date: 3/15/2013
4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
6 | # All rights reserved.
7 |
8 |
9 |
10 | #
11 | # Generate extra date cols
12 | #
13 | generateExtraDateCols <- function(data, col.index) {
14 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December")
15 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
16 | #data$Day_generated=factor(weekdays(data[,col.index]), levels=lDays, ordered=TRUE)
17 | data$Day_g=factor(weekdays(data[,col.index]), levels=lDays)
18 | #data$Month_generated=factor(months(data[,col.index]), levels=lMonths, ordered=TRUE)
19 | data$Month_g=factor(months(data[,col.index]), levels=lMonths)
20 | data$Quarter_g=as.factor(quarters(data[,col.index]))
21 | data$Year_g=as.factor(years(data[,col.index]))
22 | #data$Year=as.numeric(format(data[,col.index], "%Y"))
23 | #as.numeric(format(date1, "%m"))
24 | #format(date1, "%b") ## Month, char, abbreviated
25 | return(data)
26 | }
27 | #
28 | # Define all supported date formats
29 | #
30 | getAllowedDateFormats <- function() {
31 | #d = '([0 ]?[1-9]|[12][0-9]|3[01])' # 1-31 with leading zero or space for single digits
32 | #m = '([0 ]?[1-9]|1[0-2])' # 01-12 with leading zero or space for single digits
33 | d = '([0][1-9]|[12][0-9]|3[01])' # 1-31 with leading zero for single digits
34 | m = '([0][1-9]|1[0-2])' # 01-12 with leading zero for single digit
35 | Y = '(17[0-9][0-9]|18[0-9][0-9]|19[0-9][0-9]|20[0-9][0-9])' # 1700 - 2099
36 | #Y = '(19[0-9][0-9]|20[0-9][0-9])' # 1900 - 2099
37 | y = '([0-9][0-9])' # 00-99
38 | b='([a-zA-Z]{3})'
39 | B='([a-zA-Z]+)'
40 | #q = '([qQ][1-4])'
41 | #patterns <- c("%Y%m%d", "%m%d%y", "%m%d%Y", "%d%m%y", "%d%m%Y", "%y%m%d")
42 | #patterns <- c("Ymd", "mdy", "mdY", "dmy", "dmY", "ymd", "y", "Y", "by", "bY", "By", "BY")
43 |   #removed %y from the list to avoid numeric columns being identified as dates; moved %Y to the front to avoid a 4-digit year being classified as %d%m%y
44 | patterns <- c("Y","Ymd", "mdy", "mdY", "dmy", "dmY", "ymd","by", "bY", "By", "BY")
45 | #patterns <- c("Y")
46 | separators <- c("", " ", "/", "-", ".")
47 | #strptime {base} /^[a-zA-Z]{3}\s+[0-9]{2}$/
48 | allowed.date.list <- list()
49 |
50 | for (i in 1:length(patterns)) {
51 | for (j in 1:length(separators)) {
52 | p <- patterns[i]
53 | s <- separators[j]
54 | pattern.length <- nchar(p)
55 | chars <- substring(p,seq(1, pattern.length), seq(1, pattern.length))
56 | m.chars <- paste("%", chars, sep="")
57 | format <- paste(m.chars,collapse=s)
58 | if(!s %in% c("", " ")){
59 | s <- paste("\\", s, "", sep="" )
60 | }
61 | regex <- ''
62 | if( pattern.length == 3) {
63 | regex <- paste('^', eval(get(chars[1])), s, eval(get(chars[2])), s, eval(get(chars[3])), '$', sep="")
64 | } else if ( pattern.length == 2) {
65 | regex <- paste('^', eval(get(chars[1])), s, eval(get(chars[2])),'$', sep="")
66 | } else if ( pattern.length == 1) {
67 | regex <- paste('^', eval(get(chars[1])),'$', sep="")
68 | }
69 | allowed.date.list[[regex]] = format
70 | }
71 | }
72 | return (allowed.date.list)
73 | }
74 |
75 | #
76 | # get date format for sample of values
77 | #
78 | getDateFormat <- function(sample.dates) {
79 | allowed.date.list <- getAllowedDateFormats()
80 | #sample.dates <- sample.dates[!is.na(sample.dates)]
81 | for(i in 1:length(allowed.date.list)) {
82 | #Check how many items matced a date pattern
83 | matched <- sum(grepl(names(allowed.date.list)[i], sample.dates))
84 | if(matched == length(sample.dates)) {
85 | #print(names(allowed.date.list)[i])
86 | return(allowed.date.list[[i]])
87 | }
88 | }
89 | return(NULL)
90 | }
91 |
92 | #Test this function
93 | testDates <- function() {
94 | #dates <- c("March 2010","December 2011", "February 2012", "March 2014", "Jan 2013")
95 | dates <- c("1962", "1949", "1932", "1963", "1925")#, 1937, 1912, 1944, 1930, 1958, 1939, 1919, 1915)
96 | #dates <- seq(as.Date("2000/1/1"), by="month", length.out=1000)
97 | #Remove NA: dates <- dates[!is.na(dates)]
98 | date.format <- getDateFormat(dates)
99 | #d <- as.Date(dates, date.format)
100 | if(date.format %in% c("%y", "%Y")) {
101 | dates = as.Date(paste("1 1 ", dates, sep=""), format=paste("%d %m ", date.format, sep=""))
102 | } else if (date.format %in% c("%b%y","%b %y", "%b/%y", "%b-%y", "%b.%y",
103 | "%b%Y", "%b %Y", "%b/%Y", "%b-%Y", "%b.%Y",
104 | "%B%y", "%B %y", "%B/%y", "%B-%y", "%B.%y",
105 | "%B%Y", "%B %Y", "%B/%Y", "%B-%Y", "%B.%Y")) {
106 | sep <- gsub("%[a-zA-Z]{1}(.*)%[a-zA-Z]{1}", "\\1", date.format)
107 | dates = as.Date(paste("1", dates, sep=sep), format=paste("%d", date.format, sep=sep))
108 |
109 | } else {
110 | dates = as.Date(dates, format=date.format)
111 | }
112 | }
113 |
114 | #
115 | # Test if date
116 | #
117 | isDate <- function(col, colname="date") {
118 | allowed.date.headers <- c("date", "time", "index")
119 | #if(tolower(colname) %in% allowed.date.headers || grepl("[date]")) {
120 | # print("date found in the header")
121 | #}
122 | return (getDateFormat(col))
123 | }
124 |
125 | #
126 | # Test if logical
127 | #
128 | isColLogical <- function(col) {
129 | if(length(levels(factor(col))) == 2) {
130 | bool.types <- c('true', 'false', 't', 'f', '1', '0', 'y', 'n', 'yes', 'no')
131 | countBool <- sum(sapply(tolower(col), function(x) x %in% bool.types), na.rm=TRUE)
132 | if (countBool == length(col))
133 | return(TRUE)
134 | else
135 | return(FALSE)
136 | } else
137 | return (FALSE)
138 | }
139 |
140 | #
141 | #test if number
142 | #
143 | isNumber <- function(col) {
144 | countBool <- sum(sapply(col, is.numeric), na.rm=TRUE)
145 | if (countBool == length(col))
146 | return(TRUE)
147 | else
148 | return(FALSE)
149 | }
150 |
151 | #
152 | #Main function which takes a data.frame and attempts to classify each of its columns...
153 | #
154 | classifyData <- function(data) {
155 | #Default sample size
156 | sample.size <- 30
157 | if(nrow(data) <= 30)
158 | sample.size <- nrow(data)
159 |
160 | #Remove rows that have NA
161 |   if(ncol(data) == 1) { # this is an interesting one - if the pad has only one column, subsetting returns a vector rather than a data.frame, hence this branch
162 | col.names <- colnames(data)
163 | data <- as.data.frame(data[rowSums(is.na(data)) != ncol(data),])
164 |
165 | #Take a small sample
166 | data.sample <- as.data.frame(data[sample(1:nrow(data)[1], size=sample.size, replace=FALSE),])
167 | colnames(data) <- col.names
168 | } else {
169 | data <- data[rowSums(is.na(data)) != ncol(data),]
170 | #Take a small sample
171 | data.sample <- data[sample(1:nrow(data)[1], size=sample.size, replace=FALSE),]
172 | }
173 | cls <- sapply(data.sample, class)
174 | #print(cls)
175 | classified <- list()
176 | col.names <- colnames(data.sample)
177 | for (i in 1:ncol(data.sample)) {
178 |
179 | col <- data.sample[,i]
180 | col <- col[!is.na(col)] #remove NA
181 |
182 | #check for boolean
183 | if(isColLogical(col)) {
184 | #storage.mode(data[,i]) = "logical"
185 | data[,i] = as.factor(data[,i])
186 | #classes.classified[[i]] = "logical"
187 | next
188 | }
189 |
190 | #check for date names (in date, time)
191 | date.format <- isDate(col)
192 | #print(date.format)
193 | if(!is.null(date.format)) {
194 | if(date.format %in% c("%y", "%Y")) {
195 | data[,i] = as.Date(paste("1 1 ", data[,i], sep=""), format=paste("%d %m ", date.format, sep=""))
196 | } else if (date.format %in% c("%b%y","%b %y", "%b/%y", "%b-%y", "%b.%y",
197 | "%b%Y", "%b %Y", "%b/%Y", "%b-%Y", "%b.%Y",
198 | "%B%y", "%B %y", "%B/%y", "%B-%y", "%B.%y",
199 | "%B%Y", "%B %Y", "%B/%Y", "%B-%Y", "%B.%Y")) {
200 | sep <- gsub("%[a-zA-Z]{1}(.*)%[a-zA-Z]{1}", "\\1", date.format)
201 | data[,i] = as.Date(paste("1", data[,i], sep=sep), format=paste("%d", date.format, sep=sep))
202 | } else {
203 | data[,i] = as.Date(data[,i], format=date.format)
204 |         data <- generateExtraDateCols(data, i)
205 | }
206 | next
207 | }
208 |
209 | #check for int
210 | if (isNumber(col)){
211 | data[,i] = as.numeric(data[,i])
212 | next
213 | }
214 | #check for factor
215 |
216 | }
217 | if(verbose) print("Classification Done!")
218 | logMessage("Classification Done!")
219 |
220 | return(data)
221 | }
222 |
223 |
224 | testClassification <- function() {
225 |
226 | #l <- c(1,0,1,0,0,0,1,0)
227 | #z <- sample(c(TRUE,FALSE),1000000,rep=TRUE)
228 | #z <- tolower(z)
229 |   #Generates a matrix
230 | #date.patterns <- unlist(
231 | # lapply(separtors, function(y)
232 | # lapply(patterns, function(x) paste(substring(x,seq(1,nchar(x), 2), seq(2,nchar(x), 2)),
233 | # collapse=y)
234 | # )))
235 |
236 | #sample.df <- data.frame(bigmove=sample(c("y","n"),1000,rep=TRUE),
237 | # date=seq(as.Date("2000/1/1"), by="month", length.out=1000),
238 | # value=rep(1:100,100), stringsAsFactors = FALSE)
239 |
240 | sample.df <- data.frame(dates=c("1-31-2012", "1-1-2011", "1-1-2010", "1-1-2009"), pass=c(1,0,1,0), name=c("a", "bsd", "adaads", "dafds"), stringsAsFactors = FALSE)
241 | print(sapply(sample.df, class))
242 | data <- classifyData(sample.df)
243 | print(sapply(data, class))
244 | }
245 |
246 | if(FALSE)
247 | {
248 |   #Symbol   Meaning                     Example
249 |   #%d       day of month (01-31)        01-31
250 |   #%a       abbreviated weekday         Mon
251 |   #%A       unabbreviated weekday       Monday
252 |   #%m       month as a number (01-12)   01-12
253 |   #%b       abbreviated month           Jan
254 |   #%B       unabbreviated month         January
255 |   #%y       2-digit year                07
256 |   #%Y       4-digit year                2007
257 |   #
258 |   # metaChar = c("$","*","+",".","?","[","^","{","|","(","\\")
259 |   #
260 |   #`~!@#$%^&*()_|+\-=?;:'",.<>\{\}\[\]\\\/]
261 | 
262 | }
--------------------------------------------------------------------------------
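For a quick feel of what the date-detection helpers above produce, a small hedged sketch (the sample vectors are made up and the printed formats are indicative):

# illustrative sketch only
formats <- getAllowedDateFormats()
head(names(formats))     # regexes tried against a sample of values; the first one matches a bare 4-digit year
head(unlist(formats))    # corresponding strptime formats, e.g. "%Y", "%Y%m%d", "%Y %m %d"

getDateFormat(c("March 2010", "December 2011"))   # all values match the month-name/year pattern -> "%B %Y"
getDateFormat(c("apples", "oranges"))             # nothing matches -> NULL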
/FetchData/GenerateFlightStats.R:
--------------------------------------------------------------------------------
1 | #
2 | # Generate Flight Stats
3 | #
4 | initForStats <- function(){
5 | assign("flights.folder.path", "./pads/raw-data/flights/", envir=.GlobalEnv)
6 | assign("verbose", T, envir=.GlobalEnv)
7 | flights.files <- list.files(path=flights.folder.path, pattern="*.csv.gz")
8 | assign("flights.files", flights.files, envir=.GlobalEnv)
9 | assign("period", length(flights.files), envir=.GlobalEnv)
10 | }
11 |
12 |
13 | getFlightsStatusByAirlines <- function(flights, yr){
14 | #
15 | # get flights stats By airlines
16 | #
17 | if(verbose) cat("Getting stats for airlines:", '\n')
18 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)),
19 | flights=length(origin),
20 | flights_cancelled=sum(cancelled, na.rm=T),
21 | flights_diverted=sum(diverted, na.rm=T),
22 | flights_departed_late=length(which(depdelay > 0)),
23 | flights_arrived_late=length(which(arrdelay > 0)),
24 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
25 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
26 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
27 | miles_traveled=sum(distance, na.rm=T)
28 | ),
29 | by=uniquecarrier][, year:=yr]
30 | #change col order
31 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)]))
32 | #save this data
33 | saveData(airlines.stats, paste(flights.folder.path, "stats/5/airlines_stats_", yr, ".csv", sep=""))
34 | #clear up space
35 | rm(airlines.stats)
36 |
37 | #
38 | # get flights stats By airlines by month
39 | #
40 | if(verbose) cat("Getting stats for airlines by month:", '\n')
41 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)),
42 | flights=length(origin),
43 | flights_cancelled=sum(cancelled, na.rm=T),
44 | flights_diverted=sum(diverted, na.rm=T),
45 | flights_departed_late=length(which(depdelay > 0)),
46 | flights_arrived_late=length(which(arrdelay > 0)),
47 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
48 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
49 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
50 | miles_traveled=sum(distance, na.rm=T)
51 | ),
52 | by=list(uniquecarrier, month)][, year:=yr]
53 | #change col order
54 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)]))
55 | #save this data
56 | saveData(airlines.stats, paste(flights.folder.path, "stats/6/airlines_stats_monthly_", yr, ".csv", sep=""))
57 | #clear up space
58 | rm(airlines.stats)
59 |
60 | }
61 |
62 | getFlightsStatsByAirport <- function(flights, yr){
63 | #
64 | # get flights stats By airport
65 | #
66 | if(verbose) cat("Getting stats for airport:", '\n')
67 | airport.stats <- flights[, list(flights=length(uniquecarrier),
68 | flights_cancelled=sum(cancelled, na.rm=T),
69 | flights_departed_late=length(which(depdelay > 0)),
70 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
71 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
72 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)]))
73 | ),
74 | by=origin][, year:=yr]
75 | #change col order
76 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)]))
77 | #save this data
78 | saveData(airport.stats, paste(flights.folder.path, "stats/3/airport_stats_", yr, ".csv", sep=""))
79 | #clear up space
80 | rm(airport.stats)
81 |
82 | #
83 | # get flights stats By airport by month
84 | #
85 | if(verbose) cat("Getting stats for airport by month:", '\n')
86 | airport.stats <- flights[, list(flights=length(uniquecarrier),
87 | flights_cancelled=sum(cancelled, na.rm=T),
88 | flights_departed_late=length(which(depdelay > 0)),
89 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
90 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
91 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)]))
92 | ), by=list(origin, month)][, year:=yr]
93 | #change col order
94 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)]))
95 | #save this data
96 | saveData(airport.stats, paste(flights.folder.path, "stats/4/airport_stats_monthly_", yr, ".csv", sep=""))
97 | #clear up space
98 | rm(airport.stats)
99 | }
100 |
101 | getFlightStatsForYear <- function(flights, yr){
102 | #
103 | # starting with flights data at aggregate level for every year
104 | #
105 | if(verbose) cat("Getting flight stats: ", '\n')
106 | flights.stats <- flights[, list(airlines = length(unique(uniquecarrier)),
107 | flights=length(uniquecarrier),
108 | flights_cancelled=sum(cancelled, na.rm=T),
109 | flights_diverted=sum(diverted, na.rm=T),
110 | flights_departed_late=length(which(depdelay > 0)),
111 | flights_arrived_late=length(which(arrdelay > 0)),
112 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
113 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
114 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
115 | miles_traveled=sum(distance, na.rm=T),
116 | dep_airports=length(unique(origin)),
117 | arr_airports=length(unique(dest)),
118 | all_airports=length(union(unique(origin), unique(dest))),
119 | flights_delayed_reason_carrier=sum(!is.na(carrierdelay)),
120 | flights_delayed_reason_weather=sum(!is.na(weatherdelay)),
121 | flights_delayed_reason_security=sum(!is.na(securitydelay))
122 | )][, year:=yr]
123 | #save this data
124 | saveData(flights.stats, paste(flights.folder.path, "stats/1/flights_stats_", yr, ".csv", sep=""))
125 | #clear up space
126 | rm(flights.stats)
127 |
128 | #
129 | # get flights stats By month for every year
130 | #
131 | if(verbose) cat("Getting flight stats by month: ", '\n')
132 | flights.stats.month <- flights[, list(airlines = length(unique(uniquecarrier)),
133 | flights=length(uniquecarrier),
134 | flights_cancelled=sum(cancelled, na.rm=T),
135 | flights_diverted=sum(diverted, na.rm=T),
136 | flights_departed_late=length(which(depdelay > 0)),
137 | flights_arrived_late=length(which(arrdelay > 0)),
138 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
139 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
140 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
141 | miles_traveled=sum(distance, na.rm=T)
142 | ),
143 | by=month][, year:=yr]
144 | #change col order
145 | setcolorder(flights.stats.month, c("year", colnames(flights.stats.month)[-ncol(flights.stats.month)]))
146 | #save this data
147 | saveData(flights.stats.month, paste(flights.folder.path, "stats/2/flights_stats_by_month_", yr, ".csv", sep=""))
148 | #clear up space
149 | rm(flights.stats.month)
150 |
151 | }
152 |
153 | mapFlightStats <- function(){
154 | for(j in 1:period) {
155 | if( j > 2) {
156 | yr <- as.integer(gsub("[^0-9]", "", gsub("(.*)(\\.csv)", "\\1", flights.files[j])))
157 | flights.data.file <- paste(flights.folder.path, flights.files[j], sep="")
158 | if(verbose) cat(yr, ": Reading : ", flights.data.file, "\n")
159 | #flights.data.file <- "./pads/raw-data/flights/1987.csv.gz"
160 | flights <- data.table(read.csv(flights.data.file, stringsAsFactors=F))
161 | col.names <- colnames(flights)
162 | setnames(flights, col.names, tolower(col.names))
163 | flights <- flights[, list(year, month, uniquecarrier, origin,
164 | dest, cancelled, diverted, depdelay,
165 | arrdelay, distance, carrierdelay,
166 | weatherdelay,securitydelay)]
167 | setkeyv(flights, c("year", "uniquecarrier", "dest", "origin", "month"))
168 | if(verbose) cat("Starting analysis on: ", yr, "\n")
169 | getFlightStatsForYear(flights, yr)
170 | getFlightsStatusByAirlines(flights, yr)
171 | getFlightsStatsByAirport(flights, yr)
172 | }
173 | }
174 | }
175 |
176 |
177 |
178 | #
179 | #reduce all results
180 | #
181 | reduceFlightStats <- function(){
182 | n <- 1:6
183 | folder.path <- paste("./pads/raw-data/flights/stats/", n, "/", sep="")
184 | print(folder.path)
185 | for(i in n){
186 | filenames <- paste(folder.path[i], list.files(path=folder.path[i], pattern="*.csv"), sep="")
187 | dt <- do.call("rbind", lapply(filenames, read.csv, stringsAsFactors=F))
188 | print(nrow(dt))
189 | saveData(dt, paste("./pads/raw-data/flights/stats/", i, ".csv", sep=""))
190 | }
191 | }
192 |
193 | #
194 | # Run this job - initialize, generate stats for individual years and then aggregate them together
195 | # to get a single file each for flights, airports and airlines
196 | #
197 | runJob <- function(){
198 | initForStats()
199 | #mapFlightStats()
200 | #reduceFlightStats()
201 | }
--------------------------------------------------------------------------------
/flights_map_reduce.R:
--------------------------------------------------------------------------------
1 | # Authors: Jitender Aswani, Co-Founder @ datadolph.in
2 | # Date: 2013-15-5
3 | # Description: The Airline data set consists of flight arrival and departure details for all commercial flights from 1987 to 2008.
4 | # The approximately 120MM records (CSV format) occupy 12GB space. Data can be downloaded from here: http://stat-computing.org/dataexpo/2009/
5 | # This R code simulates Map-Reduce functionality to analyze 22 years of historical data on flights.
6 | # Packages Used: data.table & plyr
7 | # Blog Reference: http://blog.datadolph.in/2013/06/big-data-analysis-performance-story-of-chicago-ohare-airport/
8 | # Blog Reference: http://allthingsr.blogspot.com/2013/06/simulating-map-reduce-in-r-for-big-data.html
9 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
10 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
11 | # All rights reserved.
12 |
13 | #
14 | # initialize and read all 22 compressed CSV files
15 | #
16 | initForStats <- function(){
17 | assign("flights.folder.path", "./raw-data/flights/", envir=.GlobalEnv)
18 | assign("verbose", T, envir=.GlobalEnv)
19 | flights.files <- list.files(path=flights.folder.path, pattern="*.csv.gz")
20 | assign("flights.files", flights.files, envir=.GlobalEnv)
21 | assign("period", length(flights.files), envir=.GlobalEnv)
22 | }
23 |
24 | #
25 | # get flights stats By airlines
26 | #
27 | getFlightsStatusByAirlines <- function(flights, yr){
28 | #
29 | # by Year
30 | #
31 | if(verbose) cat("Getting stats for airlines:", '\n')
32 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)),
33 | flights=length(origin),
34 | flights_cancelled=sum(cancelled, na.rm=T),
35 | flights_diverted=sum(diverted, na.rm=T),
36 | flights_departed_late=length(which(depdelay > 0)),
37 | flights_arrived_late=length(which(arrdelay > 0)),
38 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
39 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
40 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
41 | miles_traveled=sum(distance, na.rm=T)
42 | ),
43 | by=uniquecarrier][, year:=yr]
44 | #change col order
45 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)]))
46 | #save this data
47 | saveData(airlines.stats, paste(flights.folder.path, "stats/5/airlines_stats_", yr, ".csv", sep=""))
48 | #clear up space
49 | rm(airlines.stats)
50 |
51 | #
52 | # by month
53 | #
54 | if(verbose) cat("Getting stats for airlines by month:", '\n')
55 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)),
56 | flights=length(origin),
57 | flights_cancelled=sum(cancelled, na.rm=T),
58 | flights_diverted=sum(diverted, na.rm=T),
59 | flights_departed_late=length(which(depdelay > 0)),
60 | flights_arrived_late=length(which(arrdelay > 0)),
61 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
62 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
63 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
64 | miles_traveled=sum(distance, na.rm=T)
65 | ),
66 | by=list(uniquecarrier, month)][, year:=yr]
67 | #change col order
68 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)]))
69 | #save this data
70 | saveData(airlines.stats, paste(flights.folder.path, "stats/6/airlines_stats_monthly_", yr, ".csv", sep=""))
71 | #clear up space
72 | rm(airlines.stats)
73 |
74 | }
75 |
76 | #
77 | # get flights stats By airport
78 | #
79 | getFlightsStatsByAirport <- function(flights, yr){
80 | #
81 | # by year
82 | #
83 | if(verbose) cat("Getting stats for airport:", '\n')
84 | airport.stats <- flights[, list(flights=length(uniquecarrier),
85 | flights_cancelled=sum(cancelled, na.rm=T),
86 | flights_departed_late=length(which(depdelay > 0)),
87 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
88 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
89 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)]))
90 | ),
91 | by=origin][, year:=yr]
92 | #change col order
93 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)]))
94 | #save this data
95 | saveData(airport.stats, paste(flights.folder.path, "stats/3/airport_stats_", yr, ".csv", sep=""))
96 | #clear up space
97 | rm(airport.stats)
98 |
99 | #
100 | # by month
101 | #
102 | if(verbose) cat("Getting stats for airport by month:", '\n')
103 | airport.stats <- flights[, list(flights=length(uniquecarrier),
104 | flights_cancelled=sum(cancelled, na.rm=T),
105 | flights_departed_late=length(which(depdelay > 0)),
106 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
107 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
108 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)]))
109 | ), by=list(origin, month)][, year:=yr]
110 | #change col order
111 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)]))
112 | #save this data
113 | saveData(airport.stats, paste(flights.folder.path, "stats/4/airport_stats_monthly_", yr, ".csv", sep=""))
114 | #clear up space
115 | rm(airport.stats)
116 | }
117 |
118 | #
119 | # get flights stats
120 | #
121 | getFlightStatsForYear <- function(flights, yr){
122 | #
123 | # for every year
124 | #
125 | if(verbose) cat("Getting flight stats: ", '\n')
126 | flights.stats <- flights[, list(airlines = length(unique(uniquecarrier)),
127 | flights=length(uniquecarrier),
128 | flights_cancelled=sum(cancelled, na.rm=T),
129 | flights_diverted=sum(diverted, na.rm=T),
130 | flights_departed_late=length(which(depdelay > 0)),
131 | flights_arrived_late=length(which(arrdelay > 0)),
132 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
133 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
134 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
135 | miles_traveled=sum(distance, na.rm=T),
136 | dep_airports=length(unique(origin)),
137 | arr_airports=length(unique(dest)),
138 | all_airports=length(union(unique(origin), unique(dest))),
139 | flights_delayed_reason_carrier=sum(!is.na(carrierdelay)),
140 | flights_delayed_reason_weather=sum(!is.na(weatherdelay)),
141 | flights_delayed_reason_security=sum(!is.na(securitydelay))
142 | )][, year:=yr]
143 | #save this data
144 | saveData(flights.stats, paste(flights.folder.path, "stats/1/flights_stats_", yr, ".csv", sep=""))
145 | #clear up space
146 | rm(flights.stats)
147 |
148 | #
149 | # by month for every year
150 | #
151 | if(verbose) cat("Getting flight stats by month: ", '\n')
152 | flights.stats.month <- flights[, list(airlines = length(unique(uniquecarrier)),
153 | flights=length(uniquecarrier),
154 | flights_cancelled=sum(cancelled, na.rm=T),
155 | flights_diverted=sum(diverted, na.rm=T),
156 | flights_departed_late=length(which(depdelay > 0)),
157 | flights_arrived_late=length(which(arrdelay > 0)),
158 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]),
159 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])),
160 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])),
161 | miles_traveled=sum(distance, na.rm=T)
162 | ),
163 | by=month][, year:=yr]
164 | #change col order
165 | setcolorder(flights.stats.month, c("year", colnames(flights.stats.month)[-ncol(flights.stats.month)]))
166 | #save this data
167 | saveData(flights.stats.month, paste(flights.folder.path, "stats/2/flights_stats_by_month_", yr, ".csv", sep=""))
168 | #clear up space
169 | rm(flights.stats.month)
170 |
171 | }
172 |
173 | #
174 | #map all calculations
175 | #
176 | mapFlightStats <- function(){
177 | for(j in 1:period) {
178 | if( j > 2) {
179 | yr <- as.integer(gsub("[^0-9]", "", gsub("(.*)(\\.csv)", "\\1", flights.files[j])))
180 | flights.data.file <- paste(flights.folder.path, flights.files[j], sep="")
181 | if(verbose) cat(yr, ": Reading : ", flights.data.file, "\n")
182 | flights <- data.table(read.csv(flights.data.file, stringsAsFactors=F))
183 | col.names <- colnames(flights)
184 | setnames(flights, col.names, tolower(col.names))
185 | flights <- flights[, list(year, month, uniquecarrier, origin,
186 | dest, cancelled, diverted, depdelay,
187 | arrdelay, distance, carrierdelay,
188 | weatherdelay,securitydelay)]
189 | setkeyv(flights, c("year", "uniquecarrier", "dest", "origin", "month"))
190 | if(verbose) cat("Starting analysis on: ", yr, "\n")
191 | getFlightStatsForYear(flights, yr)
192 | getFlightsStatusByAirlines(flights, yr)
193 | getFlightsStatsByAirport(flights, yr)
194 | }
195 | }
196 | }
197 |
198 |
199 | #
200 | #reduce all results
201 | #
202 | reduceFlightStats <- function(){
203 | n <- 1:6
204 | folder.path <- paste("./raw-data/flights/stats/", n, "/", sep="")
205 | print(folder.path)
206 | for(i in n){
207 | filenames <- paste(folder.path[i], list.files(path=folder.path[i], pattern="*.csv"), sep="")
208 | dt <- do.call("rbind", lapply(filenames, read.csv, stringsAsFactors=F))
209 | print(nrow(dt))
210 | saveData(dt, paste("./raw-data/flights/stats/", i, ".csv", sep=""))
211 | }
212 | }
213 |
214 | #
215 | # Run this job - initialize, generate stats for individual years and then aggregate them together
216 | # to get single file for flights, airports and airlines
217 | #
218 | runJob <- function(){
219 | initForStats()
220 | mapFlightStats()
221 | reduceFlightStats()
222 | }
--------------------------------------------------------------------------------
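How the job above is meant to be driven, sketched under the assumptions already stated in the header comment (data.table/plyr installed, the yearly *.csv.gz files downloaded, Util.R sourced for saveData()); the directory layout follows the hard-coded paths:

# illustrative sketch only
#   ./raw-data/flights/1987.csv.gz ... 2008.csv.gz              <- one gzipped CSV per year
#   ./raw-data/flights/stats/1/ ... ./raw-data/flights/stats/6/ <- per-year "map" output, one folder per stat type
require(data.table)
source("Util.R")                 # saveData() and the other helpers
source("flights_map_reduce.R")
runJob()                         # initForStats(), then mapFlightStats() ("map"), then reduceFlightStats() ("reduce")
# the reduce step writes ./raw-data/flights/stats/1.csv ... 6.csv with all years stacked together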
/Util.R:
--------------------------------------------------------------------------------
1 | # Utility Functions
2 | # Author: Jitender Aswani, Co-Founder @datadolph.in
3 | # Date: 3/15/2013
4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License
5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/
6 | # All rights reserved.
7 |
8 |
9 |
10 | #
11 | #open a log file with today's date
12 | #
13 | openLogfile <- function(){
14 | file.log <- file( paste("logs/pads_status.log",dataset, Sys.Date(), sep="_"), 'w')
15 | #initialize log file
16 | assign("file.log", file.log, envir=.GlobalEnv)}
17 |
18 | #
19 | #close the log file
20 | #
21 | closeLogfile <- function(){
22 | close(file.log)
23 | }
24 |
25 | #
26 | # log a message in a vector
27 | #
28 | logMessage <- function(mesg){
29 | #tryCatch(cat(paste(Sys.Date(), mesg, sep=":"), file = file.log),
30 | # error=function(e){
31 | # openLogfile()
32 | # cat(paste(Sys.Date(), mesg, sep=":"), file = file.log)
33 | # },
34 | # finally={
35 | # }
36 | #)
37 | #cat("\n", file = file.log)
38 | vec.log[length(vec.log)+1] <<- paste(Sys.Date(), mesg, sep=":")
39 | }
40 |
41 | #
42 | # dump logs to the file system
43 | #
44 | dumpLogs <- function(){
45 | openLogfile()
46 |   cat(paste(vec.log, collapse="\n"), file = file.log)
47 | closeLogfile()
48 | }
49 |
50 | #
51 | # Clean Meta files folder
52 | #
53 | cleanMetaFiles <- function(){
54 | folder.meta <- "./pads/meta/"
55 | meta.files <- list.files(path=folder.meta, pattern="*.json")
56 | for(i in meta.files) {
57 | meta.file.name <- paste(folder.meta, i, sep="")
58 | #delete the file
59 | unlink(meta.file.name)
60 | }
61 | }
62 |
63 | #
64 | # Clean csv files folder
65 | #
66 | cleanDataFiles <- function(){
67 | folder.data <- "./pads/data/"
68 | data.files <- list.files(path=folder.data, pattern="*.csv")
69 | for(i in data.files) {
70 | data.file.name <- paste(folder.data, i, sep="")
71 | #delete the file
72 | unlink(data.file.name)
73 | }
74 | }
75 |
76 | #
77 | # Clean cache files folder
78 | #
79 | cleanCacheFiles <- function(){
80 | folder.cache <- "./pads/cache/"
81 | cache.files <- list.files(path=folder.cache, pattern="*.rdata")
82 | for(i in cache.files) {
83 | cache.file.name <- paste(folder.cache, i, sep="")
84 | #delete the file
85 | unlink(cache.file.name)
86 | }
87 | }
88 |
89 | #
90 | #read a csv file
91 | #
92 | readFile <- function(file.location) {
93 | out <- tryCatch(
94 | read.csv(file.location, as.is=TRUE, header=TRUE, stringsAsFactors=FALSE, strip.white=TRUE),
95 | error=function(e) {
96 |       message(paste("Unable to read the file:", file.location))
97 | message("Error message:")
98 | message(e)
99 | # Choose a return value in case of error
100 | return(NA)
101 | },
102 | finally={
103 | #message(paste("Processed URL:", url))
104 | #message("Some other message at the end")
105 | }
106 | )
107 | return(out)
108 | }
109 |
110 | #
111 | #write a csv file
112 | #
113 | saveData <- function(data, file.location){
114 | write.csv(data, file.location, row.names=F)
115 | }
116 |
117 |
118 | #
119 | # remove leading/trailing spaces from a data.frame
120 | #
121 | trimData <- function(data){
122 | return(data.frame(lapply(data, function(x) gsub("(^ +)|( +$)", "", x)), stringsAsFactors=F))
123 | }
124 |
125 | #
126 | # replace NA in every column with zeros
127 | #
128 | replaceNAWithZeros <- function(data) {
129 | data[is.na(data)] <- 0 # replace NA with 0s
130 | return(data)
131 | }
132 |
133 | #
134 | # replace NULL in every cell with zeros
135 | #
136 | replaceNULLWithZeros <- function(data) {
137 | data[is.null(data)] <- 0 # replace null with 0s
138 | return(data)
139 | }
140 |
141 | #
142 | # replace NULL in every cell with NA
143 | #
144 | replaceNULLWithNA <- function(data) {
145 |   data[is.null(data)] <- NA # replace null with NA
146 | return(data)
147 | }
148 |
149 | #
150 | #get row wise json
151 | #
152 | getRowWiseJson <- function (jsonDT) {
153 | require("RJSONIO")
154 | row.json <- apply(jsonDT, 1, toJSON)
155 | json.st <- paste('[', paste(row.json, collapse=', '), ']')
156 | return (json.st)
157 | }
158 |
159 | #
160 | #get unique ID for pads
161 | #
162 | getPadUID <- function(){
163 | systime <- as.character(unclass(Sys.time()))
164 | return(paste("pad", substr(systime, 1,10), substr(systime, 12,16), sep=""))
165 | }
166 |
167 | #
168 | #get GUID
169 | #
170 | getGUID <- function() {
171 | baseuuid <- paste(sample(c(letters[1:6],0:9),30,replace=TRUE),collapse="")
172 |
173 | paste(
174 | substr(baseuuid,1,8),
175 | "-",
176 | substr(baseuuid,9,12),
177 | "-",
178 | "4",
179 | substr(baseuuid,13,15),
180 | "-",
181 | sample(c("8","9","a","b"),1),
182 | substr(baseuuid,16,18),
183 | "-",
184 | substr(baseuuid,19,30),
185 | sep="",
186 | collapse=""
187 | )
188 | }
189 |
190 | #
191 | #get csv data file url
192 | #
193 | getDataFileURL <- function(id) {
194 | return(paste("./pads/data/",id, ".csv", sep=""))
195 | }
196 |
197 | #
198 | #get JSON data file url
199 | #
200 | getMetaDataFileURL <- function(id) {
201 | return(paste("./pads/meta/",id, ".json", sep=""))
202 | }
203 |
204 | #
205 | #get R.data file url
206 | #
207 | getCacheDataFileURL <- function(id) {
208 | return(paste("./pads/cache/",id, ".rdata", sep=""))
209 | }
210 |
211 | #
212 | # Save the JSON Meta data for a PAD
213 | #
214 | saveMetaData <- function(out.file, jsonSt){
215 | file.out <- file(out.file, 'wt')
216 | cat(jsonSt, file=file.out, fill = TRUE)
217 | close(file.out)
218 | }
219 |
220 | #
221 | #replace meta chars from an array of col names
222 | #
223 | replaceMetaChars <- function(col.names) {
224 | return(gsub("[^A-Za-z0-9]+", "_", col.names))
225 | }
226 |
227 | #
228 | #remove meta chars from an array of values
229 | #
230 | removeMetaChars <- function(values) {
231 | return(gsub("[^A-Za-z0-9 ]+", "", values))
232 | }
233 |
234 | #
235 | #remove non-numeric chars from an array of values
236 | #
237 | removeNonNumericChars <- function(values) {
238 |   return(gsub("[^0-9.]+", "", values))
239 | }
240 |
241 |
242 | #
243 | #remove meta from numeric columns
244 | #
245 | removeMetaFromNumeric <- function(values) {
246 | #as.numeric( gsub("(,+)|(%+)|(\\$+)","",x) )
247 | return(as.numeric(gsub("[^0-9.]","",values)))
248 | }
249 |
250 | #
251 | #camel case
252 | #
253 | tocamel <- function(s, strict = FALSE) {
254 | cap <- function(s) paste(toupper(substring(s,1,1)),
255 | {s <- substring(s,2); if(strict) tolower(s) else s},
256 | sep = "", collapse = " " )
257 | sapply(strsplit(s, split = " "), cap, USE.NAMES = !is.null(names(s)))
258 | }
259 |
260 |
261 | #
262 | # clean a name
263 | #
264 | cleanName <- function(name) {
265 | clean1 <-gsub("[,]*[ ]*[&]*[ ]+(the|inc|llc|llp|ltd|limited|lp|ll|affiliates|incorporated|incorporation|corp|co|corp|corporation|stores|international|n\\.a|u\\.s\\.a)*[\\.]*[,]*$", "", name, ignore.case = T)
266 | clean2 <- gsub("[,]*[ ]*[&]*[ ]+(the|inc|llc|llp|ltd|limited|lp|affiliates|incorporated|incorporation|corp|co|corp|corporation|stores|international|n\\.a|u\\.s\\.a)*[\\.]*[,]*$", "", clean1, ignore.case = T)
267 | clean3 <- gsub("[&]", "and", clean2)
268 | clean4 <- gsub("-", "", clean3)
269 | #clean5 <- gsub("[^a-zA-Z]", "_" , clean4)
270 | #clean5 <- gsub("(_)+", "_" , unique(clean4))
271 | return (clean4)
272 | }
273 |
274 | #
275 | # Get Class function to return a single class for every object
276 | #
277 | getClass <- function(obj) {
278 | obj.class <- class(obj)
279 | if(length(obj.class) > 1)
280 | return(obj.class[2])
281 | else
282 | return(obj.class)
283 | }
284 |
285 | #
286 | # Generate solr doc from pad meta
287 | #
288 | generateSolrDocFromPADMeta <- function(meta.data) {
289 | generateSolrDoc(meta.data$id, meta.data$title, meta.data$category,
290 | meta.data$subcategory,meta.data$tags)
291 | }
292 |
293 | #
294 | # Generate solr doc and append it to a solr file for later processing
295 | #
296 | generateSolrDoc <- function(id, title, category, subcategory, tags) {
297 | file.out <- file("./pads/solr-docs.json", "r+")
298 | lines <- readLines(file.out)
299 |   solr.index.docs <- list()
300 |   if(length(lines) != 0) solr.index.docs <- fromJSON(paste(lines, collapse=""))
301 | doc <- list("type"="pad", "id" = id, "title" = title, "category"=category,
302 | "subcategory"=subcategory, "tags"=tags)
303 | solr.index.docs[[length(solr.index.docs)+1]] <- doc
304 | cat(toJSON(solr.index.docs), file=file.out, fill = TRUE)
305 | close(file.out)
306 | }
307 |
308 | #
309 | # Save solr.index.doc to file system for later processing
310 | #
311 | persistSolrIndex <- function(solr.index.docs, source.name="") {
312 | file.out <- file(paste("./pads/solr-docs-", source.name, ".json", sep=""), "w")
313 | cat(toJSON(solr.index.docs), file=file.out, fill = TRUE)
314 | close(file.out)
315 | }
316 |
317 | #
318 | # Update multiple pads in a System PADS table
319 | #
320 | updateSystemPads <- function(file="./pads/QualifiedPADS.csv") {
321 | pads <- read.csv(file, stringsAsFactors=FALSE)
322 | for(i in 1:nrow(pads)) {
323 | print(paste("#", i, "Updating...", pads$id[i], sep=" "))
324 | updateSystemPad(pads$id[i], pads$title[i], pads$desc[i], pads$category[i],
325 | pads$subcategory[i], pads$src[i], pads$tags[i])
326 |
327 | ###### Update solr document
328 |     generateSolrDoc(pads$id[i], pads$title[i], pads$category[i], pads$subcategory[i],
329 |                     pads$tags[i])
330 | }
331 | }
332 |
333 | #
334 | # get age
335 | #
336 | getAge <- function(dob, as.of.when, units){
337 | switch(units,
338 | w = {# weeks
339 | return(difftime(strptime(as.of.when, format = "%Y/%m/%d"),
340 | strptime(dob, format = "%Y/%m/%d"),units="weeks"))
341 | },
342 | m = {
343 | # months
344 | return((as.yearmon(strptime(as.of.when, format = "%Y/%m/%d"))-
345 | as.yearmon(strptime(dob, format = "%Y/%m/%d")))*12)
346 | },
347 | y = {
348 | return(year(strptime(as.of.when, format = "%Y/%m/%d"))-
349 | year(strptime(dob, format = "%Y/%m/%d")))
350 | },
351 | )
352 | }
353 | #
354 | # get time period for a given vector of dates
355 | #
356 | getTimePeriod <- function (date.vector){
357 | yr <- year(date.vector)
358 | min.year <- min(yr, na.rm=T)
359 | max.year <- max(yr, na.rm=T)
360 | time.period <- paste(min.year, max.year, sep="-")
361 | return(time.period)
362 | }
363 |
364 | #
365 | # escape meta chars
366 | #
367 | escapeMetaChars <- function(values){
368 | return(gsub("[^A-Za-z0-9 ]+", "", values))
369 | }
370 |
371 | #
372 | # remove special chars
373 | #
374 | removeSpecialChars <- function (node) {
375 | #val = gsub("[[:space:]]", "", xmlValue(node))
376 | val = gsub("[\xc2\xa0]", "", xmlValue(node))
377 | }
378 |
379 | #
380 | # set class num.with.dollar
381 | #
382 | setClass("num.with.dollar")
383 | setAs("character", "num.with.dollar",
384 | function(from) as.numeric(gsub("$","",from, fixed=TRUE)))
385 |
386 | #
387 | # set class num.with.commas
388 | #
389 | setClass("num.with.commas")
390 | setAs("character", "num.with.commas",
391 | function(from) as.numeric(gsub(",", "", from, fixed=TRUE)))
392 |
393 | #
394 | # set class mixed.dates
395 | # an example - 02009-09-25 September 25, 2009
396 | #
397 | setClass("mixed.dates")
398 | setAs("character", "mixed.dates",
399 | function(from) as.Date(gsub("^0", "", str_extract(from, "^[0-9-]+")), "%Y-%m-%d"))
400 |
401 | #
402 | # reverse string
403 | # strReverse(c("abc", "Statistics"))
404 | #
405 | strReverse <- function(x)
406 | sapply(lapply(strsplit(x, NULL), rev), paste, collapse="")
407 |
408 | #
409 | # get older date
410 | #
411 | # x <- mdy(stats$effective_date)
412 | getOlderDate <- function(x, year=1930){
413 | m <- year(x) %% 100
414 | year(x) <- ifelse(m > year %% 100, 1900+m, 2000+m)
415 | x
416 | }
417 |
--------------------------------------------------------------------------------
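The setClass()/setAs() coercions near the end of Util.R follow the standard read.csv colClasses idiom; a short hedged sketch with made-up CSV content:

# illustrative sketch only -- the CSV content is made up
source("Util.R")   # defines the "num.with.dollar" and "num.with.commas" coercions
raw <- textConnection('price,volume\n$19.99,"1,234,567"\n$5.00,"42"')
df  <- read.csv(raw, colClasses=c("num.with.dollar", "num.with.commas"))
str(df)            # both columns come in as plain numbers: 19.99 / 5.00 and 1234567 / 42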
/learning/TwitterMap.R:
--------------------------------------------------------------------------------
1 | #########################################################################################
2 | # An R function to make a personalized map of people you follow and who follow you on twitter.
3 | # R functions Copyright (C) 2011 Jeff Leek (jtleek@gmail.com), and the Simply Statistics Blog
4 | # (http://simplystatistics.tumblr.com, http://twitter.com/simplystats)
5 | #
6 | # This program is free software: you can redistribute it and/or modify
7 | # it under the terms of the GNU General Public License as published by
8 | # the Free Software Foundation, either version 3 of the License, or
9 | # (at your option) any later version.
10 | #
11 | # This program is distributed in the hope that it will be useful,
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 | # GNU General Public License for more details, see <http://www.gnu.org/licenses/>.
15 | #
16 | #
17 | # These functions depend on the packages: twitteR, maps, geosphere, and RColorBrewer. It will
18 | # attempt to install them if they are not installed when you source this function. Care
19 | # should be used when using this function since the twitteR API has rate limiting in place.
20 | # If you have a large number of followers, or run the function many times, you may be
21 | # rate limited.
22 | #
23 | #
24 | # How to use:
25 | # # Source the function
26 | # source("http://biostat.jhsph.edu/~jleek/code/twitterMap.R")
27 | #
28 | # # Make your twittermap
29 | # twitterMap("simplystats")
30 | #
31 | # #If your location can't be found or latitude longitude can't be calculated
32 | # #choose a bigger city near you. The list of cities used by twitterMap
33 | # #can be found like so:
34 | # data(world.cities)
35 | # grep("Baltimore",world.cities[,1])
36 | #
37 | # # Then make the map using that big city
38 | # twitterMap("simplystats",userLocation="Baltimore")
39 | #
40 | # #If you want both your followers and people you follow in a plot you can do:
41 | # twitterMap("simplystats",plotType="both")
42 | #
43 | ########################################################################################
44 | getPckg <- function(pckg) install.packages(pckg, repos = "http://cran.r-project.org")
45 |
46 | pckg = try(require(twitteR))
47 | if(!pckg) {
48 | cat("Installing 'twitteR' from CRAN\n")
49 | getPckg("twitteR")
50 | require("twitteR")
51 | }
52 |
53 | pckg = try(require(maps))
54 | if(!pckg) {
55 | cat("Installing 'maps' from CRAN\n")
56 | getPckg("maps")
57 | require("maps")
58 | }
59 |
60 | pckg = try(require(geosphere))
61 | if(!pckg) {
62 | cat("Installing 'geosphere' from CRAN\n")
63 | getPckg("geosphere")
64 | require("geosphere")
65 | }
66 |
67 |
68 | pckg = try(require(RColorBrewer))
69 | if(!pckg) {
70 | cat("Installing 'RColorBrewer' from CRAN\n")
71 | getPckg("RColorBrewer")
72 | require("RColorBrewer")
73 | }
74 |
75 |
76 | twitterMap <- function(userName,userLocation=NULL,fileName="twitterMap.pdf",nMax = 1000,plotType=c("followers","both","following")){
77 |
78 | # Get location data
79 | cat("Getting data from Twitter, this may take a moment.\n")
80 | tmp = getUser(userName)
81 | if(is.null(userLocation)){
82 | userLocation = location(tmp)
83 | userLocation = trim(userLocation)
84 | if(nchar(userLocation) < 2){stop("We can not find your location from Twitter")}
85 | }
86 |
87 | followers=tmp$getFollowers(n=nMax)
88 | followersLocation = sapply(followers,function(x){location(x)})
89 | following = tmp$getFriends(n=nMax)
90 | followingLocation = sapply(following,function(x){location(x)})
91 |
92 |
93 | # Load the geographic data
94 | data(world.cities)
95 | data(us.cities)
96 | data(canada.cities)
97 |
98 | # Find the latitude and longitude of the user
99 | cat("Getting geographic (latitude/longitude) of Twitter users.\n")
100 | userLL <- findLatLon(userLocation)$latlon
101 | if(any(is.na(userLL))){stop("We can't find the latitude and longitude of your location from Twitter")}
102 |
103 |
104 | # Find the latitude and longitude of each of the followers/following
105 |   # and calculate the distance to the user
106 |
107 | followersLL = matrix(NA,nrow=length(followers),ncol=4)
108 | followingLL = matrix(NA,nrow=length(following),ncol=4)
109 |
110 | for(i in 1:length(followers)){
111 | if(length(followersLocation[[i]]) > 0){
112 | tmpLL = findLatLon(trim(followersLocation[[i]]))
113 | if(any(!is.na(tmpLL$latlon))){
114 | followersLL[i,] = c(unlist(tmpLL$latlon),distCosine(userLL,tmpLL$latlon),unlist(tmpLL$cont))
115 | }
116 | }
117 | }
118 |
119 | for(i in 1:length(following)){
120 | if(length(followingLocation[[i]]) > 0){
121 | tmpLL = findLatLon(trim(followingLocation[[i]]))
122 | if(any(!is.na(tmpLL$latlon))){
123 | followingLL[i,] = c(unlist(tmpLL$latlon),distCosine(userLL,tmpLL$latlon),unlist(tmpLL$cont))
124 | }
125 | }
126 | }
127 |
128 | followingLL = followingLL[order(-followingLL[,3]),]
129 | followersLL = followersLL[order(-followersLL[,3]),]
130 |
131 | followingLL = followingLL[!is.na(followingLL[,1]),]
132 | followersLL = followersLL[!is.na(followersLL[,1]),]
133 |
134 |
135 | cat("Plotting results.\n")
136 | # Set up the colors
137 | cols = brewer.pal(7,"Set2")
138 |
139 | # Both followers and following
140 | if(plotType=="both"){
141 | pdf(fileName,height=12,width=10)
142 | data(worldMapEnv)
143 | par(mfrow=c(2,1),mar=rep(0,4))
144 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0)
145 |
146 | mtext(paste("@",userName," Follower Map",sep=""),col="lightgrey")
147 | nFollowers = dim(followersLL)[1]
148 | for(i in 1:nFollowers){
149 | greatC = getGreatCircle(userLL,followersLL[i,1:2])
150 | lines(greatC,col=cols[followersLL[i,4]],lwd=0.8)
151 | }
152 |
153 | legend(-180,0,legend = c(paste("Asia",sum(followersLL[,4]==1)),paste("Africa",sum(followersLL[,4]==2)),paste("N. America",sum(followersLL[,4]==3)),paste("S. America",sum(followersLL[,4]==4)),paste("Australia/N.Z.",sum(followersLL[,4]==5)),paste("Europe",sum(followersLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75)
154 |
155 |
156 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0)
157 | mtext(paste("@",userName," Following Map",sep=""),col="lightgrey")
158 | nFollowing = dim(followingLL)[1]
159 | for(i in 1:nFollowing){
160 | greatC = getGreatCircle(userLL,followingLL[i,1:2])
161 | lines(greatC,col=cols[followingLL[i,4]],lwd=0.8)
162 | }
163 |
164 | legend(-180,0,legend = c(paste("Asia",sum(followingLL[,4]==1)),paste("Africa",sum(followingLL[,4]==2)),paste("N. America",sum(followingLL[,4]==3)),paste("S. America",sum(followingLL[,4]==4)),paste("Australia/N.Z.",sum(followingLL[,4]==5)),paste("Europe",sum(followingLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75)
165 |
166 | mtext("Created by @simplystats twitterMap",side=1,adj=1,cex=0.8,col="grey")
167 | dev.off()
168 | }
169 |
170 | ## Just followers
171 | if(plotType=="followers"){
172 | pdf(fileName,height=6,width=10)
173 | data(worldMapEnv)
174 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0)
175 |
176 | mtext(paste("@",userName," Follower Map",sep=""),col="lightgrey")
177 | nFollowers = dim(followersLL)[1]
178 | for(i in 1:nFollowers){
179 | greatC = getGreatCircle(userLL,followersLL[i,1:2])
180 | lines(greatC,col=cols[followersLL[i,4]],lwd=0.8)
181 | }
182 |
183 | legend(-180,0,legend = c(paste("Asia",sum(followersLL[,4]==1)),paste("Africa",sum(followersLL[,4]==2)),paste("N. America",sum(followersLL[,4]==3)),paste("S. America",sum(followersLL[,4]==4)),paste("Australia/N.Z.",sum(followersLL[,4]==5)),paste("Europe",sum(followersLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75)
184 | mtext("Created by @simplystats twitterMap",side=1,adj=1,cex=0.8,col="grey")
185 | dev.off()
186 |
187 | }
188 |
189 | ## Just following
190 | if(plotType=="following"){
191 | pdf(fileName,height=6,width=10)
192 | data(worldMapEnv)
193 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0)
194 | mtext(paste("@",userName," Following Map",sep=""),col="lightgrey")
195 | nFollowing = dim(followingLL)[1]
196 | for(i in 1:nFollowing){
197 | greatC = getGreatCircle(userLL,followingLL[i,1:2])
198 | lines(greatC,col=cols[followingLL[i,4]],lwd=0.8)
199 | }
200 |
201 | legend(-180,0,legend = c(paste("Asia",sum(followingLL[,4]==1)),paste("Africa",sum(followingLL[,4]==2)),paste("N. America",sum(followingLL[,4]==3)),paste("S. America",sum(followingLL[,4]==4)),paste("Australia/N.Z.",sum(followingLL[,4]==5)),paste("Europe",sum(followingLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75)
202 |
203 | mtext("Created by @simplystats twitterMap",side=1,adj=1,cex=0.8,col="grey")
204 | dev.off()
205 |
206 | }
207 |
208 | }
209 |
210 |
211 | findLatLon <- function(loc){
212 | latlon = NA
213 | cont = NA
214 |
215 | # Asia = 1, Africa = 2, North America = 3, South America = 4, Australia/New Zealand = 5, Europe = 6
216 | continents = matrix(NA,nrow=length(unique(world.cities[,2])),ncol=2)
217 | continents[,1] = unique(world.cities[,2])
218 | continents[1:10,2] = c(1,1,1,2,1,1,1,1,1,1)
219 | continents[11:20,2]= c(1,1,2,1,1,2,1,2,2,2)
220 | continents[21:30,2] = c(2,1,6,6,6,6,6,6,6,6)
221 | continents[31:40,2] = c(6,6,6,6,2,4,4,1,2,1)
222 | continents[41:50,2] = c(4,6,1,4,6,1,3,1,6,6)
223 | continents[51:60,2] = c(3,2,4,2,6,1,6,1,3,2)
224 | continents[61:70,2] = c(1,2,2,2,3,6,3,3,6,6)
225 | continents[71:80,2] = c(1,1,2,6,3,4,3,4,6,1)
226 | continents[81:90,2] = c(3,3,3,2,2,6,6,6,6,4)
227 | continents[91:100,2] = c(2,5,2,2,3,1,1,1,1,1)
228 | continents[101:110,2] = c(1,2,1,1,1,3,2,5,1,6)
229 | continents[111:120,2] = c(1,6,1,1,2,6,1,1,6,2)
230 | continents[121:130,2] = c(6,6,6,1,1,3,4,3,4,2)
231 | continents[131:140,2] = c(6,6,2,2,1,1,1,4,1,1)
232 | continents[141:150,2] = c(1,2,2,1,1,1,4,6,6,2)
233 | continents[151:160,2] = c(4,1,1,1,1,2,4,6,2,2)
234 | continents[161:170,2] = c(1,2,2,1,6,2,1,1,6,1)
235 | continents[171:180,2] = c(1,1,1,2,6,2,2,6,1,1)
236 | continents[181:190,2] = c(2,6,2,1,6,6,3,3,3,3)
237 | continents[191:200,2] = c(2,2,2,2,3,2,3,2,3,1)
238 | continents[201:210,2] = c(3,2,2,2,2,2,2,1,6,2)
239 | continents[211:220,2] = c(1,3,1,6,2,4,3,6,3,4)
240 | continents[221:230,2] = c(1,1,1,3,2,3,3,6,1,6)
241 | continents[231:232,2] = c(2,1)
242 |
243 |
244 | # Get the first element of the location
245 | # firstElement = strsplit(loc,"[^[:alnum:]]")[[1]][1]
246 | firstElement = strsplit(loc,",")[[1]][1]
247 | if(is.na(firstElement)){firstElement="zzzzzzzzz"}
248 |
249 | # See if it is a city
250 | tmp = grep(firstElement,world.cities[,1],fixed=TRUE)
251 | tmp2 = grep(firstElement,state.name,fixed=TRUE)
252 | tmp3 = grep(firstElement,world.cities[,2],fixed=TRUE)
253 |
254 | if(length(tmp) == 1){
255 | latlon = world.cities[tmp,c(5,4)]
256 | cont = continents[which(world.cities[tmp,2]==continents[,1]),2]
257 | }else if(length(tmp) > 1){
258 | tmpCities = world.cities[tmp,]
259 | latlon = tmpCities[which.max(tmpCities$pop),c(5,4)]
260 | cont = continents[which(tmpCities[which.max(tmpCities$pop),2]==continents[,1]),2]
261 | }else if(length(tmp2) == 1){
262 | latlon = c(state.center$x[tmp2],state.center$y[tmp2])
263 | cont = 3
264 | }else if(length(tmp3) > 0){
265 | tmpCities = world.cities[tmp3,]
266 | latlon = tmpCities[which.max(tmpCities$pop),c(5,4)]
267 | cont = continents[which(tmpCities[which.max(tmpCities$pop),2]==continents[,1]),2]
268 | }
269 |
270 | return(list(latlon=latlon,cont=as.numeric(cont)))
271 |
272 | }
273 |
274 |
275 | getGreatCircle = function(userLL,relationLL){
276 | tmpCircle = greatCircle(userLL,relationLL)
277 | start = which.min(abs(tmpCircle[,1] - userLL[1,1]))
278 | end = which.min(abs(tmpCircle[,1] - relationLL[1]))
279 | greatC = tmpCircle[start:end,]
280 | return(greatC)
281 | }
282 |
283 | trim <- function (x) gsub("^\\s+|\\s+$", "", x)
--------------------------------------------------------------------------------