├── learning ├── CheckLibraryPath.R ├── TwitterMapUse.R ├── EnvExample.R ├── Plot.R ├── Kmeans.R ├── GoogleVis-Motion.R ├── fileUpload.R ├── GoogleVis-Geo_Table.R ├── RjsonIO.R ├── geoCodeXML.R ├── MasterBlaster.R ├── ListLoadedPads.R ├── PlayingWithEnv.R ├── DQQE-Utility.R ├── ggplot2.R ├── ggplot.R ├── loadPADS.R ├── JSON.R ├── Clinic.R ├── WikiRead.R ├── NYTWebService.R ├── SP500MarketInflection.R ├── VCorpus.R ├── DataParser.R ├── MiscR.R ├── TransformDataSet.R ├── geoCode.R ├── TransformFailedBanks.R ├── SQLQueryDataFrame.R ├── DGQueryEngine.R ├── DGQEWebSockets.R ├── DKOMSentimentAnalysis.R ├── TwiterSentimentAnalysis.R ├── Olympics.R ├── birbal.R ├── TransformMasterBlasterData.R └── TwitterMap.R ├── GenerateMaplightPADS.R ├── README.md ├── FetchData ├── FetchFailedBanksData.R ├── FetchBeerRatings.R ├── FetchRussell2000.R ├── FetchIPLData.R ├── FetchImmigrationData.R └── GenerateFlightStats.R ├── GeneratePADS.R ├── GenerateFredPADS.R ├── GenerateWDI.R ├── MongoFns.R ├── r_solr_integration.R ├── GenerateWDIPADS.R ├── PresedentialElectionsResults.R ├── SentimentAnalysis.R ├── GenerateCrimePADS.R ├── CreatePADS.R ├── ClassifyData.R ├── flights_map_reduce.R └── Util.R /learning/CheckLibraryPath.R: -------------------------------------------------------------------------------- 1 | #Check library path 2 | pkg <- installed.packages() 3 | pkg[which(pkg[, 1] == "base"), 2] -------------------------------------------------------------------------------- /learning/TwitterMapUse.R: -------------------------------------------------------------------------------- 1 | source("http://biostat.jhsph.edu/~jleek/code/twitterMap.R") 2 | twitterMap("spoonan", userLocation="Palo Alto", plotType="both") -------------------------------------------------------------------------------- /GenerateMaplightPADS.R: -------------------------------------------------------------------------------- 1 | # 2 | # Generate maplight 3 | # 4 | 5 | require(data.table) 6 | 7 | assign("folder.path", "./pads/raw-data/maplight/", envir=.GlobalEnv) 8 | #assign("data.file", "data.csv.gz", envir=.GlobalEnv) 9 | #ml.data.f <- data.table(read.csv(paste(folder.path, data.file, sep=""))) 10 | 11 | assign("data.cache", "ml-data.Rdata", envir=.GlobalEnv) 12 | ml.data <- data.table(read.csv(paste(folder.path, data.cache, sep=""))) 13 | -------------------------------------------------------------------------------- /learning/EnvExample.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | e <- new.env() 8 | e$foo <- 42 9 | attach(e, name='datagram') 10 | rm(list=ls()) # Remove all in global env 11 | foo # Still there! 12 | #...and to detach it: 13 | #detach('datagram') -------------------------------------------------------------------------------- /learning/Plot.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
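# Editor's note (assumption): this script renders a base-R line plot to a temporary PNG and
# streams the raw bytes back to the HTTP client. setContentType() and sendBin() are assumed
# to be provided by the hosting web handler (e.g. rApache), so this is not standalone R.
# A rough standalone equivalent for local testing (sketch, hypothetical filename):
#   png("out.png", type = "cairo"); plot(1:100, rnorm(100), t = "l"); dev.off()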
6 | 7 | #setContentType ("image/png") 8 | temp <- tempfile () 9 | y = rnorm (100) 10 | png (temp, type="cairo") 11 | plot (1:100, y, t='l') 12 | dev.off () 13 | sendBin (readBin (temp, 'raw', n=file.info(temp)$size)) 14 | unlink (temp) -------------------------------------------------------------------------------- /learning/Kmeans.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | newiris <- iris 8 | newiris$Species <- NULL 9 | kc <- kmeans(newiris, 3) 10 | table(iris$Species, kc$cluster) 11 | plot(newiris[c("Sepal.Length", "Sepal.Width")], col = kc$cluster) 12 | points(kc$centers[,c("Sepal.Length", "Sepal.Width")], col = 1:3, pch = 8, cex=2) -------------------------------------------------------------------------------- /learning/GoogleVis-Motion.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | library("googleVis") 8 | 9 | data(Fruits) 10 | #typeof(Fruits) 11 | M <- gvisMotionChart(Fruits, idvar="Fruit", timevar="Year") 12 | plot(M) 13 | 14 | #Chart Output 15 | #print(M, 'chart') 16 | #cat(M$html$header) 17 | #cat(M$html$chart) 18 | #cat(M$html$footer) -------------------------------------------------------------------------------- README.md: -------------------------------------------------------------------------------- 1 | R 2 | = 3 | In this repository, the datadolph.in team is sharing all of the R codebase that it developed to analyze large quantities of data. 4 | The datadolph.in team has benefited tremendously from fellow R bloggers and other open source communities and is proud to contribute 5 | all of its codebase to the community. 6 | 7 | The codebase includes ETL and interaction scripts for - 8 | - R-Solr Integration 9 | - R-Mongo Interaction 10 | - R-MySQL Interaction 11 | - Fetching, cleansing and transforming data 12 | - Classification (identify column types) 13 | - Default chart generation (based on simple heuristics and matching a dimension with a measure) 14 | -------------------------------------------------------------------------------- /learning/fileUpload.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | #Set content type for output 8 | setContentType("text/html") 9 | post <- str(POST) 10 | print(post) 11 | cat(post) 12 | cat("
") 13 | 14 | #Files 15 | FileStr <- str(FILES) 16 | cat(FileStr) 17 | cat("
") 18 | 19 | #Or you can use the paste function 20 | cat(paste(FILES,names(FILES))) 21 | cat("
") 22 | 23 | destination <- file.path('/Users/homemac/Toga/tmp',FILES$FirstFile$name) 24 | file.copy(FILES$FirstFile$tmp_name,destination,overwrite=TRUE) -------------------------------------------------------------------------------- /learning/GoogleVis-Geo_Table.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | library("googleVis") 8 | #Geo 9 | G <- gvisGeoChart(Exports, "Country", "Profit", options=list(width=200, height=100)) 10 | #Table 11 | T <- gvisTable(Exports, options=list(width=200, height=270)) 12 | #Motion 13 | M <- gvisMotionChart(Fruits, "Fruit", "Year", options=list(width=400, height=370)) 14 | #Merge Charts 15 | GT <- gvisMerge(G,T, horizontal=FALSE) 16 | #Merge Horizontal 17 | GTM <- gvisMerge(GT, M, horizontal=TRUE, tableOptions="bgcolor=\"#CCCCCC\" cellspacing=10") 18 | plot(GTM) -------------------------------------------------------------------------------- /learning/RjsonIO.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | setwd("/Users/homemac/Toga/Alto") 8 | library("RJSONIO") 9 | l <- fromJSON('[{"winner":"68694999", "votes":[ {"ts":"Thu Mar 25 03:13:01 UTC 2010", "user":{"name":"Lamur","user_id":"68694999"}}, {"ts":"Thu Mar 25 03:13:08 UTC 2010", "user":{"name":"Lamur","user_id":"68694999"}}], "lastVote":{"timestamp":1269486788526,"user":{"name":"Lamur","user_id":"68694999"}},"startPrice":0}]') 10 | m <- lapply(l[[1]]$votes, function(x) c(x$user$name, x$user$user_id, x$ts)) 11 | ma <- matrix(unlist(m), ncol=3, byrow=TRUE) 12 | p <- data.frame(ma) -------------------------------------------------------------------------------- /learning/geoCodeXML.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
6 | 7 | getDocNodeVal <- function(doc, path) 8 | { 9 | sapply(getNodeSet(doc, path), function(el) xmlValue(el)) 10 | } 11 | 12 | 13 | getGeoCode <- function(str) 14 | { 15 | library(XML) 16 | #str <- gsub(' ','%20',str) 17 | url <- paste('http://maps.google.com/maps/api/geocode/xml?sensor=false&address=',str) 18 | response <- xmlTreeParse(url, useInternal=TRUE) 19 | 20 | lat <- getDocNodeVal(response, "/GeocodeResponse/result/geometry/location/lat") 21 | lng <- getDocNodeVal(response, "/GeocodeResponse/result/geometry/location/lng") 22 | return (c(lat,lng)) 23 | } 24 | 25 | t <- getGeoCode("Bangalore, India") -------------------------------------------------------------------------------- /learning/MasterBlaster.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | library("RJSONIO") 8 | rm (list = ls()) 9 | 10 | # Read SachinTestRecords.csv 11 | 12 | mb <- read.csv("datasets/SachinTestRecords.csv", as.is=TRUE, header=TRUE, stringsAsFactors=FALSE, strip.white=TRUE) 13 | #mb <- read.csv("datasets/SachinTestRecords.csv", header=TRUE,strip.white=TRUE) 14 | nRows <- nrow(mb) 15 | nCols <- ncol(mb) 16 | cNames <- colnames(mb) 17 | cClass <- sapply(mb, class) 18 | iCols <- list(name=cNames[1], class=class(mb[,1]), sample=head(mb[,1])) 19 | for(i in 2:5) { 20 | iCols <- list(iCols, list(name=cNames[i], class=class(mb[,i]), sample=head(mb[,i]))) 21 | } 22 | metaData <- list(nRows=nRows, nCols=nCols, iCols) 23 | jsonString <- toJSON(metaData) -------------------------------------------------------------------------------- /learning/ListLoadedPads.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | setContentType("text/html") 8 | #Step 2 - load environment 9 | pads.loaded <- ls(pattern='pad[0-9]', envir=.GlobalEnv) 10 | #pads.loaded <- ls() 11 | #cat(is.environment(.rAenv)) 12 | cat("This is a test script. It checks for pre-loaded pads.
The following pads are pre-loaded in this environment:
"); 13 | cat(paste("No. of Pads: ", length(pads.loaded), "
", sep="")) 14 | getDetails <- function(x) 15 | { 16 | d <- get(x) 17 | return(paste("PadName:", x, ", Records:", nrow(d), ", Columns:", length(d), ", ColNames:", paste(colnames(d), collapse="; "), ", Col Classes:", paste(lapply(d, class), collapse="||"), sep="")) 18 | } 19 | v <- vapply(pads.loaded, getDetails, "Pad Was Not Found.") 20 | cat(paste(v, collapse="
")) -------------------------------------------------------------------------------- /learning/PlayingWithEnv.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | rm (list = ls("PADSEnv")) 7 | padsEnv <- new.env() 8 | padsFile <- "/Users/homemac/pads.json" 9 | 10 | getPADS <- function() 11 | { 12 | library("RJSONIO") 13 | con <- file(padsFile) 14 | data.json <- fromJSON(paste(readLines(con), collapse="")) 15 | close(con) 16 | #Convert the list to a matrix and then to a data frame 17 | m <- matrix(unlist(data.json), ncol=2, byrow=TRUE) 18 | pads <- data.frame(m, stringsAsFactors=FALSE) 19 | colnames(pads) <- c("id", "location") 20 | return (pads) 21 | } 22 | 23 | padsEnv$pads <- getPADS() 24 | #Load all the PADS 25 | sysID <- NULL 26 | for(i in 1:length(padsEnv$pads)) { 27 | sysID[i] <- load(padsEnv$pads$location[i], padsEnv) 28 | } 29 | padsEnv$pads$sysID <- sysID 30 | attach(padsEnv, name='PADSEnv') 31 | # All Done .rAenv 32 | rm(list=ls()) 33 | -------------------------------------------------------------------------------- /learning/DQQE-Utility.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
6 | 7 | getResult <- function(dt, expr, gby) { 8 | return(dt[,eval(expr), by=eval(gby)]) 9 | } 10 | dt <- data.table(iris) 11 | v1 <- "Sepal.Length" 12 | v2 <- "Species" 13 | e <- parse(text = paste("sum(", v1, ", na.rm = TRUE)")) 14 | b <- parse(text = v2) 15 | #rDT2 <- dt[, eval(e), by = eval(b)] 16 | dtR <- getResult(dt = dt, expr = e, gby = b) 17 | 18 | 19 | i <- which(colnames(padDT)==m) 20 | getData(padDT, sum(i, na.rm=TRUE), d) 21 | #getData(padDT, parse(text=paste("sum(", m, ",na.rm=TRUE)", sep="")), d) lapply(padDT, class) 22 | 23 | 24 | getResult <- function(dt, expr, gby) { 25 | e <- substitute(expr) 26 | b <- substitute(gby) 27 | return(dt[,eval(e),by=b]) 28 | } 29 | v1 <- "Sepal.Length" 30 | v2 <- "Species" 31 | 32 | 33 | getResult(dt, sum(get(v1), na.rm=TRUE), v2) 34 | #e <- substitute(expr) 35 | #b <- substitute(gby) 36 | -------------------------------------------------------------------------------- /FetchData/FetchFailedBanksData.R: -------------------------------------------------------------------------------- 1 | library(XML) 2 | library("chron") 3 | library(plyr) 4 | library(stringr) 5 | 6 | source("UtilPADS.R") 7 | 8 | run <- function() { 9 | #col names 10 | col.names <- c("bank_name","city","st","closing_date","acquiring_institution","assets_in_mil_usd") 11 | #wiki url 12 | url <- "http://en.wikipedia.org/wiki/List_of_bank_failures_in_the_United_States_(2008%E2%80%93present)" 13 | # read wiki page 14 | page <- htmlParse(url, encoding="UTF-8") 15 | # read all tables 16 | all.tables <- try(readHTMLTable(page, colClasses=c('numeric', 'character', 'character', 'character', 'mixed.dates', 17 | 'character', 'num.with.commas'), stringsAsFactors = FALSE, 18 | trim=TRUE, elFun = removeSpecialChars)) 19 | 20 | #bind all the tables together 21 | s.d <- do.call(rbind, all.tables) 22 | # fix row names 23 | rownames(s.d) <- 1:nrow(s.d) 24 | # remove first column 25 | s.d <- s.d[-c(1)] 26 | # assing column names 27 | colnames(s.d) <- col.names 28 | saveData(s.d, "./pads/raw-data/failed-banks/failed_banks_assets_fdic.csv") 29 | } 30 | 31 | run() -------------------------------------------------------------------------------- /learning/ggplot2.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
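# Editor's note: dot plot of the top 15 candidates from the 2012 New Hampshire primary,
# read from a hosted CSV. Written against the pre-0.9.2 ggplot2 API; opts() and
# theme_text() were later replaced, so a rough modern equivalent of the theming lines
# would be (untested sketch):
#   theme(axis.title.x = element_text(size = 12, vjust = .25)) +
#   ggtitle("New Hampshire Primary 2012")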
6 | 7 | pres <- read.csv("http://www.stanford.edu/~messing/primaryres.csv", as.is=T) 8 | 9 | # sort data in order of percent of vote: 10 | pres <- pres[order(pres$Percentage, decreasing=T), ] 11 | 12 | # only show top 15 candidates: 13 | pres <- pres[1:15,] 14 | 15 | # create a percentage variable 16 | pres$Percentage <- pres$Percentage*100 17 | 18 | # reorder the Candidate factor by percentage for plotting purposes: 19 | pres$Candidate <- reorder(pres$Candidate, pres$Percentage) 20 | 21 | # To install ggplot2, run the following line after deleting the # 22 | #install.packages("ggplot2") 23 | 24 | library(ggplot2) 25 | ggplot(pres, aes(x = Percentage, y = factor(Candidate) )) + 26 | geom_point() + 27 | theme_bw() + opts(axis.title.x = theme_text(size = 12, vjust = .25))+ 28 | xlab("Percent of Vote") + ylab("Candidate") + 29 | opts(title = expression("New Hampshire Primary 2012")) -------------------------------------------------------------------------------- /learning/ggplot.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | library (ggplot2); library(reshape2); library(plyr) # reshape2 and plyr provide melt() and ddply() used below 8 | 9 | v1 <- c(1,2,3,3,4) 10 | v2 <- c(4,3,1,1,9) 11 | v3 <- c(3,5,7,2,9) 12 | gender <- c("m","f","m","f","f") 13 | 14 | d.data <- data.frame (v1, v2, v3, gender) 15 | 16 | molten <- melt(d.data) 17 | 18 | Average <- ddply(molten, c("gender", "variable"), function(z){ 19 | c(Mean = mean(z$value))} 20 | ) 21 | 22 | ggplot (data=Average, aes(x = variable, y = Mean, fill = gender)) + 23 | geom_bar(stat="identity", position = position_dodge()) + 24 | coord_flip() + 25 | geom_text (position = position_dodge(0.5), 26 | aes(label=round(Mean, 1)), vjust=0.5, hjust=4,colour="white", size=7) 27 | 28 | ggplot (data=Average, aes(x = variable, y = Mean)) + 29 | geom_bar(stat="identity") + 30 | geom_text(aes(label=round(Mean, 1)), vjust=0.5, 31 | hjust=4,colour="white", size=7) + 32 | facet_wrap(~gender) -------------------------------------------------------------------------------- /GeneratePADS.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved.
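# Editor's note: the helpers used below (initializeSystem, readFile, trimData,
# removeMetaFromNumeric, padify, cleaupSystem, updateCatPadCount) are expected to be
# defined in CreatePADS.R, which is sourced at the top of this script.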
6 | 7 | # Generate PADS - new way - read a csv file that has the list of pads 8 | # 9 | source("CreatePADS.R") 10 | 11 | # 12 | # generate Pads 13 | # 14 | generatePads <- function() { 15 | #initialize system 16 | initializeSystem() 17 | assign("folder.path", "./pads/raw-data/internet_stats/", envir=.GlobalEnv) 18 | assign("pads.file", "pads_meta.csv", envir=.GlobalEnv) 19 | assign("dataset", "Internet-Stats", envir=.GlobalEnv) 20 | pads <- readFile(paste(folder.path, pads.file, sep="")) 21 | for(i in 1:nrow(pads)){ 22 | #prepare pad meta data 23 | series <- list() 24 | series["source"] <- pads$source[i] 25 | series["category"] <- pads$category[i] 26 | series["subcategory"] <- pads$subcategory[i] 27 | series["tags"] <- pads$tags[i] 28 | series["desc"] <- pads$desc[i] 29 | series["title"] <- pads$title[i] 30 | series["pagetag"] <- "internet" 31 | series.data <- trimData(readFile(paste(folder.path, pads$file[i], sep=""))) 32 | series.data[2:ncol(series.data)] <- sapply(series.data[2:ncol(series.data)], removeMetaFromNumeric) 33 | padify(series, series.data) 34 | } 35 | #clean up 36 | cleaupSystem() 37 | updateCatPadCount() 38 | } 39 | 40 | generatePads() 41 | -------------------------------------------------------------------------------- /learning/loadPADS.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | #setContentType("text/html") 8 | setwd("/Users/homemac/Toga") 9 | #read pre-analyzed data-sets file 10 | #padsFile <- "pads/pads.json" 11 | #getPADS <- function() 12 | #{ 13 | # library("RJSONIO") 14 | # con <- file(padsFile) 15 | # data.json <- fromJSON(paste(readLines(con), collapse="")) 16 | # close(con) 17 | #Convert the list to a matrix and then to a data frame 18 | # m <- matrix(unlist(data.json), ncol=10, byrow=TRUE) 19 | # pads <- data.frame(m, stringsAsFactors=FALSE) 20 | #colnames(pads) <- c("id", "location") 21 | # return (pads) 22 | #} 23 | #SYS <- new.env(hash = TRUE, size = NA) 24 | #Read pads configuration 25 | #pads <- getPADS() 26 | pads <- data.frame(read.csv("Alto/pads/pads.csv", header=TRUE, sep=",",stringsAsFactors=FALSE )) 27 | for(i in 1:nrow(pads)) { 28 | cat(pads$cache[i]) 29 | cat("
") 30 | #Load all the PADS 31 | #cat(pads$cache[i]) 32 | #cat("
") 33 | cat(try(load(pads$cache[i], envir=.GlobalEnv), FALSE)) 34 | #try(load(pads$cache[i], envir=SYS), FALSE) 35 | #unlink(pads$cache[i]) 36 | #cat("
") 37 | 38 | } 39 | cat(ls(pattern="pad[0-9]", envir=.GlobalEnv)) 40 | # All Done - Al these variables get loaded in .GlobalEnv, run "search" to see. 41 | 42 | -------------------------------------------------------------------------------- /learning/JSON.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | library("rjson") 8 | x <- list( alpha = 1:5, beta = "Bravo", gamma = list(a=1:3, b=NULL), delta = c(TRUE, FALSE) ) 9 | json <- toJSON( x ) 10 | fromJSON( json ) 11 | #named vectors are treated as JSON objects (lists) 12 | toJSON(islands[1:4]) 13 | #data.frames must be converted into a list before converting into JSON 14 | plot(cars, pch=2) 15 | json_cars <- toJSON(as.list(cars)) 16 | points( data.frame( fromJSON( json_cars ) ), col="red", pch=3 ) 17 | #special R types are encoded as strings 18 | testString <- c(1,2,3,4,NA,NaN,Inf,8,9); 19 | toJSON(testString); 20 | 21 | sample_json <- ' { "breakfast" : [ "milk", "fruit loops", "juice" ], "lunch" : [ "left over sushi" ] } ' 22 | parser <- newJSONParser() 23 | parser$addData( sample_json ) 24 | food <- parser$getObject() 25 | print( food ) 26 | 27 | #This is equivalent to using FromJSON( sample_json ) 28 | #However, sample_json can be split into several parts: 29 | ### EXAMPLE 2: 30 | part_1 <- '{ "breakfast" : [ "milk", "fruit loops", "juice" ], ' 31 | part_2 <- '"lunch" : [ "left over sushi" ]' 32 | part_3 <- '} [1,2,3,4,5]' #close off the first object, and create a 2nd JSON object, which i 33 | parser <- newJSONParser() 34 | parser$addData( part_1 ) 35 | parser$getObject() #returns NULL - since part_1 isn't complete 36 | parser$addData( part_2 ) 37 | parser$getObject() #returns NULL - since part_2 still isn't complete 38 | parser$addData( part_3 ) 39 | parser$getObject() #returns the first food object 40 | parser$getObject() #returns the second array -------------------------------------------------------------------------------- /learning/Clinic.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
6 | 7 | ## generate data for medical example 8 | clinical.trial <- 9 | data.frame(patient = 1:100, 10 | age = rnorm(100, mean = 60, sd = 6), 11 | year.enroll = sample(paste("19", 85:99, sep = ""), 12 | 100, replace = TRUE), 13 | treatment = gl(2, 50, 14 | labels = c("Treatment", "Control")), 15 | center = sample(paste("Center", LETTERS[1:5]), 100, replace = TRUE)) 16 | ## set some ages to NA (missing) 17 | is.na(clinical.trial$age) <- sample(1:100, 20) 18 | summary(clinical.trial) 19 | 20 | ## a simple example of a table call 21 | table(clinical.trial$center) 22 | 23 | ## a logical vector is created and passed into table 24 | table(clinical.trial$age < 60) 25 | #FALSE TRUE 26 | # 41 39 27 | ## the useNA argument shows the missing values, too 28 | table(clinical.trial$age < 60, useNA = "always") 29 | #FALSE TRUE 30 | # 41 39 20 31 | 32 | ## the table of missing age by center 33 | table(clinical.trial$center, is.na(clinical.trial$age)) 34 | # FALSE TRUE 35 | # Center A 16 6 36 | # Center B 8 2 37 | # Center C 23 5 38 | # Center D 20 3 39 | # Center E 13 4 40 | ## centers with most missing ages listed in order 41 | ## highest to lowest 42 | sort(table(clinical.trial$center, is.na(clinical.trial$age))[, 2], 43 | decreasing = TRUE) 44 | #Center A Center C Center E Center D Center B 45 | # 6 5 4 3 2 46 | 47 | 48 | c1 <- cut(clinical.trial$age, breaks = seq(30, 80, by = 10)) 49 | table(c1) -------------------------------------------------------------------------------- /learning/WikiRead.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
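# Editor's note: notebook-style script. It scrapes the S&P 500 constituent list (and,
# further down, Forbes football-club valuations) from Wikipedia with readHTMLTable(),
# pulls monthly returns via quantmod and draws a heatmap with Heatplus. A stray URL and a
# trailing readHTMLTable() call are left over as notes, so it does not run top-to-bottom
# as-is.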
6 | 7 | library(XML) 8 | library(RColorBrewer) 9 | library(plyr) 10 | library(quantmod) 11 | library(Heatplus) 12 | 13 | # get the list of symbols 14 | l <- readHTMLTable('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[[2]] 15 | l <- as.vector(l$Ticker) 16 | l <- l[c(-59, -71, -80, -124, -141, -147, -275, -283, -292, -299, -309, -316, -360, -378, -381, -406, -439, -470, -471)] 17 | 18 | getMonthlyReturns <- function(sym) { 19 | y <- to.monthly(getSymbols(sym, auto.assign=FALSE, from='2007-01-01')) 20 | as.vector(ClCl(y)*100) 21 | } 22 | 23 | d <- unlist(llply(l, getMonthlyReturns, .progress="text")) 24 | # bounds at -10% and +10% for visual clarity 25 | d[d < -10] <- -10 26 | d[d > 10] <- 10 27 | 28 | heatmap_2(t(matrix(d, ncol=481)), col=brewer.pal(9, 'PuBu'), Rowv=NA, Colv=NA, do.dendro=c(FALSE,FALSE), scale='none', legend=2, main="S&P 500 since 2007 (monthly returns)") 29 | 30 | 31 | 32 | http://blog.datapunks.com/2011/10/sp-500-components-heatmap-in-r/ 33 | 34 | getWikiFC <- function(d) { 35 | l <- try(readHTMLTable("http://en.wikipedia.org/wiki/Forbes%27_list_of_the_most_valuable_football_clubs", 36 | header=FALSE, skip.rows=1, which=d$tableNo, stringsAsFactors = FALSE)) 37 | l$year = d$year 38 | ifelse(d$year > 2010, return(with(l, data.frame(year, club=V2, country=V3, value=V4, revenue=V6))), 39 | return(with(l, data.frame(year, club=V2, country=V3, value=V4, revenue=V7)))) 40 | } 41 | d <- data.frame(year=2012:2007, tableNo=2:7) 42 | mostValuedSoccerClubs <- ddply(d, .(year), getWikiFC) 43 | 44 | 45 | 46 | 47 | l <- readHTMLTable("http://en.wikipedia.org/wiki/Forbes%27_list_of_the_most_valuable_football_clubs")[[7]] -------------------------------------------------------------------------------- /FetchData/FetchBeerRatings.R: -------------------------------------------------------------------------------- 1 | # 2 | # Get beer data 3 | # 4 | # Author: Jitender Aswani, Co-Founder @datadolph.in 5 | # Date: 3/15/2013 6 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 7 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 8 | # All rights reserved. 9 | require(data.table) 10 | startup <- function() { 11 | #initialize system 12 | initializeSystem(0) 13 | 14 | assign("folder.path", "./pads/raw-data/crunchbase/", envir=.GlobalEnv) 15 | assign("comps.file", "cb_companies_june_2013.csv", envir=.GlobalEnv) 16 | assign("comps.rounds.file", "cb_companies_rounds.csv", envir=.GlobalEnv) 17 | assign("investors.file", "cb_investors.csv", envir=.GlobalEnv) 18 | 19 | assign("dataset", "crunchbase", envir=.GlobalEnv) 20 | #assign("verbose", TRUE, envir=.GlobalEnv) 21 | 22 | loadCompaniesStats() 23 | loadCompanyRounds() 24 | loadInvestosStats() 25 | 26 | #prepare pad meta data 27 | series <- list() 28 | series["source"] <- "CrunchBase" 29 | series["category"] <- "Financial Sector" 30 | series["subcategory"] <- "Investment" 31 | series["tags"] <- tolower(paste(series$source, "VC, venture capital, startups, US startups, investments, angel, series-a, series-b, series-c, funding, seed, biotech, ecommerce, enterprise, software, mobile, web", sep=",")) 32 | series["pagetag"] <- "crunchbase" 33 | series["desc"] <- "Built using data from CrunchBase extracted on June 6, 2013." 
34 | assign("series", series, envir=.GlobalEnv) 35 | } 36 | 37 | assign("folder.path", "./pads/raw-data/beer-data/", envir=.GlobalEnv) 38 | assign("data.file", "beer_reviews.csv.gz", envir=.GlobalEnv) 39 | 40 | beer.data <- data.table(read.csv(paste(folder.path, data.file, sep=""))) 41 | setkey(beer.data, beer_beerid) 42 | #get beers with 500 or more reviews 43 | b.d.500 <- beer.data[, list(no_of_reviews=nrow(.SD)), by=beer_beerid][no_of_reviews >= 500] 44 | setkey(b.d.500, beer_beerid) 45 | #merge to get all records ofbeers which have 500 or more reviews 46 | b.d.500 <- merge(b.d.500, beer.data, all=F) -------------------------------------------------------------------------------- /FetchData/FetchRussell2000.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | library(XML) 8 | library(plyr) 9 | library(quantmod) 10 | library(data.table) 11 | library("zoo") 12 | 13 | folder.path = "./raw-data/fin-markets/" 14 | russell <- new.env() 15 | # get the list of symbols from the csv file 16 | ru2000 <- read.csv(paste(folder.path, "russell-2000-components.csv", sep=""), stringsAsFactors=F) 17 | colnames(ru2000) <- c("name", "ticker") 18 | #ru2000$name <- gsub(ru2000$ticker, "", ru2000$name) 19 | 20 | getDailyReturns <- function(sym) { 21 | y <- tryCatch(getSymbols(sym, auto.assign=FALSE, from="2007-01-01"), error = identity) 22 | 23 | if(inherits(y, "error")) 24 | cat("Symbol '", sym, "' not downloadable!\n", sep = "") 25 | else { 26 | y <- tryCatch(cbind(y, OpCl(y)*100), error = identity) 27 | if(inherits(y, "error")) 28 | cat("Symbol '", sym, "' not downloadable!\n", sep = "") 29 | else { 30 | write.csv(data.frame( date=index(y), coredata(y) ),row.names=FALSE, file=paste(sym, ".csv", sep="")) 31 | cat("Sucessfully saved the stock data to %s",sym) 32 | } 33 | } 34 | } 35 | 36 | ru2000.DR <- llply(ru2000$Ticker[1109:nrow(ru2000)], getDailyReturns, .progress="text") 37 | 38 | 39 | #transpose d to have tickers as columns and date as rows 40 | ru2000.DR.T <- t(ru2000.DR) 41 | colnames(ru2000.DR.T)<- head(ru2000$Ticker) 42 | AACC <- data.frame[seq(as.Date("2007-01-01"), Sys.Date(), by="day"), ru2000.DR.T$AACC] 43 | #SP100.MR <- data.table(SP100.MR) 44 | plot(SP100.MR$AAP, ylab="Apple", main="Montly Returns Stock" 45 | read.zoo(SP100.MR) 46 | #d <- unlist(llply(l, getMonthlyReturns, .progress="text")) 47 | # bounds at -10% and +10% for visual clarity 48 | d[d < -10] <- -10 49 | d[d > 10] <- 10 50 | 51 | heatmap_2(t(matrix(d, ncol=100)), col=brewer.pal(9, 'PuBu'), 52 | Rowv=NA, Colv=NA, do.dendro=c(FALSE,FALSE), scale='none', legend=2, 53 | main="S&P 100 since 2007 (monthly returns)") -------------------------------------------------------------------------------- /learning/NYTWebService.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
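# Editor's note: pulls article dates matching a query from the New York Times Article
# Search API (the legacy v1 endpoint used here has since been retired), paging 10 results
# at a time via the offset parameter, then plots article counts per day with a lowess
# trend line. Requires a personal API key in `api`.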
6 | 7 | setwd("/Users/homemac/Toga/Alto") 8 | library(RJSONIO) 9 | 10 | ### set parameters ### 11 | api <- "YOUR_KEY" # API key goes here!! 12 | q <- "health+care+reform" # Query string, use + instead of space 13 | records <- 500 # total number of records to return, note limitations above 14 | # calculate parameter for offset 15 | os <- 0:(records/10-1) 16 | 17 | # read first set of data in 18 | uri <- paste ("http://api.nytimes.com/svc/search/v1/article?format=json&query=", q, "&offset=", os[1], "&fields=date&api-key=", api, sep="") 19 | raw.data <- readLines(uri, warn="F") # get them 20 | res <- fromJSON(raw.data) # tokenize 21 | 22 | dat <- unlist(res$results) # convert the dates to a vector 23 | # read in the rest via loop 24 | for (i in 2:length(os)) { 25 | # concatenate URL for each offset 26 | uri <- paste ("http://api.nytimes.com/svc/search/v1/article?format=json&query=", q, "&offset=", os[i], "&fields=date&api-key=", api, sep="") 27 | raw.data <- readLines(uri, warn="F") 28 | res <- fromJSON(raw.data) 29 | dat <- append(dat, unlist(res$results)) # append 30 | } 31 | 32 | # aggregate counts for dates and coerce into a data frame 33 | cts <- as.data.frame(table(dat)) 34 | # establish date range 35 | dat.conv <- strptime(dat, format="%Y%m%d") # need to convert dat into POSIX format for this 36 | daterange <- c(min(dat.conv), max(dat.conv)) 37 | dat.all <- seq(daterange[1], daterange[2], by="day") 38 | # all possible days 39 | # compare dates from counts dataframe with the whole data range 40 | # assign 0 where there is no count, otherwise take count 41 | # (take out PSD at the end to make it comparable) 42 | dat.all <- strptime(dat.all, format="%Y-%m-%d") 43 | 44 | # cant' seem to be able to compare Posix objects with %in%, so coerce them to character for this: 45 | freqs <- ifelse(as.character(dat.all) %in% as.character(strptime(cts$dat, format="%Y%m%d")), cts$Freq, 0) 46 | 47 | plot (freqs, type="l", xaxt="n", main=paste("Search term(s):",q), ylab="# of articles", xlab="date") 48 | axis(1, 1:length(freqs), dat.all) 49 | lines(lowess(freqs, f=.2), col = 2) -------------------------------------------------------------------------------- /learning/SP500MarketInflection.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
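# Editor's note: market-breadth check. Scrapes S&P 500 tickers from Wikipedia, then for a
# chosen date computes what fraction of constituents closed at a 52-week low (prices via
# quantmod), comparing that measure across three dates in 2011-2012.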
6 | 7 | library(XML) 8 | library(plyr) 9 | library(quantmod) 10 | #library(data.table) 11 | #library("zoo") 12 | 13 | # get the list of symbols from Wiki 14 | sp500 <- readHTMLTable('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies', 15 | which=2, header=FALSE, skip.rows=1) 16 | sp500.tickers <- as.vector(sp500$V1) 17 | #Remove LOW 18 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="LOW")] 19 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="PSX")] 20 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="XYL")] 21 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="TRIP")] 22 | sp500.tickers <- sp500.tickers[-which(sp500.tickers=="WPX")] 23 | 24 | sp500.tickers[which(sp500.tickers=="BFB")]="BF-B" 25 | 26 | isFiftyTwoWeekLow <- function(sym) { 27 | #to <- "2012/6/01" 28 | #to <- "2012/4/02" 29 | to <- "2011/10/04" 30 | from <- last(seq(as.Date(to), by="-1 week", length.out=52)) 31 | #last(seq(Sys.Date(), by="-1 week", length.out=52)) 32 | print(sym) 33 | y <- getSymbols(sym, auto.assign=FALSE, from=from, to=to) 34 | low <- ifelse(Cl(last(y))[[1]] < Cl(seriesLo(last(y, '52 weeks')))[[1]], 1, 0) 35 | return (low) 36 | } 37 | #Percent of stocks trading at 52 weeks low 38 | sp500.low <- ldply(sp500.tickers, isFiftyTwoWeekLow, .progress="text") 39 | per.stocks.trading.low.Jun1 <- (sum(sp500.low$V1)/length(sp500.low$V1))*100 40 | 41 | #Get SP500 Index data 42 | getSymbols("^gspc") 43 | Cl(last(GSPC))[[1]] #Get Last Close 44 | Cl(seriesLo(last(GSPC, '52 weeks')))[[1]] #Get Low from last 52weeks 45 | 46 | #April 2nd - SP500 closed at 1419, multi-year high - What Percent of stocks trading at 52 weeks low 47 | sp500.low.Apr2 <- ldply(sp500.tickers, isFiftyTwoWeekLow, .progress="text") 48 | per.stocks.trading.low.Apr2 <- (sum(sp500.low.Apr2$V1)/length(sp500.low.Apr2$V1))*100 49 | 50 | 51 | #October 4th - SP500 closed at 1123.95 - What Percent of stocks were trading at 52 weeks low 52 | sp500.low.Oct4 <- ldply(sp500.tickers, isFiftyTwoWeekLow, .progress="text") 53 | per.stocks.trading.low.Oct4 <- (sum(sp500.low.Oct4$V1)/length(sp500.low.Oct4$V1))*100 54 | 55 | -------------------------------------------------------------------------------- /learning/VCorpus.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | library(tm) 8 | library(twitteR) 9 | 10 | tweets <- searchTwitter("#DKOM", n=1500 ) 11 | df <- do.call("rbind", lapply(tweets, as.data.frame)) 12 | 13 | #tweets1 <- searchTwitter("#Netflix", n=1500 ) 14 | #df1 <- do.call("rbind", lapply(tweets, as.data.frame)) 15 | #dim(df) 16 | 17 | # build a corpus, which is a collection of text documents 18 | # VectorSource specifies that the source is character vectors. 19 | myCorpus <- Corpus(VectorSource(df$text)) 20 | 21 | #After that, the corpus needs a couple of transformations, including changing letters to lower case, removing punctuations/numbers and removing stop words. The general English stop-word list is tailored by adding "available" and "via" and removing "r". 
22 | myCorpus <- tm_map(myCorpus, tolower) 23 | # remove punctuation 24 | myCorpus <- tm_map(myCorpus, removePunctuation) 25 | # remove numbers 26 | myCorpus <- tm_map(myCorpus, removeNumbers) 27 | # remove stopwords 28 | # keep "r" by removing it from stopwords 29 | myStopwords <- c(stopwords('english'), "available", "via") 30 | idx <- which(myStopwords == "r") 31 | myStopwords <- myStopwords[-idx] 32 | myCorpus <- tm_map(myCorpus, removeWords, myStopwords) 33 | 34 | #Stemming Words 35 | dictCorpus <- myCorpus 36 | # stem words in a text document with the snowball stemmers, 37 | # which requires packages Snowball, RWeka, rJava, RWekajars 38 | myCorpus <- tm_map(myCorpus, stemDocument) 39 | # inspect the first three ``documents" 40 | #inspect(myCorpus[1:3]) 41 | 42 | # stem completion 43 | myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=dictCorpus) 44 | 45 | #inspect(myCorpus[1:3]) 46 | 47 | #Building a Document-Term Matrix 48 | myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1)) 49 | inspect(myDtm[266:270,31:40]) 50 | 51 | #Frequent Terms and Associations 52 | findFreqTerms(myDtm, lowfreq=10) 53 | 54 | # which words are associated with "r"? 55 | #findAssocs(myDtm, 'r', 0.30) 56 | 57 | # which words are associated with "mining"? 58 | # Here "miners" is used instead of "mining", 59 | # because the latter is stemmed and then completed to "miners". :-( 60 | #findAssocs(myDtm, 'miners', 0.30) 61 | 62 | 63 | library(wordcloud) 64 | m <- as.matrix(myDtm) 65 | # calculate the frequency of words 66 | v <- sort(rowSums(m), decreasing=TRUE) 67 | myNames <- names(v) 68 | #k <- which(names(v)=="miners") 69 | #myNames[k] <- "mining" 70 | d <- data.frame(word=myNames, freq=v) 71 | wordcloud(d$word, d$freq, min.freq=3) 72 | -------------------------------------------------------------------------------- /learning/DataParser.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | library(RJSONIO) 8 | library(stringr) 9 | 10 | #Set content type for output 11 | setContentType("text/html") 12 | #Query Engine - dfp datafilepath 13 | dfp <- NULL 14 | udir <- NULL 15 | 16 | #Convert to a ROW JSON 17 | getRowWiseJson <- function (jsonDT) { 18 | row.json <- apply(jsonDT, 1, toJSON) 19 | json.st <- paste('[', paste(row.json, collapse=', '), ']') 20 | return (json.st) 21 | } 22 | 23 | 24 | # Read SachinTestRecords.csv 25 | readDataSet <- function(file.name) { 26 | return( 27 | read.csv(file.name, na.strings="-", as.is=TRUE, header=TRUE, 28 | stringsAsFactors=FALSE, strip.white=TRUE) 29 | ) 30 | } 31 | readRequest <- function() { 32 | #GetString <- str(GET) 33 | #cat(GetString) 34 | dfp <<- GET$dfp 35 | udir <<- GET$udir 36 | #cat(d, "," ,m, "," ,f, "," ,qds, sep=" ") 37 | #cat("
") 38 | } 39 | writeResponse <- function(df, datafile, dsID) { 40 | sJSON = paste('{"upad":"', datafile,'","dsID":"', dsID,'","colTypes":',toJSON(unlist(lapply(df, class), use.names=FALSE)),', "rows":', getRowWiseJson(head(df,50)), '}', sep="") 41 | #Write response back 42 | cat(sJSON) 43 | } 44 | #Step 1 - Read Request, query string 45 | readRequest() 46 | 47 | #Step 2 - load environment 48 | df <- readDataSet(dfp) 49 | #Remove white space in col names 50 | colnames(df) <- str_replace(colnames(df), ' ', '') 51 | 52 | #Get Unique ID 53 | dsID <- paste("upad", substr(as.character(unclass(Sys.time())), 12,16), sep="") 54 | out.file.csv <- paste(udir,"analyzed/", dsID, ".csv", sep="") 55 | #out.file.JSON <- paste("pads/meta/",dsID, ".JSON", sep="") 56 | out.file.R.dataframes <- paste(udir, dsID, ".RData", sep="") 57 | write.csv(df, out.file.csv) 58 | 59 | # Write JSON 60 | #jsonSt <- paste('{ 61 | # "name":"', file.name, '", 62 | # "id":"', dsID, '",', sep=""); 63 | #dlist <- paste() 64 | # "dList": { 65 | # "Player":', toJSON(levels(factor(ipl$Player))), ' 66 | # }, 67 | # "mList": { 68 | # "Mat":"", 69 | # "Sixes":"" 70 | # }, 71 | # "dListType": { 72 | # }, 73 | # "source":"CricInfo" 74 | # }', sep="") 75 | #file.out <- file(out.file.JSON, 'wt') 76 | #cat(jsonSt, file=file.out, fill = TRUE) 77 | #close(file.out) 78 | #Save transformed dataset as data.frame for later reading 79 | assign(dsID, df) 80 | names <- c(eval(dsID)) 81 | save(list=names, file=out.file.R.dataframes) 82 | 83 | #Step 4 - Write Response 84 | writeResponse(df, out.file.R.dataframes, dsID) 85 | -------------------------------------------------------------------------------- /learning/MiscR.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
6 | 7 | responseDF <- switch(fnType, 8 | sum = getSum(df, d, m), 9 | count = getCount(df, d, m), 10 | 'default' 11 | ) 12 | if(!is.null(g) && g == "t") 13 | responseDF <- geoCode(responseDF) 14 | return(responseDF) 15 | 16 | { 17 | "name": "Australia", 18 | "center": new google.maps.LatLng(41.878113, -87.629798), 19 | "data": 2842518 20 | }, 21 | 22 | JSON <- with(head(mbDS), paste('{"d":"',mbDS$Country, 23 | '", "center": new google.maps.LatLng(', 24 | mbDS$Country.Lat, ',', mbDS$Country.Lng, 25 | '), "data":', mbDS$Test.Runs, 26 | '},', sep="")) 27 | 28 | m <- "Country" 29 | lat <- paste(m,".Lat", sep="") 30 | sumDS <- ddply(mbDS, .(Country), summarise, Test.Runs=sum(Test.Runs, na.rm=TRUE)) 31 | JSON <- with(sumDS, paste('{"d":"',sumDS[m], 32 | '", "center": new google.maps.LatLng(', 33 | mbDS[lat][sumDS[m]==mbDS[m]], ',', mbDS$Country.Lng[mbDS$Country=="India"], 34 | '), "data":', sumDS$Test.Runs, 35 | '},', sep="")) 36 | 37 | 38 | lookupLat <- function(m, i="India") 39 | { 40 | lat <- paste(m,".Lat",sep="") 41 | return(mbDS[lat][mbDS[m]==i][1]) 42 | } 43 | lookupLat("Country", "India") 44 | } 45 | } 46 | with(sumDS, data.frame(Country, Test.Runs, laply(Country, function(x){getGeoCode(x, x)}))) 47 | 48 | ddply(mbDS, "Opposition", function(x) c(Lat=unique(x$Opposition.Lat, Lng=unique(x$Opposition.Lng)))) 49 | 50 | with(sumDS, data.frame(Country, Test.Runs, laply(Country, function(x){lookupGeoCode(x)}))) 51 | 52 | 53 | with(responeDF, paste('{"d":"',responseDF[m] 54 | '", "center": new google.maps.LatLng(', 55 | df[latName][df[m]==responeDF[m]][1], ',', df[lngName][df[m]==responeDF[m]][1], 56 | '), "data":', mbDS$Test.Runs, 57 | '},', sep="")) 58 | 59 | 60 | sumDS <- ddply(mbDS, .(Opposition), summarise, Test.Runs=sum(Test.Runs, na.rm=TRUE)) 61 | 62 | d <- "Opposition" 63 | latName <- paste(d, ".Lat", sep="") 64 | lngName <- paste(d, ".Lng", sep="") 65 | m <- "Test.Runs" 66 | responseDF <- sumDS 67 | df <- mbDS 68 | with(responseDF, paste('{"d":"',responseDF[d], 69 | '", "center": new google.maps.LatLng(', 70 | df[latName][df[d]==responseDF[d]][1], 71 | ',', df[lngName][df[d]==responseDF[d]][1], 72 | '), "data":', responseDF[m], 73 | '},', sep="")) 74 | -------------------------------------------------------------------------------- /learning/TransformDataSet.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
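# Editor's note: exploratory notes on a Bollywood films dataset plus IPL and Sachin
# Tendulkar fragments. Counts films per year featuring the Deol/Kapoor/Khan/Bachchan
# families with data.table, reshapes with melt() and plots stacked bars with ggplot2.
# Several later fragments are incomplete and kept only as notes.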
6 | 7 | rm (list = ls()) 8 | setwd("~/Toga/Alto") 9 | library("chron") 10 | library("plyr") 11 | library("RJSONIO") 12 | library("data.table") 13 | library("ggplot2") 14 | in.file.name <- "datasets/BollywoodCinema-1940-2008.csv" 15 | out.file.name <- "datasets/masterblaster/TransformedSachinTestRecords.csv" 16 | out.file.JSON <- "datasets/masterblaster/SachinTestRecords.JSON" 17 | out.file.R.dataframes <- "datasets/masterblaster/MB.RData" 18 | dt <- data.table(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE, 19 | stringsAsFactors=FALSE, strip.white=TRUE)) 20 | 21 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December") 22 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday") 23 | 24 | 25 | # Line plot 26 | ggplot(totmidc, aes(variable, value)) + geom_line() + xlab("") + ylab("") 27 | 28 | # Bar plot 29 | # Note that the parameter stat="identity" passed to geom_bar() 30 | ggplot(totmidc, aes(x=variable, y=value)) + geom_bar(stat="identity") + xlab("") + ylab("") 31 | 32 | 33 | d <- dt[,list(Deols=sum(grepl("deol|Dharmendra",Cast, ignore.case=TRUE)), 34 | Kapoors=sum(grepl("kapoor",Cast, ignore.case=TRUE)), 35 | Khans=sum(grepl("khan",Cast, ignore.case=TRUE)), 36 | Bachchans=sum(grepl("bachchan",Cast, ignore.case=TRUE))), 37 | by=year] 38 | d1 <- melt(d, id.var="year") 39 | ggplot(d1)+geom_bar(aes(x=year,y=value,fill=variable),stat='identity') 40 | 41 | ggplot(d, aes(year))+ geom_bar() +facet_wrap(~n) 42 | ggplot(d, aes(x=factor(year), y=c(Deols, Kapoors))) + geom_bar(stat="identity") + xlab("") + ylab("Deols") 43 | n <- c("Deols", "Kapoors") 44 | ggplot(d, aes(x=year, )) + geom_bar(stat="identity") + facet_wrap(n) 45 | 46 | 47 | 48 | dat<-data.frame(num=1:3,usage=c(4,2,5),cap=c(10,20,10),diff=c(6,18,5)) 49 | dat.melt<-melt(dat,id.var=c('num','cap')) 50 | ggplot(dat.melt)+geom_bar(aes(x=num,y=value,fill=variable),stat='identity') 51 | 52 | #IPL DS 53 | in.file.name <- "pads/data/pad56638.csv" pad89706 54 | dt <- data.table(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE, 55 | stringsAsFactors=FALSE, strip.white=TRUE)) 56 | dt[,Sixes, by=Player][order(-Sixes)] 57 | 58 | getCount <- function(dat, expr, gby) { 59 | e <- substitute(expr) 60 | b <- substitute(gby) 61 | print(dat[,eval(e),by=b]) 62 | } 63 | getCount(dt, sum(Sixes), Player) 64 | 65 | q <- quote(Sixes) 66 | q1 <- quote(Player) 67 | dt[,eval(q)), by=q1 ][order(-eval(q))] 68 | dt[,q,by=q1] 69 | 70 | #IPL DS 71 | in.file.name <- "pads/data/pad89706.csv" 72 | mbdt <- data.table(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE, 73 | stringsAsFactors=FALSE, strip.white=TRUE)) 74 | mbdt[,list(sum(as.integer(Test_Runs), na.rm=TRUE)), by=Year][order(Year)] 75 | -------------------------------------------------------------------------------- /learning/geoCode.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
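# Editor's note: geocoding experiments against the Google Maps JSON API. getGeoCode()
# fetches lat/lng for a place name with RJSONIO, and the rest tries several ways
# (for-loop, ddply, laply) of attaching coordinates to the loaded dataset; the block
# marked "Working Version" is the variant that ran.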
6 | 7 | library("plyr") 8 | 9 | mb.df <- "MB.RData" 10 | load(mb.df) 11 | 12 | getGeoCode <- function(str) 13 | { 14 | print(str) 15 | library("RJSONIO") 16 | str <- gsub(' ','%20',str) 17 | connectStr <- paste('http://maps.google.com/maps/api/geocode/json?sensor=false&address=',str, sep="") 18 | print(connectStr) 19 | #data.json <- fromJSON(paste(readLines(url("http://maps.google.com/maps/api/geocode/json?sensor=false&address=South%20Africa")), collapse="")) 20 | data.json <- fromJSON(paste(readLines(url(connectStr)), collapse="")) 21 | close(con) 22 | data.json <- unlist(data.json) 23 | #print(data.json["results.geometry.location.lat"]) 24 | #return (lat = data.json["results.geometry.location.lat"]) 25 | return (c(lat=data.json["results.geometry.location.lat"], lng=data.json["results.geometry.location.lng"])) 26 | } 27 | printVal <- function(val){ 28 | print(val) 29 | return ("YO") 30 | } 31 | shortDS <- with(head(mbDS, 20), data.frame(Opposition, Ground, Toss)) 32 | shortDS <- mutate(shortDS, Oppostion, Ground, Toss, lat=printVal(Opposition)) 33 | 34 | cn <- levels(mbDS$Opposition) 35 | llply(cn, function(x) {getGeoCode(x)}) 36 | ds <- transform(mbDS, lat=getGeoCode(Opposition)) 37 | d <- data.frame(Country=cn) 38 | d$lng <- apply(d, 1, function(x) {getGeoCode(x)}) 39 | 40 | 41 | printVal <- function(val){ 42 | print(val) 43 | return ("YO") 44 | } 45 | ddply(shortDS, "Opposition", function(x){ getGeoCode(x$Opposition[1])}) 46 | 47 | 48 | 49 | #************* Working Version 50 | df <- NULL 51 | #code <- with(head(mbDS, 4), getGeoCode(Opposition)) 52 | #m <- ddply(mbDS, "Opposition", function(x){c("lat", "lag")}) 53 | #t <- getGeoCode("Bangalore, India") 54 | cn <- levels(mbDS$Opposition) 55 | for(i in 1:length(cn)) { 56 | t <- getGeoCode(cn[i]) 57 | r <- c(cn[i], t) 58 | df <- rbind(df, r) 59 | } 60 | #************* Working Version 61 | 62 | getGeoCode <- function(str) 63 | { 64 | library("RJSONIO") 65 | str <- gsub(' ','%20',str) 66 | connectStr <- paste('http://maps.google.com/maps/api/geocode/json?sensor=false&address=',str, sep="") 67 | #print(connectStr) 68 | con <- url(connectStr) 69 | data.json <- fromJSON(paste(readLines(con), collapse="")) 70 | close(con) 71 | data.json <- unlist(data.json) 72 | #print(data.json["results.geometry.location.lat"]) 73 | #return (lat = data.json["results.geometry.location.lat"]) 74 | return (c(lat=data.json["results.geometry.location.lat"], lng=data.json["results.geometry.location.lng"])) 75 | } 76 | 77 | shortDS <- with(head(mbDS, 20), data.frame(Opposition, Ground.Country, Toss)) 78 | #v <- shortDS$Ground.Country 79 | #g <- laply(v, function(x){getGeoCode(x)}) 80 | shortDS <- with(shortDS, data.frame(Opposition, Ground.Country, Toss, 81 | laply(Ground.Country, function(val){getGeoCode(val)}))) 82 | closeAllConnections() 83 | 84 | -------------------------------------------------------------------------------- /GenerateFredPADS.R: -------------------------------------------------------------------------------- 1 | # Generate FRED PADS 2 | # Author: Jitender Aswani, Co-Founder @datadolph.in 3 | # Date: 3/15/2013 4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 6 | # All rights reserved. 
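# Editor's note: builds pads from downloaded FRED series. Reads FRED_series.csv, creates a
# pad per series via createPAD(), accumulates pad metadata and Solr index documents, and
# saves the results with saveData(). Helpers come from CreatePADS.R; note the loop is
# capped at the first two series (1:2) for testing instead of 1:nrow(fred).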
7 | # 8 | 9 | #Generate PADS from FRED data 10 | setwd("/Users/homemac/R") 11 | source("CreatePADS.R") 12 | 13 | padifyFRED <- function(){ 14 | folder.path = "./raw-data/FRED/" 15 | file.name = "FRED_series.csv" 16 | fred <- read.csv(paste(folder.path, file.name, sep=""), stringsAsFactors=F) 17 | #pads <- data.frame(pad_id=character(), title=character(), desc=character(), records=numeric(), columns=numeric(), 18 | # category_id=character(), subcategory_id=character(), analyzed=numeric(), stories_created=numeric(), 19 | # stories_published=numeric(), "source"=character(), source_data_file=character(), 20 | # created_on=character(), last_updated=character(), tags=character()) 21 | pads <- getSystemFREDPads() 22 | solr.index.docs <- list() 23 | series <- list() 24 | series["source"] <- "FRED" 25 | #for(i in 1:nrow(fred)){ 26 | for(i in 1:2){ 27 | #series["title"] <- ifelse(nchar(fred$Title[i]) > 58, 28 | # paste(substr(fred$Title[i], 1, 58), "...", sep=""), fred$Title[i]) 29 | series["title"] <- fred$Title[i] 30 | series["desc"] <- paste(fred$Title[i], " (Units:", fred$Units[i], 31 | ", Frequency:", fred$Frequency[i], ", Seasonal Adjustmet: ", 32 | fred$Seasonal.Adjustment[i], ")", sep="") 33 | series["category"] <- fred$category_name[i] 34 | series["subcategory"] <- fred$subcategory_name[i] 35 | series["category_id"]<- fred$category_id[i] 36 | series["subcategory_id"]<- fred$subcategory_id[i] 37 | 38 | series.data <- read.csv(paste(folder.path, "data/", gsub(" ", "", gsub("\\\\", "/", fred$File[i])), sep=""),stringsAsFactors=F) 39 | print(paste("Starting file #", i, " name: " , fred$File[i], sep="")) 40 | #print(series.data) 41 | #print(series) 42 | #Create pad and get pad meta data 43 | pmd <- createPAD(series.data, series$title, 44 | series$desc, series$category, series$subcategory, 45 | series$source, paste(series$name, series$country, sep="."), 46 | tolower(paste(series$category, series$subcategory, series$source, sep=","))) 47 | #l <- list(pmd$id, pmd$title, pmd$desc, pmd$records, pmd$columns, series["category_id"], 48 | # series["subcategory_id"], pmd$analyzed, pmd$stories_created, pmd$stories_published, 49 | # pmd$src, pmd$src_file, "", "", pmd$tags) 50 | 51 | pads <- rbind(pads, 52 | data.frame(pad_id=pmd$id[1], title=pmd$title[1], desc=pmd$desc[1], 53 | records=pmd$records[1], columns=pmd$columns[1], 54 | category_id=series$category_id, subcategory_id=series$subcategory_id, 55 | analyzed=pmd$analyzed[1], stories_created=pmd$stories_created[1], 56 | stories_published=pmd$stories_published[1], 57 | "source"=pmd$src[1], source_data_file=pmd$src_file[1], created_on="", last_updated="", 58 | tags=pmd$tags[1])) 59 | # 60 | #generate solr doc 61 | # 62 | doc <- list("type"="pad", "id" = pmd$id, "title" = pmd$title, "desc"=pmd$desc, "category"=pmd$category, 63 | "subcategory"=pmd$subcategory, "tags"=pmd$tags, "author"="system") 64 | solr.index.docs[[length(solr.index.docs)+1]] <- doc 65 | } 66 | # now save solr doc 67 | generateSolrDocFromAllDocs(solr.index.docs) 68 | 69 | # Save pads to a csv file 70 | saveData(pads, "./pads/system_pads.csv") 71 | 72 | #now save pads to db 73 | #scon <- getDBCon() 74 | #dbRemoveTable(scon, "system_pads") 75 | #dbWriteTable(scon, name="system_pads", value=pads, row.names = F, overwrite = T) 76 | #disconnectDB(scon) 77 | 78 | } 79 | 80 | padifyFRED() -------------------------------------------------------------------------------- /learning/TransformFailedBanks.R: -------------------------------------------------------------------------------- 1 | # Author: 
Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | rm (list = ls()) 8 | setwd("~/Toga/Alto") 9 | library("chron") 10 | library("plyr") 11 | library("RJSONIO") 12 | #Get Unique ID 13 | dsID <- paste("pad", substr(as.character(unclass(Sys.time())), 12,16), sep="") 14 | out.file.csv <- paste("pads/data/",dsID, ".csv", sep="") 15 | out.file.JSON <- paste("pads/meta/",dsID, ".JSON", sep="") 16 | out.file.R.dataframes <- paste("pads/cache/", dsID, ".RData", sep="") 17 | in.file.name <- "datasets/failedbanks/failedbanksdata.csv" 18 | 19 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December") 20 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday") 21 | 22 | # Read *.csv 23 | readDataSet <- function(file.name) { 24 | return( 25 | read.csv(file.name, na.strings="-", as.is=TRUE, header=TRUE, 26 | stringsAsFactors=FALSE, strip.white=TRUE) 27 | ) 28 | } 29 | 30 | # Transform raw dataset 31 | transformDataSet <- function(df) { 32 | tdf <- mutate(df, 33 | Date=as.Date(Date, "%m/%d/%Y"), 34 | Day=factor(weekdays(Date), levels=lDays, ordered=TRUE), 35 | Month=factor(months(Date), levels=lMonths, ordered=TRUE), 36 | Year=years(Date), 37 | Bank.Name=Bank.Name, 38 | City=as.factor(City), 39 | State=as.factor(State), 40 | Assets=as.numeric(gsub('[\\$,]', '', Assets)), 41 | Deposits=as.numeric(gsub('[\\$,]', '', Deposits)), 42 | Branches=as.numeric(Branches), 43 | FDIC.Cost=as.numeric(gsub('[\\$,]', '', FDIC.Cost)), 44 | Lat=as.numeric(Lat), 45 | Lng=as.numeric(Lng) 46 | ) 47 | return(with(tdf, data.frame(Date, Day, Month, Year, 48 | Bank.Name, City, State, Assets, Deposits, Branches, FDIC.Cost, 49 | Lat, Lng 50 | )) 51 | ) 52 | } 53 | 54 | writeTransformedDS <- function(df, file.name) { 55 | write.csv(df, file.name) 56 | } 57 | 58 | writeJSON <- function(df, file.JSON, id) { 59 | 60 | jsonSt <- paste('{ 61 | "datasetName": "FailedBanks", 62 | "dsID": id, 63 | "dList": { 64 | "Year":', toJSON(levels(df$Year)), ', 65 | "Month":', toJSON(levels(df$Month)), ', 66 | "Day":', toJSON(levels(df$Day)), ', 67 | "Bank.Name":', toJSON(df$Bank.Name), ', 68 | "City":', toJSON(levels(df$City)), ', 69 | "State":', toJSON(levels(df$State)), 70 | '}, 71 | "mList": { 72 | "Assets":"", 73 | "Deposits":"", 74 | "Branches":"", 75 | "FDIC.Cost":"" 76 | } 77 | }') 78 | file.out <- file(file.JSON, 'wt') 79 | cat(jsonSt, file=file.out, fill = TRUE) 80 | close(file.out) 81 | } 82 | 83 | # Read Raw File and Transform it 84 | fbDS <- transformDataSet(readDataSet(in.file.name)) 85 | 86 | #Write Transformed data set to a CSV file 87 | writeTransformedDS(fbDS, out.file.csv) 88 | 89 | #Generate JSON with m and d 90 | writeJSON(fbDS, out.file.JSON, id) 91 | 92 | #Save transformed dataset as data.frame for later reading 93 | #save(fbDS, file=out.file.R.dataframes) 94 | #Save transformed dataset as data.frame for later reading 95 | assign(dsID, fbDS) 96 | names <- c(eval(dsID)) 97 | save(list=names, file=out.file.R.dataframes) 98 | pads <- rbind(pads, data.table(id=dsID, title="Failed Banks Analysis in US (2008-2011)", 99 | subtitle="This data set explores US Banks that perished during the 2008 downturn.", 100 | records=nrow(fbDS), analyzed=0, stories=0, json=out.file.JSON, data=out.file.csv, 101 | 
cache=out.file.R.dataframes, source="Federal Reserve") 102 | ) -------------------------------------------------------------------------------- /learning/SQLQueryDataFrame.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | rm (list = ls()) 8 | setwd("~/Toga/Alto") 9 | 10 | #load pre-saved dataframe 11 | out.file.R.dataframes <- "datasets/masterblaster/mb.RData" 12 | load(out.file.R.dataframes) 13 | 14 | library(plyr) 15 | 16 | ## Operate on a data.frame, split by "by", by could be a vector or a single filed 17 | ## aggreage by columns in vector on, on could be a single field also 18 | getSum <- function(df, by, on) { 19 | #print(sum(df[,on], na.rm=TRUE)) 20 | return(ddply(df, by, colwise(function(col) {sum(col, na.rm=TRUE)}, on))) 21 | } 22 | ## Try it out 23 | crpopy <- getSum(mbDS, .(Opposition, Year), .(Test.Century, Test.Runs)) 24 | 25 | ## Operate on a data.frame, split by "by", by could be a vector or a single filed 26 | ## and aggreage by columns in vector on, on could be a single field also 27 | getCount <- function(df, by, on) { 28 | return(ddply(df, by, colwise(function(col) {length(unique(col))}, on))) 29 | } 30 | ## Try it out 31 | sx <- getCount(mbDS, "Year", c("TestNo", "Ground")) 32 | 33 | #For counting column that has 0 or 1 value :: Very similar to getSum - the answer should be same 34 | getBinaryCount <- function(df, by, on) { 35 | return(ddply(df, by, colwise(function(x) {length(which(x==1))}, on))) 36 | #length(which(mbDS[,"Test.Century"] ==1)) 37 | } 38 | ## Try it out 39 | cpo <- getBinaryCount(mbDS, "Opposition", c("Ing1.Century", "Ing2.Century", "Test.Century")) 40 | 41 | ###################### Samples #################### 42 | 43 | #Runs by Opposition 44 | #rpo <- ddply(mbDS, "Opposition", function(x) c(TotalRuns=sum(x$Test_Runs, na.rm=TRUE))) 45 | #rpo <- ddply(mbDS, .(Opposition), summarise, TotalRuns=sum(Test_Runs, na.rm=TRUE)) 46 | rpo <- getSum(mbDS, "Opposition", "Test.Runs") 47 | 48 | #Runs by Opposition by Inning 49 | rpopi <- getSum(mbDS, "Opposition", c("Ing1.Runs", "Ing2.Runs", "Test.Runs")) 50 | 51 | #Runs by Year 52 | #rpy <- ddply(mbDS, .(Year), summarise, TotalRuns = sum(Test.Runs, na.rm=TRUE)) 53 | rpy <- getSum(mbDS, "Year", "Test_Runs") 54 | 55 | #Tests Per Opposition 56 | tpo <- getCount(mbDS, "Opposition", "TestNo") 57 | 58 | #Tests Per Year 59 | tpy <- getCount(mbDS, "Year", "TestNo") 60 | 61 | #Runs by Opposition by Year - userful for clicking on an opposition and drilling down by year 62 | #rpopy <- ddply(mbDS, .(Year, Opposition), summarise, TotalRuns = sum(Test_Runs, na.rm=TRUE)) 63 | rpopy <- getSum(mbDS, c("Opposition", "Year"), "Test.Runs") 64 | 65 | #Tests Per Opposition By Year 66 | tpopy <- getCount(mbDS, c("Opposition", "Year"), "TestNo") 67 | 68 | #Runs and Tests Per Opposition 69 | rtpo <- getSum1Count2(mbDS, "Opposition", c("Test.Runs", "TestNo")) ### Doesn't work 70 | 71 | #Total Runs by Opposiotn by Inning 72 | #rpopi <- ddply(mbDS, .(Opposition), summarise, 73 | # 1stIngRuns = sum(Ing1.Runs, na.rm=TRUE), 74 | # 2ndIngRuns = sum(Ing2.Runs, na.rm=TRUE), 75 | # TotalRuns = sum(Test_Runs, na.rm=TRUE)) 76 | 77 | #Total Runs by Year by Opposition - userful for clicking on a year to drill down 78 | 
#rpypo <- ddply(mbDS, .(Year, Opposition), summarise, TotalRuns = sum(Test_Runs, na.rm=TRUE)) 79 | 80 | 81 | #Total Centuries by Opposition 82 | #CPO <- ddply(mbDS, .(Opposition), summarise, TestCenturies = sum(Test.Century==1, na.rm=TRUE)) 83 | cpo <- getBinaryCount(mbDS, "Opposition", "Test.Century") 84 | #cpo1 <- getSum(mbDS, "Opposition", "Test.Century") 85 | 86 | cpo <- getBinaryCount(mbDS, "Opposition", c("Ing1.Century", "Ing2.Century", "Test.Century")) 87 | 88 | 89 | #Centuries Per Year 90 | cpy <- getBinaryCount(mbDS, "Year", "Test.Century") 91 | 92 | #Centuries Per Opposition By Year 93 | cpopy <- getBinaryCount(mbDS, c("Opposition", "Year"), "Test.Century") 94 | 95 | #Mege T, R, C by Year 96 | trcpy <- merge(x=merge(tpy,rpy, by.x="Year", by.y="Year"), cpy, by.x="Year", by.y="Year") 97 | colnames(trcpy) <- c("Year", "Tests", "Runs", "Centuries") 98 | 99 | 100 | # Passing multiple arguments ?colwise example 101 | #sum <- function(x){sum(x, na.rm=TRUE)} 102 | #new <- ddply(tips, c("sex", "smoker"), colwise(function(x) {sum(x, na.rm=TRUE)}, c("tip", "total_bill"))) 103 | #new <- ddply(tips, c("sex", "smoker"), colwise(sum, c("tip", "total_bill"))) -------------------------------------------------------------------------------- /learning/DGQueryEngine.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | library(plyr) 8 | library(RJSONIO) 9 | #setwd("/Users/homemac/Toga/Alto") 10 | #Set content type for output 11 | setContentType("text/html") 12 | #Query Engine - qds - query this database, d=dimension, m=measure, f=funciton 13 | qds <- d <- m <- f <- g <- NULL 14 | #d <- "" 15 | #m <- "" 16 | #f <- "" 17 | 18 | #load pre-analyzed and saved dataframe 19 | loadDS <- function(qds) { 20 | if(exists(qds)) { 21 | #lpads <- get("pads", envir=.GlobalEnv) 22 | #cat(exists(pads$sysID[pads$id==qds], envir=.GlobalEnv), "
") 23 | #cat(qds) 24 | return(get(qds)) #get a local copy in this enviornment 25 | } else { 26 | cat(paste('{"response_code": "error", "error_message": "The pad ', qds, ' does not exist. Check to see if you start up R script in RApache ran correctly."}', sep="")) 27 | } 28 | } 29 | 30 | ## Operate on a data.frame, split by "by", by could be a vector or a single filed 31 | ## aggreage by columns in vector on, on could be a single field also 32 | getSum <- function(df, by, on) { 33 | #print(sum(df[,on], na.rm=TRUE)) 34 | #cat(paste(by, on, sep=", ")) 35 | return(ddply(df, by, colwise(function(col) {sum(col, na.rm=TRUE)}, on))) 36 | } 37 | 38 | getMean <- function(df, by, on) { 39 | #print(sum(df[,on], na.rm=TRUE)) 40 | #cat(paste(by, on, sep=", ")) 41 | return(ddply(df, by, colwise(function(col) {mean(col, na.rm=TRUE)}, on))) 42 | } 43 | 44 | #ddply(mbDS, "Opposition", function(tDF){data.frame(Lat=tDF$Opposition.Lat[1], Lng=tDF$Opposition.Lng[1], Test.Runs=sum(tDF$Test.Runs, na.rm=TRUE))}) 45 | ## Operate on a data.frame, split by "by", by could be a vector or a single filed 46 | ## and aggreage by columns in vector on, on could be a single field also 47 | #getCount <- function(df, by, on) { 48 | # return(ddply(df, by, colwise(function(col) {length(unique(col))}, on))) 49 | #} 50 | 51 | getCount<- function(df, by, on) { 52 | lds <- ddply(df, by, "nrow") 53 | colnames(lds) <- c(by,on) 54 | return(lds) 55 | } 56 | 57 | #For counting column that has 0 or 1 value :: Very similar to getSum - the answer should be same 58 | getBinaryCount <- function(df, by, on) { 59 | return(ddply(df, by, colwise(function(x) {length(which(x==1))}, on))) 60 | #length(which(mbDS[,"Test.Century"] ==1)) 61 | } 62 | 63 | readRequest <- function() { 64 | #GetString <- str(GET) 65 | #cat(GetString) 66 | qds <<- GET$qds 67 | d <<- GET$d 68 | m <<- GET$m 69 | f <<- GET$f 70 | g <<- GET$g 71 | #cat(d, "," ,m, "," ,f, "," ,qds, sep=" ") 72 | #cat("
") 73 | } 74 | 75 | processRequest <- function(fnType, df){ 76 | return(switch(fnType, 77 | s = getSum(df, d, m), 78 | c = getCount(df, d, m), 79 | m = getMean(df,d,m), 80 | d = getValue(df,d,m), 81 | 'default' 82 | ) 83 | ) 84 | } 85 | 86 | writeResponse <- function(responseDF, df) { 87 | sJSON <- paste('{"response_code": "ok","', m, '":', toJSON(responseDF[,m]), sep="") #JSON String is not closed. 88 | if(!is.null(g) && g == "t") { 89 | latName <- paste(d, ".Lat", sep="") 90 | lngName <- paste(d, ".Lng", sep="") 91 | sJSON <- paste(sJSON, ', "mapsJSON": [', sep="") 92 | for(i in 1:nrow(responseDF)) { 93 | key <- responseDF[,d][i] 94 | #cat(paste ("
", key , ", what?")) 95 | j <- match(key, df[,d]) 96 | sJSON <- paste(sJSON, '{"d":"', key, 97 | '", "lat":', 98 | df[latName][j,], 99 | ', "lng":', df[lngName][j,], 100 | ', "data":', responseDF[i,2], 101 | '}', sep="") 102 | if(i!=nrow(responseDF)) 103 | sJSON <- paste(sJSON, ",", sep="") 104 | } 105 | sJSON <- paste(sJSON, ']', sep="") 106 | } 107 | sJSON = paste(sJSON, "}", sep="") 108 | #Write response back 109 | cat(sJSON) 110 | #cat(ls()) 111 | #rm(list=ls()) #Clean up everything - There has to be a better way than this. 112 | #cat(ls()) 113 | } 114 | 115 | #Step 1 - Read Request, query string 116 | readRequest() 117 | 118 | #Step 2 - load environment 119 | df <- loadDS(qds) 120 | 121 | if(!is.null(df)) { 122 | #Step 3 - Process Request 123 | responseDF <- processRequest(f, df) 124 | #Step 3.a - Apply Filter 125 | 126 | #Step 4 - Write Response 127 | writeResponse(responseDF, df) 128 | } -------------------------------------------------------------------------------- /learning/DGQEWebSockets.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | library(plyr) 7 | library(RJSONIO) 8 | library(data.table) 9 | #Set content type for output 10 | setContentType("text/html") 11 | #Query Engine - qds - query this database, d=dimension, m=measure, f=funciton 12 | qds <- d <- m <- f <- g <- fv <- NULL 13 | 14 | #Convert to a ROW JSON 15 | getRowWiseJson <- function (jsonDT) { 16 | row.json <- apply(jsonDT, 1, toJSON) 17 | json.st <- paste('[', paste(row.json, collapse=', '), ']') 18 | return (json.st) 19 | } 20 | 21 | #load pre-analyzed and saved dataframe 22 | loadDS <- function(qds) { 23 | if(exists(qds, envir=.GlobalEnv)) { 24 | #lpads <- get("pads", envir=.GlobalEnv) 25 | #cat(exists(pads$sysID[pads$id==qds], envir=.GlobalEnv), "
") 26 | return(get(qds, envir=.GlobalEnv)) #get a local copy in this enviornment 27 | } else { 28 | #print("PAD didn't exist. Check to see if you start up R script in RApache ran correctly.") 29 | cat(paste('{"response_code": "error", "error_message": "The pad ', qds, ' does not exist. Check to see if you start up R script in RApache ran correctly."}', sep="")) 30 | } 31 | } 32 | 33 | getData <- function(dat, expr, gby) { 34 | e <- substitute(expr) 35 | b <- substitute(gby) 36 | #cat("
", gby) 37 | #cat(b) 38 | #cat(dat[,eval(e), by=gby]) 39 | return(dat[,eval(e),by=b]) 40 | } 41 | #getData(padDT, sum(eval(m), na.rm=TRUE), d) 42 | 43 | readRequest <- function() { 44 | #GetString <- str(GET) 45 | #cat(GetString) 46 | qds <<- GET$qds 47 | d <<- GET$d 48 | m <<- GET$m 49 | f <<- GET$f 50 | g <<- GET$g 51 | fv <<- GET$fv 52 | #cat(d, "," ,m, "," ,f, "," ,qds, sep=" ") 53 | #cat("
") 54 | } 55 | #Proecess Request 56 | processRequest <- function(fnType, padDT, mea, dim, fvalue){ 57 | rs <- NULL 58 | if(is.null(fvalue)) { 59 | switch(fnType, 60 | d={rs <- padDT[,list(m=get(mea)),by=dim]}, #default by accpets character function 61 | s= { rs <- padDT[, list(m=sum(get(mea), na.rm=TRUE)), by=dim]}, #sum 62 | c= {rs <- padDT[, list(m=length(na.omit(get(mea)))), by=dim]}, #count 63 | m= {rs <- padDT[,list(m=mean(get(mea), na.rm=TRUE)),by=eval(dim)]}, #Average, eval for dim also works 64 | 'default' 65 | ) 66 | } else { 67 | switch(fnType, 68 | d={rs <- padDT[,list(m=get(mea)),by=dim][get(dim)==fvalue]}, #default by accpets character function 69 | s= { rs <- padDT[, list(m=sum(get(mea), na.rm=TRUE)), by=dim][get(dim)==fvalue]}, #sum 70 | c= {rs <- padDT[, list(m=length(na.omit(get(mea)))), by=dim][get(dim)==fvalue]}, #count 71 | m= {rs <- padDT[,list(m=mean(get(mea), na.rm=TRUE)),by=eval(dim)][get(dim)==fvalue]}, #Average, eval for dim also works 72 | 'default' 73 | ) 74 | } 75 | return (rs) 76 | } 77 | #test this out 78 | #dt <- data.table(iris) 79 | #m <- "Sepal.Length" 80 | #d <- "Species" 81 | #fv <- NULL 82 | #processRequest("s", padDT=dt, mea=m, dim=d, fvalue=fv) 83 | 84 | writeResponse <- function(responseDF, padDT) { 85 | if(is.null(d)) { 86 | responseDF$d = "" 87 | } else #Change the name to d 88 | setnames(responseDF, 1, "d") 89 | #sJSON <- paste('{"response_code": "ok","', m, '":', toJSON(responseDF[,m]), sep="") #JSON String is not closed. 90 | sJSON <- paste('{"response_code": "ok","chartData":', getRowWiseJson(responseDF), sep="") #JSON String is not closed. 91 | if(!is.null(g) && g == "t") { 92 | latName <- paste(d, ".Lat", sep="") 93 | lngName <- paste(d, ".Lng", sep="") 94 | sJSON <- paste(sJSON, ', "mapsJSON": [', sep="") 95 | for(i in 1:nrow(responseDF)) { 96 | key <- responseDF[,d][i] 97 | #cat(paste ("
", key , ", what?")) 98 | j <- match(key, padDT[,d]) 99 | sJSON <- paste(sJSON, '{"d":"', key, 100 | '", "lat":', 101 | padDT[latName][j,], 102 | ', "lng":', padDT[lngName][j,], 103 | ', "data":', responseDF[i,2], 104 | '}', sep="") 105 | if(i!=nrow(responseDF)) 106 | sJSON <- paste(sJSON, ",", sep="") 107 | } 108 | sJSON <- paste(sJSON, ']', sep="") 109 | } 110 | sJSON = paste(sJSON, "}", sep="") 111 | #Write response back 112 | cat(sJSON) 113 | #cat(ls()) 114 | rm(list=ls()) #Clean up everything - There has to be a better way than this. 115 | #cat(ls()) 116 | } 117 | 118 | #Step 1 - Read Request, query string 119 | readRequest() 120 | 121 | #Step 2 - load environment 122 | padDT <- data.table(loadDS(qds)) 123 | 124 | #Step 3 - Process Request 125 | responseDF <- processRequest(f, padDT, mea=m, dim=d, fvalue=fv) 126 | 127 | #Step 4 - Write Response 128 | writeResponse(responseDF, padDT) 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /learning/DKOMSentimentAnalysis.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | rm (list = ls()) 8 | setwd("~/Toga/Alto") 9 | # Revised Sentiment Analyis using Hu & Liu's library of 6,800 negative and positive words 10 | library("twitteR") 11 | library("plyr") 12 | library("stringr") 13 | library("doBy") 14 | 15 | #Populate the list of sentiment words from Hu and Liu (http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html) 16 | huliu.pwords <- scan('opinion-lexicon/positive-words.txt', what='character', comment.char=';') 17 | huliu.nwords <- scan('opinion-lexicon/negative-words.txt', what='character', comment.char=';') 18 | 19 | # Add some words 20 | huliu.nwords <- c(huliu.nwords,'wtf','wait','waiting','epicfail', 'crash', 'bug', 'bugy', 'bugs', 'slow', 'lie') 21 | #Remove some words 22 | huliu.nwords <- huliu.nwords[!huliu.nwords=='sap'] 23 | huliu.nwords <- huliu.nwords[!huliu.nwords=='cloud'] 24 | #which('sap' %in% huliu.nwords) 25 | 26 | cList <- list(twitterTag=c("#DKOM")) 27 | 28 | getTweets <- function(tag) { 29 | # Get 1500 tweets - an individual is only allowed to get 1500 tweets 30 | return(searchTwitter(tag, n=1500)) 31 | #since=as.character(Sys.Date()-3 - If you want to restrict this to just last few days tweets 32 | } 33 | 34 | 35 | getSentimentScore <- function(tweets) 36 | { 37 | scores <- laply(tweets, function(singleTweet) { 38 | # clean up tweets with R's regex-driven global substitute, gsub() 39 | singleTweet <- gsub('[[:punct:]]', '', singleTweet) 40 | singleTweet <- gsub('[[:cntrl:]]', '', singleTweet) 41 | singleTweet <- gsub('\\d+', '', singleTweet) 42 | #Convert to lower case for comparision, split the tweet into single words and flatten the list 43 | tweetWords <- unlist(str_split(tolower(singleTweet), '\\s+')) 44 | # compare our words to the dictionaries of positive & negative terms 45 | # match() returns the position of the matched term or NA, apply is.na to convert to boolean 46 | pos.matches <- !is.na(match(tweetWords, huliu.pwords)) 47 | neg.matches <- !is.na(match(tweetWords, huliu.nwords)) 48 | # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): 49 | score <- sum(pos.matches) - sum(neg.matches) 50 | return(score) 
51 | }) 52 | return(data.frame(SentimentScore=scores, Tweet=tweets)) 53 | } 54 | 55 | #Step #1 56 | 57 | sentimentScoreDF <- data.frame() 58 | sentimentScoreDF <- ldply(cList$twitterTag, function(twitterTag) { 59 | tweets <- getTweets(twitterTag) 60 | tweets.text <- laply(tweets,function(t)t$getText()) 61 | sentimentScore <- getSentimentScore(tweets.text) 62 | sentimentScore$TwitterTag <- twitterTag 63 | rbind(sentimentScoreDF, sentimentScore) 64 | }) 65 | 66 | # Get rid of tweets that have zero score and seperate +ve from -ve tweets 67 | sentimentScoreDF$posTweets <- as.numeric(sentimentScoreDF$SentimentScore >=1) 68 | sentimentScoreDF$negTweets <- as.numeric(sentimentScoreDF$SentimentScore <=-1) 69 | 70 | #Summarize finidings 71 | summaryDF <- ddply(sentimentScoreDF,"TwitterTag", summarise, 72 | TotalTweetsFetched=length(SentimentScore), 73 | PositiveTweets=sum(posTweets), NegativeTweets=sum(negTweets), 74 | AverageScore=round(mean(SentimentScore),3)) 75 | 76 | summaryDF$TotalTweets <- summaryDF$PositiveTweets + summaryDF$NegativeTweets 77 | 78 | #Get Sentiment Score 79 | summaryDF$Sentiment <- round(summaryDF$PositiveTweets/summaryDF$TotalTweets, 2) 80 | 81 | #Order 82 | orderBy(~-Sentiment,summaryDF) 83 | 84 | write.csv(summaryDF, "DKOM-Results.csv") 85 | 86 | 87 | 88 | 89 | #summary <- ddply(sentimentScoreDF, "TwitterTag", function(df) { MeanScore=round(mean(df$SentimentScore),3)}) 90 | summary <- ddply(sentimentScoreDF, "TwitterTag", summarise, 91 | MeanScore=round(mean(SentimentScore),3), TweetCount=length(SentimentScore)) 92 | 93 | period <- seq(as.Date("2010-01-01"), by="month", length.out=24) 94 | tweets <- llply(period, function(since){ 95 | until <- seq(since, by="month", length.out=2)[2] 96 | print(since) 97 | print(until) 98 | tt <- searchTwitter("#SALESforce", n=1500, since=since, until=until ) 99 | print(length(tt)) 100 | return(length(tt)) 101 | }) 102 | 103 | getAuthorized <- function() { 104 | cred <- OAuthFactory$new(consumerKey="g8sky1SyttNWPsgZOGAA", 105 | consumerSecret="zuJvHtN63yLSYh9Bk2Qc0kd5kGIV9mtzb7r6XPVqq8", 106 | requestURL="https://api.twitter.com/oauth/request_token", 107 | accessURL="https://api.twitter.com/oauth/access_token", 108 | authURL="https://api.twitter.com/oauth/authorize") 109 | cred$handshake() 110 | registerTwitterOAuth(cred) 111 | #3739000 112 | save(cred, file="TwitterAanalyticsForAll.Cred") 113 | } 114 | 115 | loadCred <- function(){ 116 | cred <- load("TwitterAanalyticsForAll.Cred") 117 | cred$handshake() 118 | registerTwitterOAuth(cred) 119 | } 120 | -------------------------------------------------------------------------------- /learning/TwiterSentimentAnalysis.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
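# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original scripts): a tiny worked example
# of the Hu & Liu word-matching score used in the sentiment scripts above
# and below. A three-word lexicon stands in for the full opinion-lexicon
# files so the arithmetic is easy to follow; the sample tweets are invented.
library(stringr)

demo.pwords <- c("good", "great", "love")
demo.nwords <- c("bad", "slow", "crash")

scoreOne <- function(txt) {
  words <- unlist(str_split(tolower(gsub("[[:punct:]]", "", txt)), "\\s+"))
  sum(!is.na(match(words, demo.pwords))) - sum(!is.na(match(words, demo.nwords)))
}

# scoreOne("Great demo, love it")            # "great" + "love"  -> score  2
# scoreOne("App is slow and keeps failing")  # "slow" only       -> score -1
# ---------------------------------------------------------------------------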
6 | 7 | rm (list = ls()) 8 | setwd("~/Toga/Alto") 9 | # Revised Sentiment Analyis using Hu & Liu's library of 6,800 negative and positive words 10 | library("twitteR") 11 | library("plyr") 12 | library("stringr") 13 | library("doBy") 14 | 15 | #Populate the list of sentiment words from Hu and Liu (http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html) 16 | huliu.pwords <- scan('opinion-lexicon/positive-words.txt', what='character', comment.char=';') 17 | huliu.nwords <- scan('opinion-lexicon/negative-words.txt', what='character', comment.char=';') 18 | 19 | # Add some words 20 | huliu.nwords <- c(huliu.nwords,'wtf','wait','waiting','epicfail', 'crash', 'bug', 'bugy', 'bugs', 'slow', 'lie') 21 | #Remove some words 22 | huliu.nwords <- huliu.nwords[!huliu.nwords=='sap'] 23 | huliu.nwords <- huliu.nwords[!huliu.nwords=='cloud'] 24 | #which('sap' %in% huliu.nwords) 25 | 26 | cList <- list(twitterTag=c("@Netflix")) 27 | 28 | getTweets <- function(tag) { 29 | # Get 1500 tweets - an individual is only allowed to get 1500 tweets 30 | return(searchTwitter(tag, n=1500)) 31 | #since=as.character(Sys.Date()-3 - If you want to restrict this to just last few days tweets 32 | } 33 | 34 | 35 | getSentimentScore <- function(tweets) 36 | { 37 | scores <- laply(tweets, function(singleTweet) { 38 | # clean up tweets with R's regex-driven global substitute, gsub() 39 | singleTweet <- gsub('[[:punct:]]', '', singleTweet) 40 | singleTweet <- gsub('[[:cntrl:]]', '', singleTweet) 41 | singleTweet <- gsub('\\d+', '', singleTweet) 42 | #Convert to lower case for comparision, split the tweet into single words and flatten the list 43 | tweetWords <- unlist(str_split(tolower(singleTweet), '\\s+')) 44 | # compare our words to the dictionaries of positive & negative terms 45 | # match() returns the position of the matched term or NA, apply is.na to convert to boolean 46 | pos.matches <- !is.na(match(tweetWords, huliu.pwords)) 47 | neg.matches <- !is.na(match(tweetWords, huliu.nwords)) 48 | # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): 49 | score <- sum(pos.matches) - sum(neg.matches) 50 | return(score) 51 | }) 52 | return(data.frame(SentimentScore=scores, Tweet=tweets)) 53 | } 54 | 55 | #Step #1 56 | 57 | sentimentScoreDF <- data.frame() 58 | sentimentScoreDF <- ldply(cList$twitterTag, function(twitterTag) { 59 | tweets <- getTweets(twitterTag) 60 | tweets.text <- laply(tweets,function(t)t$getText()) 61 | sentimentScore <- getSentimentScore(tweets.text) 62 | sentimentScore$TwitterTag <- twitterTag 63 | rbind(sentimentScoreDF, sentimentScore) 64 | }) 65 | 66 | # Get rid of tweets that have zero score and seperate +ve from -ve tweets 67 | sentimentScoreDF$posTweets <- as.numeric(sentimentScoreDF$SentimentScore >=1) 68 | sentimentScoreDF$negTweets <- as.numeric(sentimentScoreDF$SentimentScore <=-1) 69 | 70 | #Summarize finidings 71 | summaryDF <- ddply(sentimentScoreDF,"TwitterTag", summarise, 72 | TotalTweetsFetched=length(SentimentScore), 73 | PositiveTweets=sum(posTweets), NegativeTweets=sum(negTweets), 74 | AverageScore=round(mean(SentimentScore),3)) 75 | 76 | summaryDF$TotalTweets <- summaryDF$PositiveTweets + summaryDF$NegativeTweets 77 | 78 | #Get Sentiment Score 79 | summaryDF$Sentiment <- round(summaryDF$PositiveTweets/summaryDF$TotalTweets, 2) 80 | 81 | #Order 82 | orderBy(~-Sentiment,summaryDF) 83 | 84 | write.csv(summaryDF, "Netflix-Results.csv") 85 | 86 | 87 | 88 | 89 | #summary <- ddply(sentimentScoreDF, "TwitterTag", function(df) { MeanScore=round(mean(df$SentimentScore),3)}) 90 | 
summary <- ddply(sentimentScoreDF, "TwitterTag", summarise, 91 | MeanScore=round(mean(SentimentScore),3), TweetCount=length(SentimentScore)) 92 | 93 | period <- seq(as.Date("2010-01-01"), by="month", length.out=24) 94 | tweets <- llply(period, function(since){ 95 | until <- seq(since, by="month", length.out=2)[2] 96 | print(since) 97 | print(until) 98 | tt <- searchTwitter("#SALESforce", n=1500, since=since, until=until ) 99 | print(length(tt)) 100 | return(length(tt)) 101 | }) 102 | 103 | getAuthorized <- function() { 104 | cred <- OAuthFactory$new(consumerKey="g8sky1SyttNWPsgZOGAA", 105 | consumerSecret="zuJvHtN63yLSYh9Bk2Qc0kd5kGIV9mtzb7r6XPVqq8", 106 | requestURL="https://api.twitter.com/oauth/request_token", 107 | accessURL="https://api.twitter.com/oauth/access_token", 108 | authURL="https://api.twitter.com/oauth/authorize") 109 | cred$handshake() 110 | registerTwitterOAuth(cred) 111 | #3739000 112 | save(cred, file="TwitterAanalyticsForAll.Cred") 113 | } 114 | 115 | loadCred <- function(){ 116 | cred <- load("TwitterAanalyticsForAll.Cred") 117 | cred$handshake() 118 | registerTwitterOAuth(cred) 119 | } 120 | -------------------------------------------------------------------------------- /learning/Olympics.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | rm (list = ls()) 8 | setwd("~/Toga") 9 | library(XML) 10 | library("chron") 11 | library(plyr) 12 | library(stringr) 13 | library("RJSONIO") 14 | 15 | ############################# 16 | # Olympic Tables 17 | #library(XML) 18 | #theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team" 19 | #tables <- readHTMLTable(theurl) 20 | #n.rows <- unlist(lapply(tables, function(t) dim(t)[1])) 21 | #the picked table is the longest one on the page 22 | #braz <- tables[[which.max(n.rows)]] 23 | #print(braz) 24 | ############################ 25 | fixit <- function(x) { # Fix rowspan issue where there is single rank assing to multiple rows 26 | if(nchar(x[1]) > 3) { # other than rank 27 | for(i in length(x):2) { # Shift items by 1 28 | x[i] = x[i-1] 29 | } 30 | x[1] = -1 #Assign rank column to -1 31 | } 32 | return(x) 33 | } 34 | #tl <- rbind(n, ddply(medals, .(V1), temp)) 35 | 36 | readWikiOT <- function(d) { #Read Wiki olympic medal tables 37 | whichTable <- 1 38 | print(d$year) 39 | if(!(d$year %in% c(1916, 1940, 1944))) { #Code breaks here due to wiki issues 40 | #whichTable <- 3 41 | if(d$year %in% c(1928, 1948, 1972, 1980,1996)) 42 | whichTable <- 2 43 | if(d$year %in% c(1960, 2000, 2004, 2008, 2012)) 44 | whichTable <- 3 45 | print(whichTable) 46 | medals <- NULL 47 | if(d$year == 1992) { # Remove subscript and superscript text 48 | overview <- htmlParse(d$url,encoding="UTF-8") 49 | temp<-getNodeSet(overview, "/*//sup") 50 | removeNodes(temp) 51 | medals <- try(readHTMLTable(overview, header=FALSE, skip.rows=1, which=whichTable, stringsAsFactors = FALSE)) 52 | } else 53 | medals <- try(readHTMLTable(d$url, header=FALSE, skip.rows=1, which=whichTable, stringsAsFactors = FALSE)) 54 | 55 | medals <- medals[-nrow(medals),] 56 | if(d$year < 1992) 57 | medals <- ddply(medals, .(V1), fixit) 58 | medals <- medals[, -c(1)] 59 | #colnames(medals) <- c('Nation', 'Gold', 'Silver', 'Bronze', 'Total') 60 | 
medals <- tm <- with(medals, data.frame( 61 | #Nation=str_trim(sapply(strsplit(str_trim(medals$V2), " \\("), '[', 1)), 62 | #NationCode=sapply(strsplit(str_trim(sapply(strsplit(str_trim(medals$V2), " \\)"), '[', 2)), "\\)"), '[',1), 63 | Country=substr(medals$V2, 1, nchar(medals$V2)-6), 64 | CountryCode=substr(medals$V2, nchar(medals$V2)-3, nchar(medals$V2)-1), 65 | Gold=as.numeric(medals$V3), Silver=as.numeric(medals$V4), Bronze=as.numeric(medals$V5), 66 | Total=as.numeric(medals$V6))) 67 | #medals$Rank <- rank(medals$V3) 68 | #medals <- medals[!c("Rank")] 69 | return (medals) 70 | } 71 | } 72 | 73 | year <- seq(1896, 2012, by=4) 74 | url <- paste("http://en.wikipedia.org/wiki/", year, "_Summer_Olympics_medal_table", sep="") 75 | om <- data.frame(year=year, url=url, stringsAsFactors = FALSE) #Indian movies 76 | omDS <- ddply(om, .(year), readWikiOT) 77 | write.csv(omDS, "SummerOlympic-Medals-1896-2012.csv",row.names=FALSE) 78 | 79 | ### Read Bollywood.csv 80 | in.file.name <- "datasets/BollywoodCinema-1940-2008.csv" 81 | dt <- data.frame(read.csv(in.file.name, na.strings="-", as.is=TRUE, header=TRUE, 82 | stringsAsFactors=FALSE, strip.white=TRUE)) 83 | #Get Unique ID 84 | dsID <- paste("pad", substr(as.character(unclass(Sys.time())), 12,16), sep="") 85 | out.file.csv <- paste("pads/data/",dsID, ".csv", sep="") 86 | out.file.JSON <- paste("pads/meta/",dsID, ".JSON", sep="") 87 | out.file.R.dataframes <- paste("pads/cache/", dsID, ".RData", sep="") 88 | write.csv(dt, out.file.csv) 89 | jsonSt <- paste('{ 90 | "datasetName": "Bollywood Cinema (1940-2008)", 91 | "id":"', dsID, '", 92 | "dList": { 93 | "Year":', toJSON(levels(dt$Year)), ', 94 | "Director":', toJSON(levels(dt$Director)), ' 95 | }, 96 | "mList": { 97 | "Genre1":"", 98 | "Genre2": "" 99 | }, 100 | "dListType": { 101 | }, 102 | "source":"Wikipedia" 103 | }', sep="") 104 | file.out <- file(out.file.JSON, 'wt') 105 | cat(jsonSt, file=file.out, fill = TRUE) 106 | close(file.out) 107 | #Save transformed dataset as data.frame for later reading 108 | assign(dsID, dt) 109 | names <- c(eval(dsID)) 110 | save(list=names, file=out.file.R.dataframes) 111 | pads <- rbind(pads, data.table(id=dsID, title="Bollywood Cinema (1940-2008)", 112 | subtitle="Bollywood Movies from 1940 including director, cast and genre.", 113 | records=nrow(dt), analyzed=0, stories=0, json=out.file.JSON, data=out.file.csv, 114 | cache=out.file.R.dataframes, source="Wiki") 115 | ) 116 | #url <- paste("http://www.votesmart.org/candidate/evaluations/", 1:50 , sep = "") 117 | #res <- llply(url, function(i) readHTMLtable(i)) -------------------------------------------------------------------------------- /GenerateWDI.R: -------------------------------------------------------------------------------- 1 | # Script to read wdi data and create tables in WDI database 2 | # Make sure to convert file WDI_Description.csv to WDI_Description.txt - this file is not needed in the DB 3 | # Take back up if necessary 4 | # Author: Jitender Aswani, Co-Founder @datadolph.in 5 | # Date: 3/15/2013 6 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 7 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 8 | # All rights reserved. 
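# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original scripts): the readHTMLTable()
# pattern from Olympics.R above, condensed into one function. It parses the
# page once, strips <sup> footnote markers, and picks the longest table so
# the per-year table index does not have to be hard-coded. The URL pattern
# and packages are the ones the original uses; year 2008 is just an example.
library(XML)

readMedalTable <- function(year) {
  theurl <- paste0("http://en.wikipedia.org/wiki/", year, "_Summer_Olympics_medal_table")
  page   <- htmlParse(theurl, encoding="UTF-8")
  removeNodes(getNodeSet(page, "//sup"))   # drop footnote superscripts
  tables <- readHTMLTable(page, stringsAsFactors=FALSE)
  n.rows <- sapply(tables, function(t) if (is.null(t)) 0 else nrow(t))
  tables[[which.max(n.rows)]]              # the medal table is the longest one
}

# head(readMedalTable(2008))
# ---------------------------------------------------------------------------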
9 | 10 | setwd("/Users/homemac/R") 11 | 12 | require("RMySQL") 13 | require("plyr") 14 | 15 | source("Util.R") 16 | # 17 | #initiate a connection to wdi database 18 | # 19 | 20 | getWDIDBCon <- function(multi.statements=FALSE) { 21 | if(multi.statements) 22 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost", 23 | client.flag=CLIENT_MULTI_STATEMENTS)) 24 | else 25 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost")) 26 | } 27 | 28 | # 29 | #disconnect a database 30 | # 31 | disconnectDB <- function(con) { 32 | #Close Connection 33 | dbDisconnect(con) 34 | } 35 | 36 | 37 | # 38 | #write a data.frame as a table in WDI database 39 | # this funciton will overwrite the exisiting table 40 | # 41 | 42 | writeTable <- function(table.name ,table.data) { 43 | #List tables and fields in a table: 44 | con <- getWDIDBCon() 45 | on.exit(disconnectDB(con)) # On function exit, close connection 46 | if(dbExistsTable(con, table.name)){ 47 | dbRemoveTable(con, table.name) 48 | dbWriteTable(con, name=table.name, value=table.data, row.names = F, overwrite = T) 49 | } else { 50 | dbWriteTable(con, name=table.name, value=table.data, row.names = F, overwrite = T) 51 | } 52 | 53 | } 54 | 55 | 56 | saveAllWDIData <- function() { 57 | folder.path = "./raw-data/wdi" 58 | filenames <- list.files(path=folder.path, pattern="*.csv") 59 | for(i in filenames) { 60 | filePath <- file.path(folder.path,i) 61 | wdi.name <- unlist(strsplit(i, "\\."))[1] #Temp - use the file name as the title 62 | wdi.df <- readFile(filePath) 63 | #clean up 64 | wdi.df <- wdi.df[rowSums(is.na(wdi.df)) != ncol(wdi.df),] # remove empty rows 65 | wdi.df <- wdi.df[rowSums(is.na(wdi.df)) != ncol(wdi.df)-1,] # remove rows with just one cell filled out 66 | wdi.df <- wdi.df[,colSums(is.na(wdi.df)) != nrow(wdi.df)] # remove empty cols 67 | 68 | #some cells have new line variables, remove it 69 | #char.vars <- setdiff(char.vars, "W") # remove a column 70 | char.vars <- names(wdi.df)[sapply(wdi.df, is.character)] #select character variables 71 | for(i in 1:length(char.vars)) { 72 | wdi.df[char.vars[i]] = sapply(wdi.df[char.vars[i]], function(x) gsub("\\n", " ", x)) 73 | } 74 | 75 | #clean up column names 76 | colnames(wdi.df) <- replaceMetaChars(colnames(wdi.df)) 77 | writeTable(wdi.name, wdi.df) 78 | } 79 | } 80 | 81 | saveAllWDIData() 82 | 83 | 84 | testFunctionDoNoRun <- function(){ 85 | wdi.df <- read.csv("./raw-data/wdi/WDI_Series.csv", stringsAsFactors=F) 86 | wdi.df <- wdi.df[rowSums(is.na(wdi.df)) != ncol(wdi.df),] # remove empty rows 87 | wdi.df <- wdi.df[,colSums(is.na(wdi.df)) != nrow(wdi.df)] # remove empty cols 88 | 89 | char.vars <- names(wdi.df)[sapply(wdi.df, is.character)] #select character variables 90 | for(i in 1:length(char.vars)) { 91 | wdi.df[char.vars[i]] = sapply(wdi.df[char.vars[i]], function(x) gsub("\\n", " ", x)) 92 | } 93 | writeTable("wdi_series", wdi.df) 94 | 95 | 96 | df1 <- as.data.frame(cbind(sapply(df,function(x) { 97 | if(is.character(x)) { 98 | gsub("\\n", " ", x) 99 | } 100 | return(x) 101 | }))) 102 | } 103 | 104 | generateCatsFromWDI <- function() { 105 | wdi.cats <- read.csv("./raw-data/wdi/WDI_DD_Categories.csv", stringsAsFactors=F) 106 | #colnames(wdi.cats) 107 | #unique(wdi.cats$Category) 108 | #unique(wdi.cats$Subcategory) 109 | wdi.cats$Category[wdi.cats$Category=="Economic Policy & Debt"] = "Economics" 110 | wdi.cats$Category[wdi.cats$Category=="Labor & Social Protection"] = "Social" 111 | wdi.cats$Category[wdi.cats$Category=="Public 
Sector"] = "Government" 112 | wdi.cats$Category[wdi.cats$Category=="Private Sector & Trade"] = "Private Sector" 113 | 114 | #write out the cat and subcats for entering into story pads data 115 | dd.cats <- unique(wdi.cats[,c("Category", "Subcategory")]) 116 | colnames(dd.cats) <- tolower(colnames(dd.cats)) 117 | write.csv(dd.cats, "./pads/PadsCategory.csv", row.names=F) 118 | #insertCatSubcat() 119 | con <- getDBCon() 120 | dd.cats <- dbReadTable(con, "pads_categories") 121 | dd.subcats <- dbReadTable(con, "pads_subcategories") 122 | disconnectDB(con) 123 | #Assign category id 124 | wdi.cats$category_id = 0 125 | for(i in 1:nrow(dd.cats)){ 126 | wdi.cats$category_id[which(wdi.cats$Category==dd.cats$category_name[i])] = dd.cats$category_id[i] 127 | } 128 | #trim white spaces from front and end 129 | #wdi.cats$Subcategory <- gsub("^\\s|\\s$","", wdi.cats$Subcategory) 130 | #assign subcat_id 131 | wdi.cats$subcategory_id = 0 132 | for(i in 1:nrow(dd.subcats)){ 133 | wdi.cats$subcategory_id[which(wdi.cats$Subcategory==dd.subcats$subcategory_name[i])] = dd.subcats$subcategory_id[i] 134 | } 135 | colnames(wdi.cats) = c("SeriesCode", "Topic", "category_name", "subcategory_name", "category_id", "subcategory_id") 136 | writeTable("wdi_categories", wdi.cats) 137 | } 138 | -------------------------------------------------------------------------------- /learning/birbal.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | #This version will use data.table in place of PLYR. 8 | #library(plyr) 9 | library(RJSONIO) 10 | library(data.table) 11 | #setwd("/Users/homemac/Toga/Alto") 12 | 13 | #Set content type for output 14 | setContentType("text/html") 15 | #Query Engine - qds - query this database, d=dimension, m=measure, g= geo, f=funciton 16 | qds <- d <- m <- f <- g <- NULL 17 | #d <- "" 18 | #m <- "" 19 | #f <- "" 20 | 21 | #load pre-analyzed and saved dataframe 22 | loadDS <- function(qds) { 23 | if(exists("pads", envir=.GlobalEnv)) { 24 | #lpads <- get("pads", envir=.GlobalEnv) 25 | #cat(exists(pads$sysID[pads$id==qds], envir=.GlobalEnv), "
") 26 | return(get(pads$sysID[pads$id==qds], envir=.GlobalEnv)) #get a local copy in this enviornment 27 | } else { 28 | print("PADS didn't exist. Check to see if you start up R script in RApache ran correctly.") 29 | } 30 | } 31 | 32 | ## Operate on a data.frame, split by "by", by could be a vector or a single filed 33 | ## aggreage by columns in vector on, on could be a single field also 34 | 35 | getResults <- function(dt, byVar, onVar) { 36 | e <- substitute(onVar) 37 | cat(e) 38 | return(dt[,eval(e),by=byVar]) 39 | } 40 | 41 | 42 | ## Operate on a data.frame, split by "by", by could be a vector or a single filed 43 | ## and aggreage by columns in vector on, on could be a single field also 44 | #getCount <- function(df, by, on) { 45 | # return(ddply(df, by, colwise(function(col) {length(unique(col))}, on))) 46 | #} 47 | 48 | getCount<- function(dt, by, on) { 49 | lds <- dt[], by, "nrow") 50 | colnames(lds) <- c(by,on) 51 | return(lds) 52 | } 53 | 54 | #For counting column that has 0 or 1 value :: Very similar to getSum - the answer should be same 55 | getBinaryCount <- function(df, by, on) { 56 | return(ddply(df, by, colwise(function(x) {length(which(x==1))}, on))) 57 | #length(which(mbDS[,"Test.Century"] ==1)) 58 | } 59 | 60 | readRequest <- function() { 61 | #GetString <- str(GET) 62 | #cat(GetString) 63 | qds <<- GET$qds 64 | d <<- GET$d 65 | m <<- GET$m 66 | f <<- GET$f 67 | g <<- GET$g 68 | cat(d, "," ,m, "," ,f, "," ,qds, sep=" ") 69 | #cat("
") 70 | } 71 | 72 | processRequest <- function(fnType, dt){ 73 | return(switch(fnType, 74 | sum = dt[,eval(sum(m, na.rm=TRUE)), by=d] 75 | count = getCount(df, d, m), 76 | 'default' 77 | ) 78 | ) 79 | } 80 | 81 | 82 | library(data.table) 83 | 84 | mbDT <- data.table(mbDS) 85 | 86 | myfunction = function(dt, expr) { 87 | e = substitute(expr) 88 | dt[,eval(e),by=Species] 89 | } 90 | 91 | d <- quote(list(Opposition, Toss)) 92 | 93 | setkey(mbDT,Opposition) 94 | #m <- quote(Test.Runs) 95 | #mbDT[Opposition=="England" ,eval(m)] 96 | #m <- quote(sum(Test.Runs)) 97 | #mbDT[Opposition=="England" ,eval(m)] 98 | 99 | #m <- quote(list(T=sum(Test.Runs))) 100 | #mbDT[Opposition=="England" ,eval(m)] 101 | 102 | #m <- quote(list(Test.Runs=sum(Test.Runs))) 103 | #mbDT[Opposition=="England" ,eval(m)] 104 | 105 | #m <- quote(list(Test.Runs=sum(Test.Runs))) 106 | #mbDT[,eval(m), by=d] 107 | 108 | 109 | ## Operate on a data.frame, split by "by", by could be a vector or a single filed 110 | ## aggreage by columns in vector on, on could be a single field also 111 | getSum <- function(dt, onVar, byVar) { 112 | e <- substitute(onVar) 113 | j <- eval(e) 114 | print(j) 115 | dt[,eval(j),by=byVar] 116 | } 117 | 118 | m <- "Test.Runs" 119 | d<- "Opposition" 120 | #expr <- substitute(list(m=sum(m, na.rm=TRUE))) 121 | expr <- paste("list(",m,"=sum(",m,",na.rm=TRUE))", sep="") 122 | getSum(mbDT, paste("sum(",m,")", sep=""), d) 123 | 124 | 125 | writeResponse <- function(responseDF, df) { 126 | sJSON <- paste('{"', m, '":', toJSON(responseDF[,m]), sep="") #JSON String is not closed. 127 | 128 | if(!is.null(g) && g == "t") { 129 | latName <- paste(m, ".Lat", sep="") 130 | lngName <- paste(m, ".Lng", sep="") 131 | sJSON <- paste(sJSON, ', "responseJSON": [', sep="") 132 | JSON <- with(head(df), paste('{"d":"',responseDF[m] 133 | '", "center": new google.maps.LatLng(', 134 | df[latName][df[m]==responeDF[m]][1], ',', df[lngName][df[m]==responeDF[m]][1], 135 | '), "data":', mbDS$Test.Runs, 136 | '},', sep="")) 137 | 138 | } else 139 | sJSON = paste(sJSON, "}", sep="") 140 | #Write response back 141 | cat(sJSON) 142 | #cat(ls()) 143 | rm(list=ls()) #Clean up everything - There has to be a better way than this. 144 | #cat(ls()) 145 | } 146 | 147 | writeError <- function() { 148 | sJSON <- paste('{ "Error":"An Error Occured :-( "}' , sep="") 149 | } 150 | 151 | #Step 1 - Read Request, query string 152 | readRequest() 153 | 154 | #Step 2 - load environment 155 | df <- loadDS(qds) 156 | 157 | if(!is.null(df)) 158 | { 159 | dt <- data.table(df[order(df[,d]),]) 160 | #Step 3 - Process Request 161 | responseDF <- processRequest(dt) 162 | if(!is.null(responseDF)) { 163 | #Step 4 - Write Response 164 | writeResponse(responseDF) 165 | } 166 | else { 167 | writeError(); 168 | } 169 | } else { 170 | writeError(); 171 | } 172 | 173 | 174 | -------------------------------------------------------------------------------- /learning/TransformMasterBlasterData.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 
6 | 7 | rm (list = ls()) 8 | setwd("~/Toga") 9 | library("chron") 10 | library("plyr") 11 | library("RJSONIO") 12 | #Get Unique ID 13 | dsID <- paste("pad", substr(as.character(unclass(Sys.time())), 12,16), sep="") 14 | out.file.csv <- paste("Alto/pads/data/",dsID, ".csv", sep="") 15 | out.file.JSON <- paste("Alto/pads/meta/",dsID, ".JSON", sep="") 16 | out.file.R.dataframes <- paste("Alto/pads/cache/", dsID, ".RData", sep="") 17 | in.file.name <- "Alto/datasets/masterblaster/SachinTestRecords.csv" 18 | 19 | 20 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December") 21 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday") 22 | 23 | # Use Google's Geocoding service and geo code the data 24 | getGeoCode <- function(gcStr, cnPre) 25 | { 26 | library("RJSONIO") 27 | gcStr <- gsub(' ','%20',gcStr) 28 | connectStr <- paste('http://maps.google.com/maps/api/geocode/json?sensor=false&address=',gcStr, sep="") 29 | #print(connectStr) 30 | con <- url(connectStr) 31 | data.json <- fromJSON(paste(readLines(con), collapse="")) 32 | close(con) 33 | data.json <- unlist(data.json) 34 | lat <- data.json["results.geometry.location.lat"] 35 | lng <- data.json["results.geometry.location.lng"] 36 | gcodes <- c(lat, lng) 37 | names(gcodes) <- c(paste(cnPre,".Lat",sep=""),paste(cnPre, ".Lng", sep="")) 38 | return (gcodes) 39 | } 40 | 41 | # Read SachinTestRecords.csv 42 | readDataSet <- function(file.name) { 43 | return( 44 | read.csv(file.name, na.strings="-", as.is=TRUE, header=TRUE, 45 | stringsAsFactors=FALSE, strip.white=TRUE) 46 | ) 47 | } 48 | 49 | # Transform raw dataset 50 | transformDataSet <- function(df) { 51 | tdf <- mutate(df, 52 | StartDate=as.Date(StartDate, "%m/%d/%Y"), 53 | Day=factor(weekdays(StartDate), levels=lDays, ordered=TRUE), 54 | Month=factor(months(StartDate), levels=lMonths, ordered=TRUE), 55 | Year=years(StartDate), 56 | Ing1_Runs=as.numeric(gsub('\\*', '', Bat1)), 57 | Ing1_NotOut=ifelse(is.na(Bat1), 0, ifelse(grepl("\\*", Bat1),1,0)), 58 | Ing2_Runs=as.numeric(gsub('\\*', '', Bat2)), 59 | Ing2_NotOut=as.factor(ifelse(grepl("\\*", Bat2),1,0)), 60 | Test_Runs=as.numeric(Runs), 61 | Opposition=as.factor(Opposition), 62 | Ground=as.factor(Ground), 63 | Result=as.factor(Result), 64 | Toss=as.factor(Toss), 65 | Country=as.factor(Country), 66 | Ing1_Century=ifelse(is.na(Ing1_Runs), 0, ifelse(Ing1_Runs>=100, 1, 0)), 67 | Ing2_Century=ifelse(is.na(Ing2_Runs), 0, ifelse(Ing2_Runs>=100, 1, 0)), 68 | Test_Century=ifelse((Ing1_Century==1)|(Ing2_Century==1), 1, 0), 69 | Test_DCentury=ifelse((!is.na(Ing1_Runs) & Ing1_Runs >=200)|(!is.na(Ing2_Runs) & Ing2_Runs>=200), 1, 0), 70 | Home_Away=ifelse(!is.na(Country) & Country=="India", "Home", "Away"), 71 | Ground_Country=as.factor(paste(Ground,Country, sep=",")) 72 | ) 73 | head(tdf) 74 | return(with(tdf, data.frame(TestNo, StartDate, Day, Month, 75 | Year, Ing1_Runs, Ing1_NotOut, Ing2_Runs, Ing2_NotOut, Test_Runs, 76 | Opposition, Ground, Toss, Result, 77 | Ing1_Century, Ing2_Century, 78 | Test_Century, Test_DCentury, Home_Away, Country, 79 | Ground_Country, 80 | laply(Opposition, function(x){getGeoCode(x, "Opposition")}), 81 | laply(Ground_Country, function(x){getGeoCode(x, "Ground")})) 82 | )) 83 | } 84 | 85 | writeTransformedDS <- function(df, file.name) { 86 | write.csv(df, file.name) 87 | } 88 | 89 | writeJSON <- function(df, file.JSON, id) { 90 | 91 | jsonSt <- paste('{ 92 | "datasetName": "SachinTestRecords", 93 | "id": "', id, '", 94 | 
"dList": { 95 | "Year":', toJSON(levels(df$Year)), ', 96 | "Month":', toJSON(levels(df$Month)), ', 97 | "Day":', toJSON(levels(df$Day)), ', 98 | "Opposition":', toJSON(levels(df$Opposition)), ', 99 | "Ground":', toJSON(levels(df$Ground)), ', 100 | "Result":', toJSON(levels(df$Result)), ', 101 | "Toss":', toJSON(levels(df$Toss)),', 102 | "HomeOrAway":', toJSON(levels(df$Home_Away)), 103 | '}, 104 | "mList": { 105 | "Ing1_Runs":"", 106 | "Ing2_Runs":"", 107 | "Test_Runs":"", 108 | "Ing1_Century":"", 109 | "Ing2_Century":"", 110 | "Test_Century":"" 111 | }, 112 | "dListType": { 113 | "Opposition": "Geo", 114 | "Ground": "Geo" 115 | }, 116 | "source":"CricInfo" 117 | }') 118 | file.out <- file(file.JSON, 'wt') 119 | cat(jsonSt, file=file.out, fill = TRUE) 120 | close(file.out) 121 | } 122 | 123 | # Read Master Blaster Raw File and Transform it 124 | mbDS <- transformDataSet(readDataSet(in.file.name)) 125 | 126 | #Write Transformed data set to a CSV file 127 | writeTransformedDS(mbDS, out.file.csv) 128 | 129 | #Generate JSON with m and d 130 | writeJSON(mbDS, out.file.JSON, dsID) 131 | 132 | #Save transformed dataset as data.frame for later reading 133 | assign(dsID, mbDS) 134 | names <- c(eval(dsID)) 135 | save(list=names, file=out.file.R.dataframes) 136 | pads <- rbind(pads, data.frame(id=dsID, title="Master Blaster - Sachin Tendulkar(1940-2008)", 137 | subtitle="This data set explores Sachin Tendulkar's glorious test cricket record.", 138 | records=nrow(mbDS), analyzed=0, stories=0, json=out.file.JSON, data=out.file.csv, 139 | cache=out.file.R.dataframes, source="CricInfo") 140 | ) 141 | #save(mbDS, file=out.file.R.dataframes) 142 | -------------------------------------------------------------------------------- /MongoFns.R: -------------------------------------------------------------------------------- 1 | # Interact with MongoDB 2 | # Author: Jitender Aswani, Co-Founder @datadolph.in 3 | # Date: 3/15/2013 4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 6 | # Packages Used: rmongodb 7 | # All rights reserved. 8 | 9 | require("rmongodb") 10 | 11 | mongo.db <- list(user="", pass="", name="ddfin_dev", system.pads="ddfin_dev.system_pads", host="localhost") 12 | 13 | # 14 | #initiate a connection to admin database 15 | # 16 | getDefaultMongoDBCon <- function() { 17 | mongo <- mongo.create() 18 | if (!mongo.is.connected(mongo)) { 19 | print("Unable to connect. Error code:") 20 | print(mongo.get.err(mongo)) 21 | } else 22 | return(mongo) 23 | } 24 | 25 | # 26 | #initiate a connection to a db 27 | # This will create a db if it doesn't exist 28 | # 29 | getMongoDBCon <- function(db.name) { 30 | mongo <- mongo.create(db=db.name) 31 | if (!mongo.is.connected(mongo)) { 32 | cat("Unable to connect. 
Error code:", "\n") 33 | cat(mongo.get.err(mongo), "\n") 34 | } else 35 | return(mongo) 36 | } 37 | 38 | # 39 | #disconnect 40 | # 41 | disconnectMongoDB <- function(mongo) { 42 | #Close Connection 43 | if (mongo.is.connected(mongo)) { 44 | mongo.destroy(mongo) 45 | } 46 | } 47 | 48 | # 49 | #list all databases in Mongo 50 | # 51 | listAllDBs <- function() { 52 | #List dbs 53 | mongo <- getDefaultMongoDBCon() 54 | on.exit(disconnectMongoDB(mongo)) # On function exit, close connection 55 | cat(mongo.get.databases(mongo), "\n") 56 | } 57 | 58 | # 59 | # drop a db 60 | # 61 | dropDB <- function(db.name) { 62 | mongo <- getDefaultMongoDBCon() 63 | on.exit(disconnectMongoDB(mongo)) # On function exit, close connection 64 | cat(mongo.drop.database(mongo, db.name)) 65 | } 66 | 67 | # 68 | #List collections for a given db 69 | # 70 | listAllCollections <- function(db.name) { 71 | mongo <- getDefaultMongoDBCon() 72 | on.exit(disconnectMongoDB(mongo)) # On function exit, close connection 73 | cat(mongo.get.database.collections(mongo, db.name), "\n") 74 | } 75 | #listAllCollections(mongo.db$name) 76 | 77 | # 78 | # drop a collection 79 | # 80 | dropCollection <- function(db.collection){ 81 | mongo <- getDefaultMongoDBCon() 82 | on.exit(disconnectMongoDB(mongo)) 83 | 84 | cat(mongo.drop(mongo, db.collection), "\n") 85 | } 86 | #dropCollection(mongo.db$system.pads) 87 | 88 | # 89 | # empty a collection 90 | # 91 | emptyCollection <- function(db.collection){ 92 | mongo <- getDefaultMongoDBCon() 93 | on.exit(disconnectMongoDB(mongo)) 94 | 95 | cat(mongo.remove(mongo, db.collection), "\n") 96 | } 97 | #emptyCollection(mongo.db$system.pads) 98 | 99 | 100 | # 101 | # insert systemPads 102 | # 103 | insertPadDoc <- function(doc){ 104 | 105 | if(!is.null(mongo)){ 106 | status <- mongo.insert(mongo, mongo.db$system.pads, doc) 107 | if(status){ 108 | if(verbose) cat("pad successfully inserted", "\n") 109 | } else { 110 | err <- mongo.get.last.err(mongo, mongo.db$name) 111 | cat(mongo.get.server.err(mongo), "\n") 112 | cat(mongo.get.server.err.string(mongo), "\n") 113 | } 114 | } else { 115 | cat("Connection is null", "\n") 116 | } 117 | mongo.bson.destroy(doc) 118 | } 119 | 120 | # 121 | # update systemPads 122 | # 123 | updatePadDoc <- function(pad.id, doc){ 124 | if(!is.null(mongo)){ 125 | buf <- mongo.bson.buffer.create() 126 | mongo.bson.buffer.append(buf, "_id", pad.id) 127 | criteria <- mongo.bson.from.buffer(buf) 128 | 129 | status <- mongo.update(mongo, mongo.db$system.pads, criteria, doc) 130 | if(status){ 131 | if(verbose) cat("pad successfully updated", "\n") 132 | } else { 133 | err <- mongo.get.last.err(mongo, mongo.db$name) 134 | cat(mongo.get.server.err(mongo), "\n") 135 | cat(mongo.get.server.err.string(mongo), "\n") 136 | } 137 | } else { 138 | cat("Connection is null", "\n") 139 | } 140 | mongo.bson.destroy(doc) 141 | } 142 | 143 | # 144 | #insert a pad into system_pads 145 | # connection object is supplied 146 | # 147 | insertPadToMongo <- function(pad.id, pad){ 148 | # Insert document to collection 'system_pads' 149 | doc <- mongo.bson.from.list(list( 150 | "_id"=pad.id, 151 | pad=pad)) 152 | 153 | if(verbose) cat("calling insert for pad ", pad.id, "\n") 154 | insertPadDoc(doc) 155 | } 156 | 157 | # 158 | # get a single system pad 159 | # 160 | getSingleSystemPad <- function(padId) { 161 | 162 | mongo <- getDefaultMongoDBCon() 163 | on.exit(disconnectMongoDB(mongo)) 164 | 165 | buf <- mongo.bson.buffer.create() 166 | mongo.bson.buffer.append(buf, "_id", padId) 167 | query <- 
mongo.bson.from.buffer(buf) 168 | 169 | # Find the first 100 records in collection system_pads 170 | cursor <- mongo.find(mongo, mongo.db$system.pads, query) 171 | 172 | # Step though the matching records and display them 173 | while (mongo.cursor.next(cursor)) 174 | l<- mongo.bson.to.list(mongo.cursor.value(cursor)) 175 | #assign("l", l, envir=.GlobalEnv) 176 | #l$pad$default$type = "timeseries" 177 | #updatePadDoc(l$'_id', mongo.bson.from.list(l)) 178 | mongo.cursor.destroy(cursor) 179 | } 180 | #getSingleSystemPad("pad136971841842064") 181 | 182 | # 183 | #get pads from system_pads 184 | # 185 | getSystemPads <- function() { 186 | 187 | mongo <- getDefaultMongoDBCon() 188 | on.exit(disconnectMongoDB(mongo)) 189 | 190 | # Find the first 100 records in collection system_pads 191 | cursor <- mongo.find(mongo, mongo.db$system.pads, limit=100L) 192 | 193 | # Step though the matching records and display them 194 | while (mongo.cursor.next(cursor)) 195 | print(mongo.cursor.value(cursor)) 196 | mongo.cursor.destroy(cursor) 197 | } 198 | #getSystemPads() 199 | 200 | # 201 | #get pads count from system_pads 202 | # 203 | getSystemPadsCount <- function(){ 204 | mongo <- getDefaultMongoDBCon() 205 | on.exit(disconnectMongoDB(mongo)) 206 | 207 | # Find the first 100 records 208 | # in collection people of database test where age == 18 209 | pads.count <- mongo.count(mongo, mongo.db$system.pads) 210 | cat("total pads: ", pads.count, "\n") 211 | } 212 | #getSystemPadsCount() 213 | -------------------------------------------------------------------------------- /r_solr_integration.R: -------------------------------------------------------------------------------- 1 | # Author: Rajani Aswani Co-Founder @datadolph.in 2 | # Author: Jitender Aswani 3 | # Date: 4/15/2013 4 | # Description: R & Solr Integration Using Solr's REST APIs 5 | # Packages Used: RCurl, RJSONIO 6 | # Blog Reference: http://www.r-bloggers.com/updated-sentiment-analysis-and-a-word-cloud-for-netflix-the-r-way/ 7 | # Download 8 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 9 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 10 | # All rights reserved. 
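# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original scripts): a condensed round trip
# through the rmongodb helpers defined in MongoFns.R above -- insert one pad
# document keyed by "_id", then read it back with the same cursor pattern.
# Database and collection names reuse mongo.db; the pad id and payload are
# invented.
library(rmongodb)

demoPadRoundTrip <- function() {
  mongo <- mongo.create(db="ddfin_dev")
  if (!mongo.is.connected(mongo)) return(invisible(NULL))
  on.exit(mongo.destroy(mongo))

  doc <- mongo.bson.from.list(list("_id"="pad-demo-000",
                                   pad=list(title="demo pad", records=42)))
  mongo.insert(mongo, "ddfin_dev.system_pads", doc)

  query  <- mongo.bson.from.list(list("_id"="pad-demo-000"))
  cursor <- mongo.find(mongo, "ddfin_dev.system_pads", query)
  while (mongo.cursor.next(cursor))
    print(mongo.bson.to.list(mongo.cursor.value(cursor)))
  mongo.cursor.destroy(cursor)
}
# ---------------------------------------------------------------------------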
11 | 12 | # 13 | # Load Library 14 | # 15 | require(RCurl) 16 | require(RJSONIO) 17 | 18 | # define solr server address 19 | solrServer <- "http://localhost:8080/solr-dev/" 20 | 21 | # 22 | # get row-wise JSON 23 | # 24 | getRowWiseJson <- function (jsonDT) { 25 | require("RJSONIO") 26 | row.json <- apply(jsonDT, 1, toJSON) 27 | json.st <- paste('[', paste(row.json, collapse=', '), ']') 28 | return (json.st) 29 | } 30 | 31 | getSolrServer <- function() { 32 | return(solrServer) 33 | } 34 | 35 | getUpdateURL <- function(){ 36 | return(paste(getSolrServer(), "update/json?commit=true&wt=json", sep="")) 37 | } 38 | 39 | getCommitURL <- function() { 40 | return(paste(getSolrServer(),"update?commit=true&wt=json", sep="")) 41 | } 42 | 43 | getQueryURL <- function() { 44 | return(paste(getSolrServer(),"select?fl=id&wt=json&indent=true&fq=type:pad&q=", sep="")) 45 | } 46 | 47 | # 48 | # query a field for the text and return docs 49 | # 50 | querySolr <- function(queryText, queryfield="all") { 51 | response <- fromJSON(getURL(paste(getQueryURL(), queryfield, ":", queryText, sep=""))) 52 | if(!response$responseHeader$status) #if 0 53 | return(response$response$docs) 54 | } 55 | 56 | # 57 | # delete all indexes from solr server 58 | # 59 | deleteAllIndexes <-function() { 60 | response <- postForm(getUpdateURL(), 61 | .opts = list(postfields = '{"delete": {"query":"*:*"}}', 62 | httpheader = c('Content-Type' = 'application/json', 63 | Accept = 'application/json'), 64 | ssl.verifypeer=FALSE 65 | ) 66 | ) #end of PostForm 67 | return(fromJSON(response)$responseHeader[1]) 68 | } 69 | 70 | # 71 | # delete all indexes for a document type from solr server 72 | # in this example : type = sports 73 | # 74 | deleteSportsIndexes <-function() { 75 | response <- postForm(getUpdateURL(), 76 | .opts = list(postfields = '{"delete": {"query":"type:sports"}}', 77 | httpheader = c('Content-Type' = 'application/json', 78 | Accept = 'application/json'), 79 | ssl.verifypeer=FALSE 80 | ) 81 | ) #end of PostForm 82 | return(fromJSON(response)$responseHeader[1]) 83 | } 84 | 85 | # 86 | # delete indexes for all baskeball category in sports type from solr server 87 | # in this example : type = sports and category: basketball 88 | # 89 | deleteSportsIndexesForCat <-function(category) { 90 | response <- postForm(getUpdateURL(), 91 | .opts = list(postfields = 92 | paste('{"delete": {"query":"type:sports AND category:', category, '"}}', sep=""), 93 | httpheader = c('Content-Type' = 'application/json', 94 | Accept = 'application/json'), 95 | ssl.verifypeer=FALSE 96 | ) 97 | ) #end of PostForm 98 | return(fromJSON(response)$responseHeader[1]) 99 | } 100 | #deletePadIndexesForCat("baskeball") 101 | 102 | 103 | # 104 | #Post a new document to Solr 105 | # 106 | postDoc <- function(doc) { 107 | solr_update_url <- getUpdateURL() 108 | jsonst <- toJSON(list(doc)) 109 | 110 | response <- postForm(solr_update_url, 111 | .opts = list(postfields = jsonst, 112 | httpheader = c('Content-Type' = 'application/json', 113 | Accept = 'application/json'), 114 | ssl.verifypeer=FALSE 115 | )) #end of PostForm 116 | return(fromJSON(response)$responseHeader[1]) 117 | ########## Commit - only if it doesn't work the other way ############### 118 | #return(fromJSON(getURL(getCommitURL()))) 119 | } 120 | 121 | # 122 | #Post JSON document to Solr 123 | # 124 | postJSON <- function(jsonst) { #pad.meta.data (i.e. 
pmd is sent) 125 | solr_update_url <- getUpdateURL() 126 | 127 | response <- try(postForm(solr_update_url, 128 | .opts = list(postfields = jsonst, 129 | httpheader = c('Content-Type' = 'application/json', 130 | Accept = 'application/json'), 131 | ssl.verifypeer=FALSE 132 | )), silent=T) #end of PostForm 133 | if(class(response) %in% c("try-error")){ 134 | print(response) 135 | } else { 136 | print(response) 137 | return(fromJSON(response)$responseHeader[1]) 138 | } 139 | ########## Commit - only if it doesn't work the other way ############### 140 | #return(fromJSON(getURL(getCommitURL()))) 141 | } 142 | 143 | # 144 | #test it out 145 | # 146 | TestAll <- function() { 147 | status <- postJSON(toJSON(fromJSON(paste(readLines("./pads/solr-docs-IPL-T20.json"), collapse="\n")))) 148 | 149 | if(status) print("An error occurred creating indexes.") 150 | 151 | #delete test 152 | status <- deleteAllIndexes() # Status of 0 means that the operation was successfully completed. 153 | if(status) print("An error occurred while deleting all indexes.") 154 | 155 | #add doc test 156 | status <- postDoc(doc) # Status of 0 means that the operation was successfully completed. 157 | if(status) print("An error occurred while adding a document.") 158 | 159 | 160 | #Query SOlr 161 | docs <- querySolr("baseball", "tags") 162 | print(docs) 163 | 164 | docs <- querySolr("baseball") #if no field is defined, default filed all_text will be searched 165 | print(docs) 166 | } 167 | -------------------------------------------------------------------------------- /GenerateWDIPADS.R: -------------------------------------------------------------------------------- 1 | # Generate PADS from WDI data 2 | # Get series list and the associated meta data from the database 3 | # 4 | # Author: Jitender Aswani, Co-Founder @datadolph.in 5 | # Date: 3/15/2013 6 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 7 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 8 | # All rights reserved. 
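# Illustrative sketch, not part of the original script: padifyWDI() below takes one wide
# wdi_data row (four metadata columns followed by one column per year: X1960, X1961, ...)
# and reshapes it into a two-column date/value series before calling createPAD(). In
# isolation the reshape step behaves like this (toy values; the column names simply
# mirror the wdi_data layout assumed by the code below).
if (FALSE) {
  wide <- data.frame(Country_Name = "India", Country_Code = "IND",
                     Indicator_Name = "Tractors", Indicator_Code = "AG.AGR.TRAC.NO",
                     X1990 = 100, X1991 = 120, stringsAsFactors = FALSE)
  long <- t(wide[, 5:ncol(wide)])                      # year columns become rownames
  long <- data.frame(date  = gsub("X", "", rownames(long)),
                     value = long[, 1],
                     row.names = seq_len(nrow(long)), stringsAsFactors = FALSE)
  long   #   date value
         # 1 1990   100
         # 2 1991   120
}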
9 | 10 | setwd("/Users/homemac/R") 11 | source("CreatePADS.R") 12 | # 13 | #initiate a connection to wdi database 14 | # 15 | getWDIDBCon <- function(multi.statements=FALSE) { 16 | if(multi.statements) 17 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost", 18 | client.flag=CLIENT_MULTI_STATEMENTS)) 19 | else 20 | return(dbConnect(MySQL(), user="wdi", password="wdi", dbname="wdi", host="localhost")) 21 | } 22 | 23 | # 24 | # get data for a given series and country 25 | # 26 | 27 | getSeriesData <- function(series.name, series.country){ 28 | con <- getWDIDBCon() 29 | on.exit(disconnectDB(con)) # On function exit, close connection 30 | 31 | sql <- sprintf("select * from wdi.wdi_data where Indicator_Code = '%s' and Country_Name ='%s';", series.name, series.country) 32 | print(sql) 33 | res <- dbSendQuery(con, sql) 34 | series.data <- fetch(res, n = -1) 35 | dbClearResult(res) 36 | if(!is.null(series.data) && ncol(series.data) > 4) 37 | return(series.data[,5:ncol(series.data)]) 38 | else 39 | return(NULL) 40 | } 41 | #series.data <- getSeriesData('AG.AGR.TRAC.NO', 'C') 42 | 43 | 44 | padifyWDI <- function(){ 45 | pads <- getSystemPads() 46 | pads.cat <- getCategories() 47 | pads.subcat <- getSubcategories() 48 | solr.index.docs <- list() 49 | con <- getWDIDBCon() 50 | #read wdi data 51 | wdi.data <- dbReadTable(con, "wdi_data") 52 | #read sereis meta data - first four columns only 53 | wdi.series <- dbReadTable(con, "wdi_series")[,1:4] 54 | #read country meta data - only two columns are needed for the time being 55 | wdi.cats <- dbReadTable(con, "wdi_categories") 56 | 57 | series <- list() 58 | series["source"] <- "World Bank" 59 | 60 | for(i in 1:nrow(wdi.series)){ 61 | if(i > 39 && i < 46){ 62 | series["name"] <- wdi.series$SeriesCode[i] 63 | series["title"] <- wdi.series$Indicator_Name[i] 64 | series["longdef"] <- wdi.series$Long_definition[i] 65 | series["shortdef"] <- wdi.series$Short_definition[i] 66 | wdi.cats.matched <- wdi.cats[wdi.cats$SeriesCode==series$name,] 67 | series["category"] <- wdi.cats.matched$category_name 68 | series["subcategory"] <- wdi.cats.matched$subcategory_name 69 | series["category_id"]<- wdi.cats.matched$category_id 70 | series["subcategory_id"]<- wdi.cats.matched$subcategory_id 71 | # 72 | logMessage(paste("Starting loading series", i, series$name, sep=":")) 73 | print(paste("Starting loading series", i, series$name, sep=":")) 74 | 75 | #don't continue if number of na are greather than 1/5th of number of cols 76 | all.series.data <- wdi.data[wdi.data$Indicator_Code==series$name,] 77 | for(j in 1:nrow(all.series.data)){ 78 | series.data <- all.series.data[j,] 79 | if(!is.null(series.data) && 80 | ncol(series.data) > 4 && 81 | sum(is.na(series.data)) < .9 * ncol(series.data)) { 82 | 83 | series["country"] <- series.data$Country_Name 84 | 85 | logMessage(paste("Starting country", series$country, sep=":")) 86 | print(paste("Starting country", series$country, sep=":")) 87 | 88 | 89 | series.data <- t(series.data[,5:ncol(series.data)]) 90 | #generate data.frame 91 | series.data <- data.frame(gsub("X", "", rownames(series.data)), 92 | series.data[,1], 93 | row.names=seq(1:nrow(series.data)), stringsAsFactors=FALSE) 94 | #change colnames 95 | colnames(series.data) <- c("date", paste(series$name, series$country, sep=".")) 96 | 97 | #Remove NA from series column 98 | series.data <- series.data[!is.na(series.data[2]),] 99 | #Create pad and get pad meta data 100 | pmd <- createPAD(series.data, paste(series$title, series$country, sep=", 
"), 101 | series$longdef, series$category, series$subcategory, 102 | series$source, paste(series$name, series$country, sep="."), 103 | tolower(paste(series$category, series$subcategory, series$source, series$country, sep=","))) 104 | 105 | 106 | pads <- rbind(pads, list(pmd$id, pmd$title, pmd$desc, pmd$records, pmd$columns, series$category_id, 107 | series$subcategory_id, pmd$analyzed, pmd$stories, pmd$src, pmd$src_file, "", "", pmd$tags)) 108 | # 109 | #generate solr doc 110 | # 111 | doc <- list("type"="pad", "id" = pmd$id, "title" = pmd$title, "desc"=pmd$desc, "category"=pmd$category, 112 | "subcategory"=pmd$subcategory, "tags"=pmd$tags) 113 | solr.index.docs[[length(solr.index.docs)+1]] <- doc 114 | 115 | } else { 116 | logMessage(paste(i, series$name, "was empty for ", series$country, sep=":")) 117 | print("This series was empty.") 118 | } 119 | }# inner for loop ends here 120 | } 121 | }# for loop for series ends here 122 | 123 | # now save solr doc 124 | generateSolrDocFromAllDocs(solr.index.docs) 125 | 126 | # Save pads to a csv file 127 | saveData(pads, "./pads/system_pads.csv") 128 | 129 | #now save pads to db 130 | scon <- getDBCon() 131 | dbRemoveTable(scon, "system_pads") 132 | dbWriteTable(scon, name="system_pads", value=pads, row.names = F, overwrite = T) 133 | disconnectDB(scon) 134 | 135 | dumpLogs() 136 | } 137 | 138 | padifyWDI() 139 | 140 | 141 | #use extreme caution when running this funciton - it will remove all meta, cahce, data and empty system_pads table 142 | deleteEverything <- function(){ 143 | unlink("./pads/meta/*.*") 144 | unlink("./pads/cache/*.*") 145 | unlink("./pads/data/*.*") 146 | emptySystemPads() 147 | unlink("./pads/solr-docs.json") 148 | } 149 | #read wdi data 150 | #wdi.data <- dbReadTable(getWDIDBCon(), "wdi_data")[1:10,] 151 | #wdi.data.i <- t(wdi.data) 152 | -------------------------------------------------------------------------------- /PresedentialElectionsResults.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 2012-9-9 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # Description: Financial Markets and President Elect: Do Financial Markets Favor A Republican Over A Democrat? 6 | # Packages Used: RCurl, RJSONIO 7 | # Blog Reference: http://allthingsbusinessanalytics.blogspot.com/2012/11/financial-markets-and-president-elect.html 8 | # All rights reserved. 
9 | 10 | require(XML) 11 | require(plyr) 12 | require(quantmod) 13 | 14 | setwd("C:/mydir") 15 | 16 | ####################################################################### 17 | # Get SPY data for US election dates 18 | ####################################################################### 19 | 20 | #Presedential elections are first tuesday after first monday every 4 years 21 | election.years <- seq(1952,2012, by=4) 22 | election.months <- seq(as.Date("1952/11/1"), by="4 year", length.out=16) 23 | election.dates <- election.months + ((9-as.POSIXlt(election.months)$wday) %% 7) 24 | election.dates.1dl <- election.dates + 1 25 | 26 | getSymbols("^GSPC",src="yahoo", from="1950-1-1") 27 | colnames(GSPC) <- c("Open", "High", "LOW", "Close", "Volume", "Adjusted.Close") 28 | write.zoo(GSPC, file="pads/data/financials/SP500_historical_OHLC_data.csv", sep=",") 29 | 30 | #Generate Monthly Returns for 2008 and save them as a file 31 | write.zoo(monthlyReturn(GSPC["2008"]), file="pads/data/financials/SP500_2008_monthly_returns.csv", sep=",") 32 | 33 | 34 | # only concerned about close 35 | SPY.close <- Ad(GSPC) 36 | 37 | #system("cat pads/data/financials/SP500_historical_OHLC_data.csv") 38 | spy.daily.returns <- dailyReturn(SPY.close) 39 | #one.day.return <- subset(spy.daily.returns, index(spy.daily.returns) %in% election.dates.1daylater) 40 | #one.day.return$year <- format(index(one.day.return), "%Y") 41 | 42 | getQuote <- function(year, SPY) 43 | { 44 | #Determine election date 45 | election.month <- as.Date(paste(year, "11/1", sep="/")) 46 | election.date <- election.month + ((9-as.POSIXlt(election.month)$wday) %% 7) 47 | #election.date <- election.dates[which(format(election.dates, "%Y")==year)] 48 | 49 | #Get dates 1-day, 1-week, 4-weeks, 12-weeks and 52-weeks afte the election day 50 | quote.dates <- c(election.date, election.date+1, election.date+7, 51 | election.date+(4*7), election.date+(12*7), election.date+(52*7), election.date+(50*7*4)) 52 | if(year == 2008) 53 | quote.dates[length(quote.dates)] <- Sys.Date() 54 | 55 | # Check to see if the dates exist in SPY index, if not advance or lag by 1 day 56 | quote.dates <- as.Date(sapply(quote.dates, function(x) 57 | if(length(SPY.close[x])){return(as.Date(x))}else{ return(as.Date(x)-1)})) 58 | 59 | #Get the quotes for all these dates 60 | quotes <- SPY[quote.dates] 61 | returns <- sapply(quote.dates, 62 | function(x) (as.numeric(quotes[x])/as.numeric(quotes[quote.dates[1]]) - 1)*100) 63 | r <- list() 64 | return(r[[year]]<-returns) 65 | } 66 | 67 | #Remove 2012 from this vector 68 | election.years <- election.years[!election.years==2012] 69 | returns <- do.call(cbind, llply(election.years, function(x) getQuote(x, SPY.close))) 70 | colnames(returns) <- election.years 71 | #Remove first row 72 | returns <- returns[-1,] 73 | t.returns <- t(returns) 74 | old.rownames <- rownames(t.returns) 75 | obama.1d.return <- as.numeric(spy.daily.returns["2012-11-07", 1]) * 100 76 | t.returns <- rbind(t.returns, c(obama.1d.return, NA, NA, NA, NA, NA)) 77 | rownames(t.returns) <- c(old.rownames, 2012) 78 | colnames(t.returns) <- c("1_day_return", "1_week_return", "4_weeks_return", "12_weeks_return", 79 | "52_weeks_return", "term_return") 80 | write.csv(t.returns, "pads/data/history/SP500 returns after US president elections_1.csv", row.names = TRUE) 81 | 82 | ####################################################################### 83 | # Get Presidential Names 84 | ####################################################################### 85 | 86 | wiki <- 
"http://en.wikipedia.org/wiki/United_States_presidential_election" 87 | tables <- readHTMLTable(wiki, stringsAsFactors=FALSE) 88 | #print(names(tables)) 89 | wiki.table <- tables[[4]] 90 | #Create a new column by removing all the special chars from the winner column 91 | wiki.table$sub <- gsub("\\*", "", wiki.table$Winner) 92 | #Replace all meta chars 93 | #data$sub <- gsub("[^A-Za-z0-9 ]+", ",", data$sub) 94 | wiki.table$sub <- gsub("\\(|\\)", ",", wiki.table$sub) 95 | #Now split and create a new dataframe 96 | us.prez.elect <- data.frame(do.call("rbind", strsplit(wiki.table$sub, ",", fixed=TRUE))) 97 | drops <- c("X3","X4") 98 | us.prez.elect <- us.prez.elect[,!(names(us.prez.elect) %in% drops)] 99 | election_years <- gsub("[^A-Za-z0-9]+", "", wiki.table$"Election year") 100 | election_years[1] <- 1788 101 | election_years[20] <- 1864 102 | colnames(us.prez.elect) <- c("president", "party") 103 | rownames(us.prez.elect) <- election_years 104 | election.months <- seq(as.Date("1788/11/1"), by="4 year", length.out=(2016-1788)/4) 105 | election.dates <- election.months + ((9-as.POSIXlt(election.months)$wday) %% 7) 106 | us.prez.elect$election_date <- as.Date(election.dates) 107 | write.csv(us.prez.elect, "pads/data/history/US president elects.csv", row.names = TRUE) 108 | us.prez.elect.since.1950 <- subset(us.prez.elect, rownames(us.prez.elect) %in% election.years) 109 | 110 | # 111 | # Merge the two data frames 112 | # 113 | new.col.names <- c('election_year', colnames(us.prez.elect.since.1950), colnames(t.returns)) 114 | prez.data <- merge(us.prez.elect.since.1950, t.returns, by=0) 115 | colnames(prez.data) <- new.col.names 116 | write.csv(prez.data, "pads/data/history/US president elects and SP500 returns.csv", row.names = FALSE) 117 | 118 | 119 | #all.dates <- seq(as.Date("1952-1-1"), Sys.Date(), by="day") 120 | #election.year.dates <- subset(all.dates, years(all.dates) %in% election.years) 121 | #election.months <- subset(election.year.dates, months(election.year.dates)=="November") 122 | #elections.months.days <- subset(election.months, weekdays(election.months) %in% c("Monday", "Tuesday")) 123 | #df <- data.frame(date=elections.months.days, day=weekdays(elections.months.days)) 124 | #election.years <- seq(as.Date("1952/1/1"), by="4 year", length.out=16) 125 | #http://statistics.berkeley.edu/classes/s133/dates.html 126 | #sapply(quote.dates, function(x) if(length(SPY.close[x])){cat("True\n")}else{ cat("False\n")}) 127 | #x <- SPY.close[paste(election.date, election.date+(52*7), sep="/")] 128 | #d <- dailyReturn(x) 129 | #w <- weeklyReturn(x) 130 | #keeps <- c("y","a") 131 | #DF[keeps] 132 | -------------------------------------------------------------------------------- /SentimentAnalysis.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 2012-30-1 3 | # Description: Extracts tweets from twitter and run sentiment analysis on using list of sentiment words from Hu and Liu 4 | # Packages Used: RCurl, XML, TwitteR, RJSONIO 5 | # Blog Reference: http://www.r-bloggers.com/updated-sentiment-analysis-and-a-word-cloud-for-netflix-the-r-way/ 6 | # Download 7 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 8 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 9 | # All rights reserved. 
10 | # Revised Sentiment Analyis using Hu & Liu's library of 6,800 negative and positive words 11 | 12 | #Populate the list of sentiment words from Hu and Liu (http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html) 13 | huliu.pwords <- scan(paste(SRC_DIR, 'opinion-lexicon/positive-words.txt', sep=""), what='character', comment.char=';') 14 | huliu.nwords <- scan(paste(SRC_DIR,'opinion-lexicon/negative-words.txt', sep=""), what='character', comment.char=';') 15 | # Add some extra words 16 | huliu.nwords <- c(huliu.nwords,'wtf','wait','waiting','epicfail', 'crash', 'bug', 'bugy', 'bugs', 'slow', 'lie') 17 | #Remove some words for example sap and Cloud 18 | huliu.nwords <- huliu.nwords[!huliu.nwords=='sap'] 19 | huliu.nwords <- huliu.nwords[!huliu.nwords=='cloud'] 20 | 21 | # 22 | # clean up a large character string 23 | # 24 | cleanText <- function(x) { 25 | # tolower 26 | x = tolower(x) 27 | # remove rt 28 | #x = gsub("rt", "", x) 29 | # remove at 30 | x = gsub("@\\w+", "", x) 31 | # remove punctuation 32 | x = gsub("[[:punct:]]", "", x) 33 | # remove control characters 34 | x <- gsub('[[:cntrl:]]', '', x) 35 | # remove numbers 36 | x = gsub("[[:digit:]]", "", x) 37 | # remove links http 38 | x = gsub("http\\w+", "", x) 39 | # remove tabs 40 | x = gsub("[ |\t]{2,}", " ", x) 41 | # remove new lines 42 | x = gsub("[ |\n]{1,}", " ", x) 43 | # remove blank spaces at the beginning 44 | x = gsub("^ ", "", x) 45 | # remove blank spaces at the end 46 | x = gsub(" $", "", x) 47 | return(x) 48 | } 49 | 50 | # 51 | # clean up tweets or any other doucment for corpus and sentiment analysis 52 | # 53 | cleanContent <- function(content){ 54 | # clean out non-ASCII characters, remove numbers, puncuations, stop words 55 | content <- sapply(content, function(x) iconv(x, "latin1", "ASCII", sub="")) 56 | content <- cleanText(content) # clean up 57 | # remove stop-words 58 | content <- removeWords(content, 59 | c(stopwords("english"), "twitter", "wikipedia")) 60 | return(content) 61 | } 62 | # 63 | # build a generic tag cloud 64 | # 65 | buildTagCloud <- function (content, word.threshold=2){ 66 | #cleanup 67 | content <- cleanContent(content) 68 | # make corpus for text mining 69 | content.corpus <- Corpus(VectorSource(content)) 70 | 71 | #build a term document 72 | #content.dtm <- TermDocumentMatrix(content.corpus, control = list(stopwords = TRUE, minWordLength = 5)) 73 | content.dtm <- TermDocumentMatrix(content.corpus, control = list(minWordLength = 5)) 74 | # get a matrix 75 | content.m = as.matrix(content.dtm) 76 | # get word counts in decreasing order 77 | content.words <- sort(rowSums(content.m), decreasing=TRUE) 78 | # create a data frame with words and their frequencies 79 | content.df = data.frame(text=names(content.words), size=content.words) 80 | #write.csv(content.words, "company-word-tag.csv", row.names=F) 81 | return( content.df[content.df$size > word.threshold,]) 82 | } 83 | 84 | # 85 | # get sentiment score for each tweet 86 | # 87 | getSentimentScore <- function(tweets) { 88 | scores <- laply(tweets, function(singleTweet) { 89 | tweetWords <- unlist(str_split(tolower(singleTweet), '\\s+')) 90 | # compare our words to the dictionaries of positive & negative terms 91 | # match() returns the position of the matched term or NA, apply is.na to convert to boolean 92 | pos.matches <- !is.na(match(tweetWords, huliu.pwords)) 93 | neg.matches <- !is.na(match(tweetWords, huliu.nwords)) 94 | # and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum(): 95 | score <- sum(pos.matches) - 
sum(neg.matches) 96 | return(score) 97 | }) 98 | return(data.frame(SentimentScore=scores, Tweet=tweets)) 99 | } 100 | 101 | # 102 | #perform sentiment analysis 103 | # 104 | performSentimentAnalysis <- function(tweets){ 105 | # 106 | # perform twitter sentiment analysis for each tweet 107 | # 108 | tweets <- cleanContent(tweets) 109 | # call getSentiment score 110 | ss <- getSentimentScore(tweets) 111 | 112 | # Get rid of tweets that have zero score and seperate +ve from -ve tweets 113 | ss$posTweets <- as.numeric(ss$SentimentScore >=1) 114 | ss$negTweets <- as.numeric(ss$SentimentScore <=-1) 115 | 116 | # Let's summarize now 117 | summary <- list(TweetsFetched=length(ss$SentimentScore), 118 | PositiveTweets=sum(ss$posTweets), NegativeTweets=sum(ss$negTweets), 119 | AverageScore=round(mean(ss$SentimentScore),3)) 120 | 121 | # some tweets have no score - positive offsets negative - so the next line is necessary 122 | summary$TweetsWithScore <- summary$PositiveTweets + summary$NegativeTweets 123 | 124 | #Get Sentiment Score 125 | summary$SentimentScore <- round(summary$PositiveTweets/summary$TweetsWithScore, 2) 126 | return(summary) 127 | } 128 | 129 | # 130 | #search twitter using hash tags 131 | # 132 | searchTwitterHashtag <- function(tw.hashtag, certificate.path, how.many=300, what.lang="en") { 133 | tweets <- try(searchTwitter(tw.hashtag, lang=what.lang, n=how.many, cainfo=certificate.path), silent=T) 134 | if("try-error" %in% class(tweets)) 135 | return(data.frame(error="Oops an error occurred")) 136 | return(tweets) 137 | } 138 | 139 | #search twitter using handle 140 | searchTwitterHandle <- function(tw.handle, certificate.path, how.many=300) { 141 | tweets <- try(userTimeline(tw.handle, n=how.many, cainfo=certificate.path), silent=T) 142 | if("try-error" %in% class(tweets)) 143 | return(list(error="Oops an error occurred")) 144 | return(tweets) 145 | } 146 | 147 | #download ca cert file 148 | downloadCACertFile <- function(){ 149 | download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem") 150 | } 151 | 152 | # Certain words may not be relevant for your secenario to be counted as positive and negative on Hu and Liu list 153 | # Remove them before sending the words to this list 154 | # Fitler words 155 | filterWords <- function(words){ 156 | if(length(which(words %in% filter.words)) > 0) 157 | words <- words[-which(words %in% filter.words)] 158 | return(words) 159 | } 160 | 161 | # example 162 | #get tweets and perform sentiment analysis 163 | tweets <- searchTwitterHandle(twitter.handle, CERTIFICATE_PATH, 300) 164 | summary <- performSentimentAnalysis(tweets) 165 | 166 | -------------------------------------------------------------------------------- /GenerateCrimePADS.R: -------------------------------------------------------------------------------- 1 | # Generate crime PADS 2 | # Author: Jitender Aswani, Co-Founder @datadolph.in 3 | # Date: 3/15/2013 4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 6 | # All rights reserved. 
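# Illustrative sketch, not part of the original script: several of the source tables read
# below store shares as strings such as "47%"; generate() strips the sign and coerces to
# numeric before padifying, e.g.:
if (FALSE) {
  shares <- c("47%", "12%", "3%")
  as.numeric(gsub("%", "", shares, fixed = TRUE))   # 47 12 3
}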
7 | 8 | source("CreatePADS.R") 9 | verbose <- T 10 | # 11 | # generate 12 | # 13 | generate <- function() { 14 | #initialize system 15 | initializeSystem() 16 | assign("crime.folder.path", "./pads/raw-data/crime", envir=.GlobalEnv) 17 | assign("dataset", "US-Crime", envir=.GlobalEnv) 18 | 19 | #prepare pad meta data 20 | series <- list() 21 | series["source"] <- " BJS, FBI, Uniform Crime Reports" 22 | series["category"] <- "Social" 23 | series["subcategory"] <- "US Crime Data" 24 | series["tags"] <- tolower(paste(series$category, series$subcategory, series$source, "usdoj, Supplementary Homicide Report, USA", sep=",")) 25 | series["desc"] <- "Homicide Trends in United States: Data are based on annual estimates of homicide from previously published versions of Crime in the United States. Data for 1989 to 2008 reflect updated homicide estimates from Crime in the United States, 2008. Data for 2009 and 2010 reflect updated homicide estimates from Crime in the United States, 2010." 26 | series["pagetag"] <- "crunchbase" 27 | 28 | series.data <- read.csv(paste(crime.folder.path, "1.csv", sep="/"), stringsAsFactors=F) 29 | 30 | series["title"] <- "Homicide Rate per 100,000 Population (1950-2008)" 31 | padify(series, series.data[c(1,2)]) 32 | 33 | series["title"] <- "Homicide Incidents per 100,000 Population (1950-2008)" 34 | padify(series, series.data[c(1,3)]) 35 | 36 | series["title"] <- "Homicide Rate & Incidents per 100,000 Population (1950-2008)" 37 | padify(series, series.data) 38 | 39 | series.data <- read.csv(paste(crime.folder.path, "2.csv", sep="/"), stringsAsFactors=F) 40 | series.data[2:4] <- data.frame(apply(series.data[2:4], 2, function(x) as.numeric(gsub("%", "", x, fixed=T)))) 41 | series["title"] <- "Victims and Offenders by Age in USA, Rate per 100,000 (1980-2008)" 42 | padify(series, series.data) 43 | 44 | series.data <- read.csv(paste(crime.folder.path, "3.csv", sep="/"), stringsAsFactors=F) 45 | series.data[2:4] <- data.frame(apply(series.data[2:4], 2, function(x) as.numeric(gsub("%", "", x, fixed=T)))) 46 | series["title"] <- "Victims and Offenders by Gender in USA (1980-2008)" 47 | padify(series, series.data) 48 | 49 | series.data <- read.csv(paste(crime.folder.path, "4.csv", sep="/"), stringsAsFactors=F) 50 | series.data[2:4] <- data.frame(apply(series.data[2:4], 2, function(x) as.numeric(gsub("%", "", x, fixed=T)))) 51 | series["title"] <- "Victims and Offenders by Race in USA (1980-2008)" 52 | padify(series, series.data) 53 | 54 | series.data <- read.csv(paste(crime.folder.path, "2-1.csv", sep="/"), stringsAsFactors=F) 55 | series["title"] <- "Victims and Offenders by Age in USA, Rate per 100,000, (1980-2008)" 56 | padify(series, series.data) 57 | 58 | series.data <- read.csv(paste(crime.folder.path, "3-1.csv", sep="/"), stringsAsFactors=F) 59 | series["title"] <- "Victims and Offenders by Gender in USA, Rate per 100,000 (1980-2008)" 60 | padify(series, series.data) 61 | 62 | series.data <- read.csv(paste(crime.folder.path, "4-1.csv", sep="/"), stringsAsFactors=F) 63 | series["title"] <- "Victims and Offenders by Race in USA, Rate per 100,000 (1980-2008)" 64 | padify(series, series.data) 65 | 66 | # Homicide victimization rates by age, (1980-2008) 67 | series.data <- read.csv(paste(crime.folder.path, "5.csv", sep="/"), stringsAsFactors=F) 68 | title <- "Homicide Victimization Rates By Age" 69 | period <- "(1980-2008)" 70 | colnames(series.data) <- gsub("X", "ages", colnames(series.data)) 71 | col.names <- colnames(series.data) 72 | for(i in 2:ncol(series.data)) 73 | { 74 | 
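  # Each pass pads the first column (the time axis) together with one age-band column;
  # a combined pad covering all ages is created right after the loop.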
series["title"] <- paste(title, ", ", col.names[i], period, sep="") 75 | print(i) 76 | padify(series, series.data[c(1,i)]) 77 | } 78 | series["title"] <- "Homicide Victimization Rates for All Ages (1980-2008)" 79 | padify(series, series.data) 80 | 81 | 82 | # Homicide offending rates by age, 1980-2008 83 | series.data <- read.csv(paste(crime.folder.path, "5-1.csv", sep="/"), stringsAsFactors=F) 84 | title <- "Homicide Offending Rates By Age" 85 | period <- "(1980-2008)" 86 | colnames(series.data) <- gsub("X", "ages", colnames(series.data)) 87 | col.names <- colnames(series.data) 88 | for(i in 2:ncol(series.data)) 89 | { 90 | series["title"] <- paste(title, ", ", col.names[i], period, sep="") 91 | print(i) 92 | padify(series, series.data[c(1,i)]) 93 | } 94 | series["title"] <- "Homicide Offending Rates for All Ages (1980-2008)" 95 | padify(series, series.data) 96 | 97 | #Average age of homicide victims and offenders, 1980-2008 98 | series.data <- read.csv(paste(crime.folder.path, "5-2.csv", sep="/"), stringsAsFactors=F) 99 | title <- "Average Age of Homicide" 100 | period <- "(1980-2008)" 101 | colnames(series.data) <- gsub("X", "ages", colnames(series.data)) 102 | col.names <- colnames(series.data) 103 | for(i in 2:ncol(series.data)) 104 | { 105 | series["title"] <- paste(title, col.names[i], period, sep=" ") 106 | print(i) 107 | padify(series, series.data[c(1,i)]) 108 | } 109 | series["title"] <- "Average Age of Homicide Victims and Offenders (1980-2008)" 110 | padify(series, series.data) 111 | 112 | #Figure 6. Percent of homicides in which offender was known to victim by age of victim, 1980-2008 113 | series.data <- read.csv(paste(crime.folder.path, "11.csv", sep="/"), stringsAsFactors=F) 114 | title <- "Percent of Homicides in Which Offender was Known to Victim (1980-2008)" 115 | padify(series, series.data) 116 | 117 | #Figure 7. Number of homicides of children under age 5, by race of victim, 1980-2008 118 | #*Other race includes American Indians, Alaska Natives, Asians, Hawaiians, and other Pacific Islanders. 119 | series.data <- read.csv(paste(crime.folder.path, "12.csv", sep="/"), stringsAsFactors=F) 120 | title <- "Number of homicides of children under age 5, by race of victim" 121 | period <- "(1980-2008)" 122 | col.names <- colnames(series.data) 123 | for(i in 2:ncol(series.data)) 124 | { 125 | series["title"] <- paste(title, "-", col.names[i], period, sep=" ") 126 | padify(series, series.data[c(1,i)]) 127 | } 128 | series["title"] <- "Number of homicides of children under age 5, by race of victim (1980-2008)" 129 | padify(series, series.data) 130 | 131 | #Figure 8. 
Homicide victimization rates for children under age 5 by race of victim, 1980-2008 132 | #Rate per 100,000 population Rate per 100,000 population 133 | series.data <- read.csv(paste(crime.folder.path, "12-1.csv", sep="/"), stringsAsFactors=F) 134 | title <- "Rate of homicides of children under age 5, by race of victim" 135 | period <- "(1980-2008)" 136 | col.names <- colnames(series.data) 137 | for(i in 2:ncol(series.data)) 138 | { 139 | series["title"] <- paste(title, "-", col.names[i], "per 100,000 population", period, sep=" ") 140 | padify(series, series.data[c(1,i)]) 141 | } 142 | series["title"] <- "Rate of homicides of children under age 5, by race of victim, per 100,000 population (1980-2008)" 143 | padify(series, series.data) 144 | 145 | #clean up 146 | cleaupSystem() 147 | updateCatPadCount() 148 | } 149 | generate() 150 | -------------------------------------------------------------------------------- /FetchData/FetchIPLData.R: -------------------------------------------------------------------------------- 1 | # Author: Jitender Aswani, Co-Founder @datadolph.in 2 | # Date: 3/15/2013 3 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 4 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 5 | # All rights reserved. 6 | 7 | #http://www.wisdenrecords.com/Records/India/Test/Bowling/Most_Wickets_in_Career.html 8 | #http://www.wisdenrecords.com/Records/India/Test/Batting/Most_Career_Runs.html 9 | #http://www.wisdenrecords.com/Records/Pakistan/Test/Batting/Most_Career_Runs.html 10 | #http://www.wisdenrecords.com/Records/India/Test/Batting/Most_Career_Runs.html 11 | require("RJSONIO") 12 | require("plyr") 13 | library("RCurl") 14 | 15 | init <- function() { 16 | assign("folder.path", "./pads/raw-data/cricket/", envir=.GlobalEnv) 17 | assign("years", 2008:2013, envir=.GlobalEnv) 18 | assign("ipl.base.url", 'http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl', envir=.GlobalEnv) 19 | assign("ipl.end.url.runs", '/stats/player/full/mostRuns.js', envir=.GlobalEnv) 20 | assign("ipl.end.url.wickets", "/stats/player/full/mostWickets.js", envir=.GlobalEnv) 21 | assign("ipl.end.url.standings", "/groupStandings.js", envir=.GlobalEnv) 22 | } 23 | 24 | # Save the JSON 25 | saveStatsData <- function(out.file, jsonSt){ 26 | file.out <- file(paste(folder.path, out.file, ".JSON", sep=""), 'wt') 27 | cat(jsonSt, file=file.out, fill = TRUE) 28 | close(file.out) 29 | } 30 | 31 | #write a csv file 32 | saveStatsDataAsCSV <- function(data, out.file){ 33 | file.location <- paste(folder.path, out.file, ".csv", sep="") 34 | write.csv(data, file.location, row.names=F) 35 | } 36 | 37 | # 38 | # batting 39 | # 40 | battingStastByPlayer <- function(l){ 41 | #return (data.frame(matrix(unlist(l), nrow=1, byrow=T))) 42 | return (data.frame(l$team$fullName, l$player$fullName, l$player$nationality, l$player$dateOfBirth, 43 | matrix(unlist(l$battingStats), nrow=1, byrow=T), stringsAsFactors = F)) 44 | } 45 | 46 | getBattingStats <- function(ipl.json, year){ 47 | ipl.batting.stats <- ldply(ipl.json$mostRuns[[1]]$topPlayers, battingStastByPlayer) 48 | colnames(ipl.batting.stats) <- c("team", "player", "country", "dob", names(ipl.json$mostRuns[[1]]$topPlayers[[1]]$battingStats)) 49 | ipl.batting.stats$hs <- sapply(ipl.batting.stats$hs, function(x) sub("\\*", "", x)) 50 | ipl.batting.stats$year <- year 51 | # get age 52 | ipl.batting.stats$age_in_months <- sapply(ipl.batting.stats$dob, function(x) getAge(x, paste(year, 
"/04/01", sep=""), "m")) 53 | return(ipl.batting.stats) 54 | } 55 | 56 | battingStats <- function(year){ 57 | ipl.url <- paste(ipl.base.url, year, ipl.end.url.runs, sep="") 58 | print(ipl.url) 59 | content <- try(getURL(ipl.url), silent=F) 60 | if(class(content) %in% c("try-error")) stop("Can't continue...") 61 | content <- sub("\\);", "", sub(paste("onMostRuns", "\\(", sep=""), "", content)) 62 | saveStatsData(paste("ipl", year, "batting-stats", sep="-"), content) 63 | ipl.json <- fromJSON(content) 64 | return(getBattingStats(ipl.json, year)) 65 | } 66 | 67 | battingStatsFromFiles <- function(year){ 68 | file.name <- paste(folder.path, paste("ipl", year, "batting-stats", sep="-"), ".json", sep="") 69 | file.out <- file(file.name) 70 | ipl.json <- fromJSON(paste(readLines(file.out), collapse = "\n")) 71 | close(file.out) 72 | #assign("ipl", ipl.json, envir=.GlobalEnv) 73 | return(getBattingStats(ipl.json, year)) 74 | } 75 | 76 | # 77 | # Bowling stats 78 | # 79 | bowlingStastByPlayer <- function(l){ 80 | #return (data.frame(matrix(unlist(l), nrow=1, byrow=T))) 81 | return (data.frame(l$team$fullName, l$player$fullName, l$player$nationality, l$player$dateOfBirth, 82 | matrix(unlist(l$bowlingStats), nrow=1, byrow=T), stringsAsFactors = F)) 83 | } 84 | 85 | getBowlingStats <- function(ipl.json, year){ 86 | ipl.bowling.stats <- ldply(ipl.json$mostWickets[[1]]$topPlayers, bowlingStastByPlayer) 87 | colnames(ipl.bowling.stats) <- c("team", "player", "country", "dob", names(ipl.json$mostWickets[[1]]$topPlayers[[1]]$bowlingStats)) 88 | ipl.bowling.stats$year <- year 89 | # get age 90 | ipl.bowling.stats$age_in_months <- sapply(ipl.bowling.stats$dob, function(x) getAge(x, paste(year, "/04/01", sep=""), "m")) 91 | 92 | return(ipl.bowling.stats) 93 | } 94 | 95 | bowlingStats <- function(year){ 96 | ipl.url <- paste(ipl.base.url, year, ipl.end.url.wickets, sep="") 97 | print(ipl.url) 98 | content <- try(getURL(ipl.url), silent=T) 99 | if(class(content) %in% c("try-error")) stop("Can't continue...") 100 | content <- sub("\\);", "", sub(paste("onMostWickets", "\\(", sep=""), "", content)) 101 | saveStatsData(paste("ipl", year, "bowling-stats", sep="-"), content) 102 | ipl.json <- fromJSON(content) 103 | return(getBowlingStats(ipl.json, year)) 104 | } 105 | 106 | bowlingStatsFromFiles <- function(year){ 107 | file.name <- paste(folder.path, paste("ipl", year, "bowling-stats", sep="-"), ".json", sep="") 108 | file.out <- file(file.name) 109 | ipl.json <- fromJSON(paste(readLines(file.out), collapse = "\n")) 110 | close(file.out) 111 | return(getBowlingStats(ipl.json, year)) 112 | } 113 | 114 | # 115 | # standings 116 | # 117 | standingStatsBySeason <- function(l){ 118 | return (data.frame(l$position, l$team$fullName, l$played, l$won, l$lost, l$tied, l$noResult, l$points, l$netRunRate)) 119 | } 120 | 121 | getStandingStats <- function(ipl.json, year){ 122 | ipl.standing.stats <- ldply(ipl.json$groups[[1]]$standings, standingStatsBySeason) 123 | colnames(ipl.standing.stats) <- c("position", "team", "played", "won", "lost", "tied", "noResult", "points", "netrunrate") 124 | ipl.standing.stats$year <- year 125 | return(ipl.standing.stats) 126 | } 127 | 128 | standingStats <- function(year){ 129 | ipl.url <- paste(ipl.base.url, year, ipl.end.url.standings, sep="") 130 | print(ipl.url) 131 | content <- try(getURL(ipl.url), silent=F) 132 | if(class(content) %in% c("try-error")) stop("Can't continue...") 133 | content <- sub("\\);", "", sub(paste("onGroupStandings", "\\(", sep=""), "", content)) 134 | 
saveStatsData(paste("ipl", year, "standings-stats", sep="-"), content) 135 | ipl.json <- fromJSON(content) 136 | #assign("ipl", ipl.json, envir=.GlobalEnv) 137 | return(getStandingStats(ipl.json, year)) 138 | } 139 | 140 | standingStatsFromFiles <- function(year){ 141 | file.name <- paste(folder.path, paste("ipl", year, "standings-stats", sep="-"), ".json", sep="") 142 | file.out <- file(file.name) 143 | ipl.json <- fromJSON(paste(readLines(file.out), collapse = "\n")) 144 | close(file.out) 145 | #assign("ipl", ipl.json, envir=.GlobalEnv) 146 | return(getStandingStats(ipl.json, year)) 147 | } 148 | 149 | 150 | # 151 | # get IPL stats 152 | # 153 | getStats <- function(){ 154 | init() 155 | # 156 | #get batting stats 157 | # 158 | batting.stats <- adply(years, 1, battingStatsFromFiles) 159 | colnames(batting.stats) <- c("season", colnames(batting.stats)[2:9], "fours", "sixes", 160 | colnames(batting.stats)[12:13], "fiftys", "hundreds", 161 | colnames(batting.stats)[16:ncol(batting.stats)]) 162 | 163 | saveStatsDataAsCSV(batting.stats, "IPL-T20-Batting-Stats-For-All-Seasons") 164 | 165 | # 166 | #get bowling stats 167 | # 168 | bowling.stats <- adply(years, 1, bowlingStatsFromFiles) 169 | colnames(bowling.stats) <- c("season", colnames(bowling.stats)[2:10], "fours", 170 | "sixes", colnames(bowling.stats)[13:21], "four_wickets", 171 | "five_wickets", "ten_wickets", 172 | colnames(bowling.stats)[25:ncol(bowling.stats)]) 173 | 174 | saveStatsDataAsCSV(bowling.stats, "IPL-T20-Bowling-Stats-For-All-Seasons") 175 | 176 | # 177 | # get group stats 178 | # 179 | group.standings <- adply(years, 1, standingStatsFromFiles) 180 | colnames(group.standings) <- c("season", colnames(group.standings)[2:ncol(group.standings)]) 181 | saveStatsDataAsCSV(group.standings, "IPL-T20-Standing-Stats-For-All-Seasons") 182 | } 183 | 184 | if(F){ 185 | 186 | #http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2013/groupStandings.js 187 | #http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2012/groupStandings.js 188 | 189 | ipl.json <- fromJSON(paste(readLines(paste(folder.path, "ipl-2013-sample-bat.json", sep="")), collapse = "\n")) 190 | ipl <- read.csv(paste(folder.path, "IPLstats.csv", sep=""), stringsAsFactors=F) 191 | 192 | #http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2012/stats/player/full/mostWickets.js 193 | for(y in years){ 194 | aaply(ipl, 1, getStat, y) 195 | } 196 | 197 | ipl.url <- "http://dynamic.pulselive.com/dynamic/data/core/cricket/2012/ipl2012/stats/player/full/mostRuns.js" 198 | content <- getURL(ipl.url) 199 | content <- sub("\\);", "", sub("onMostRuns\\(", "", content),) 200 | 201 | theurl <- "http://en.wikipedia.org/wiki/Brazil_national_football_team" 202 | tables <- readHTMLTable(theurl) 203 | n.rows <- unlist(lapply(tables, function(t) dim(t)[1])) 204 | tables[[which.max(n.rows)]] 205 | colnames(ipl.batting.stats) <- sapply(names(unlist(ipl.json$mostRuns[[1]]$topPlayers[[1]])), function(x) {gsub("(.*)\\.", "",x)}) 206 | gsub("(.*)\\.", "", "team.fullName") 207 | } 208 | 209 | 210 | -------------------------------------------------------------------------------- /FetchData/FetchImmigrationData.R: -------------------------------------------------------------------------------- 1 | require(RCurl) 2 | require(data.table) 3 | require(plyr) 4 | setwd("/Users/homemac/dolphy/R/pads/raw-data/immigration") 5 | 6 | 7 | fetchData <- function(url,yr) { 8 | #url <- "http://www.flcdatacenter.com/download/H1B_efile_FY02_text.zip" 9 | #yr <- 2 10 | #temp <- tempfile() 
11 | #tempdir() 12 | temp <- paste("temp", yr, ".zip", sep="_") 13 | download.file(url,temp) 14 | #data <- read.csv(unz(temp, read.file)) 15 | #write.csv(data,csv.file, row.names=F) 16 | #l.f <- unzip("H1B_efile_FY03_text.zip") #unzips the file 17 | unzip(temp) #unzips the file 18 | unlink(temp) 19 | #"EFILE_FY2006.txt" 20 | #H1B_efile_FY02.txt 21 | } 22 | 23 | fy <- 5:6 24 | u <- paste("http://www.flcdatacenter.com/download/H1B_efile_FY0", fy, "_text.zip", sep="") 25 | sapply(seq_along(u), function(i) fetchData(u[i], fy[i])) 26 | 27 | readTextData <- function() { 28 | filenames <- list.files(path=".", pattern="*.txt") 29 | 30 | print(i) 31 | #yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(\\./)(.*)_(.*)_(.*)(\\.txt)", "\\4", i))) 32 | yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(.*)_(.*)_(.*)(\\.txt)", "\\3", i))) 33 | filename <- gsub("(.*)(\\.txt)", "\\1", i) 34 | print(yr) 35 | print(filename) 36 | pad <- data.table(read.csv(i, stringsAsFactors=F)) 37 | col.names <- colnames(pad) 38 | setnames(pad, col.names, tolower(col.names)) 39 | #colnames(pad) <- tolower(colnames(pad)) 40 | write.csv(pad,paste(filename, ".csv", sep=""), row.names=F) 41 | } 42 | } 43 | 44 | readCSVData <- function() { 45 | filenames <- list.files(path=".", pattern="*.csv") 46 | 47 | for(i in filenames) { 48 | print(i) 49 | #yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(\\./)(.*)_(.*)_(.*)(\\.txt)", "\\4", i))) 50 | yr <- 2000 + as.integer(gsub("[^0-9]", "", gsub("(.*)_(.*)_(.*)(\\.csv)", "\\3", i))) 51 | pad <- data.table(read.csv(i, stringsAsFactors=F)) 52 | col.names <- colnames(pad) 53 | setnames(pad, col.names, tolower(col.names)) 54 | generateStats(pad, yr) 55 | } 56 | } 57 | 58 | generateStats <- function(dt, yr) { 59 | # 60 | # by total 61 | # 62 | dt.total <- dt[,list(year=yr, total=length(approval_status), 63 | approved=length(which(approval_status=="Certified")), 64 | denied=length(which(approval_status=="Denied")), 65 | hold=length(which(approval_status=="Hold")), 66 | pending=length(which(approval_status=="Pending")))] 67 | 68 | # save this to csv 69 | write.csv(dt.total, paste("US_H1B_breakdown_by_status_", yr, ".csv", sep=""), row.names=F) 70 | # bind this to csv 71 | H1B.total <<- rbind(H1B.total, dt.total) 72 | 73 | # 74 | # by company 75 | # 76 | dt.by.company <- dt[, list(year=yr, total=length(approval_status), 77 | approved=length(which(approval_status=="Certified")), 78 | denied=length(which(approval_status=="Denied")), 79 | hold=length(which(approval_status=="Hold")), 80 | pending=length(which(approval_status=="Pending"))), 81 | by=list(employer=name)][order(-total)] 82 | # save this to csv 83 | write.csv(dt.by.company[1:15,], paste("US_H1B_top_companies_", yr, ".csv", sep=""), row.names=F) 84 | # bind this to csv 85 | H1B.by.company <<- rbind(H1B.by.company, dt.by.company) 86 | 87 | # 88 | # by state 89 | # 90 | dt.by.state <- dt[,list(year=yr, total=length(approval_status), 91 | approved=length(which(approval_status=="Certified")), 92 | denied=length(which(approval_status=="Denied")), 93 | hold=length(which(approval_status=="Hold")), 94 | pending=length(which(approval_status=="Pending"))), 95 | by=state][order(-total)] 96 | # save this to csv 97 | write.csv(dt.by.state[1:15,], paste("US_H1B_top_states_", yr, ".csv", sep=""), row.names=F) 98 | # bind this to csv 99 | H1B.by.state <<- rbind(H1B.by.state, dt.by.state) 100 | 101 | # 102 | # by city 103 | # 104 | 105 | dt.by.city <- dt[,list(year=yr, total=length(approval_status), 106 | 
approved=length(which(approval_status=="Certified")), 107 | denied=length(which(approval_status=="Denied")), 108 | hold=length(which(approval_status=="Hold")), 109 | pending=length(which(approval_status=="Pending"))), 110 | by=city][order(-total)] 111 | # save this to csv 112 | write.csv(dt.by.city[1:15,], paste("US_H1B_top_cites_", yr, ".csv", sep=""), row.names=F) 113 | # bind this to csv 114 | H1B.by.city <<- rbind(H1B.by.city, dt.by.city) 115 | 116 | # 117 | # by title 118 | # 119 | dt.by.title <- dt[,list(year=yr, total=length(approval_status), 120 | approved=length(which(approval_status=="Certified")), 121 | denied=length(which(approval_status=="Denied")), 122 | hold=length(which(approval_status=="Hold")), 123 | pending=length(which(approval_status=="Pending"))), 124 | by=job_title][order(-total)] 125 | # save this to csv 126 | write.csv(dt.by.title[1:15,], paste("US_H1B_top_titles_", yr, ".csv", sep=""), row.names=F) 127 | # bind this to csv 128 | H1B.by.title <<- rbind(H1B.by.title, dt.by.title) 129 | 130 | 131 | 132 | #contribs <- read.csv("path/to/file", colClasses=c(CTRIB_AMT="Currency")) 133 | 134 | # 135 | # by wage 136 | # 137 | dt$wage_offered <- as.numeric(gsub("[\\$,]","",dt$wage_rate_1)) 138 | dt$prevailing_wages <- as.numeric(gsub("[\\$,]","",dt$prevailing_wage_1)) 139 | dt.by.wage <- dt[,list(year=yr, wage_offered, 140 | prevailing_wages, job_title, employer=name), 141 | by=list(rate_per_1=="Year", 142 | approval_status=="Certified")][,rate_per_1:=NULL][,approval_status:=NULL][order(-wage_offered)][wage_offered < 200000] 143 | # save this to csv 144 | write.csv(dt.by.wage, paste("US_H1B_top_wages_", yr, ".csv", sep=""), row.names=F) 145 | # bind this to csv 146 | H1B.by.wages <<- rbind(H1B.by.wages, dt.by.wage) 147 | } 148 | 149 | H1B.total <- data.table() 150 | H1B.by.company <- data.table() 151 | H1B.by.state <- data.table() 152 | H1B.by.city <- data.table() 153 | H1B.by.title <- data.table() 154 | H1B.by.wages <- data.table() 155 | 156 | readCSVData() 157 | write.csv(H1B.total, "US_H1B_breakdown_by_status_2002-2007.csv", row.names=F) 158 | write.csv(H1B.by.company, "US_H1B_top_companies_2002-2007.csv", row.names=F) 159 | write.csv(H1B.by.state, "US_H1B_top_states_2002-2007.csv", row.names=F) 160 | write.csv(H1B.by.city, "US_H1B_top_cities_2002-2007.csv", row.names=F) 161 | write.csv(H1B.by.title, "US_H1B_top_titles_2002-2007.csv", row.names=F) 162 | write.csv(H1B.by.wages, "US_H1B_top_wages_2002-2007.csv", row.names=F) 163 | 164 | 165 | #lt <- count(ldf2, "approval_status") 166 | 167 | # 'x' %in% colnames(df) 168 | lt2 <- as.data.frame.matrix(with(ldf2, table(name, approval_status))) 169 | lt2$company <- rownames(lt2) 170 | rownames(lt2) <- seq(1:nrow(lt2)) 171 | 172 | 173 | 174 | u <- "http://www.omegahat.org/RCurl/data.gz" 175 | 176 | 177 | 178 | 179 | setClass("Currency") 180 | setAs("character", "Currency", 181 | function(from) as.numeric(sub("$","",from, fixed=TRUE))) 182 | 183 | setClass("CurrencyCommas") 184 | setAs("character", "CurrencyCommas", 185 | function(from) as.numeric(gsub(",", "", from) ) ) 186 | gsub("[\\$,]","","$1,075.20 ") 187 | if(url.exists(u)) { 188 | content <- getBinaryURL(u) #read in the https as a binary 189 | dump.file <- file("temp.zip", open = "wb") 190 | writeBin(content, dump.file) 191 | close(dump.file) #use this to close the connection 192 | l <- unzip("temp.zip", list=TRUE) #unzips the file 193 | #dir() #Look at files in the directory. 
We want "SRC2010.mdb" 194 | unlink("temp.zip") #delete the zip file 195 | # shell.exec("SRC2010.mdb") 196 | } 197 | library(RODBC) 198 | channel <- odbcConnectAccess("http://www.foreignlaborcert.doleta.gov/pdf/quarter_4_2011/H-1b_iCert_LCA_FY2011_Q4.mdb 199 | ") 200 | demographics <- sqlFetch(channel, sqtable="Demographic Factors", colnames = FALSE, rownames = FALSE) 201 | ave.class.sz <- sqlFetch(channel, sqtable="Average Class Size", colnames = FALSE, rownames = FALSE) 202 | att.susp <- sqlFetch(channel, sqtable="Attendance and Suspensions", colnames = FALSE, rownames = FALSE) 203 | drop.out <- sqlFetch(channel, sqtable="High School Completers", colnames = FALSE, rownames = FALSE) 204 | staff <- sqlFetch(channel, sqtable="Staff", colnames = FALSE, rownames = FALSE) 205 | close(channel) 206 | 207 | 208 | ## method 2 using curl 209 | CAINFO = paste(system.file(package="RCurl"), "/CurlSSL/ca-bundle.crt", sep = "") 210 | 211 | cookie = 'cookiefile.txt' 212 | curlH = getCurlHandle( 213 | cookiefile = cookie, 214 | useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en - US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6", 215 | header = FALSE, 216 | verbose = TRUE, 217 | netrc = TRUE, 218 | maxredirs = as.integer(20), 219 | followlocation = TRUE, 220 | ssl.verifypeer = TRUE) 221 | 222 | 223 | destfile = "log2.csv" 224 | content = getBinaryURL(url, curl = curlH, cainfo = CAINFO) 225 | ## write to file 226 | writeBin(content, destfile) 227 | ## read from binary object 228 | 229 | ## read from binary object 230 | csv.data2 <- read.csv(textConnection(rawToChar(content))) 231 | head(csv.data2) 232 | csv.data2 == csv.data -------------------------------------------------------------------------------- /CreatePADS.R: -------------------------------------------------------------------------------- 1 | # Read a CSV file 2 | # Classify the column types 3 | # Determine measures and dimensions 4 | # Generate meta data and store JSON 5 | # Query Solr and store the information in SOLR 6 | # Save the data as cache 7 | 8 | # Author: Jitender Aswani, Co-Founder @datadolph.in 9 | # Date: 3/15/2013 10 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 11 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 12 | # All rights reserved. 
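# Illustrative sketch, not part of the original script: callers such as the Generate*PADS.R
# scripts drive this pipeline by filling a small metadata list and handing it to padify()
# together with a data.frame whose first column is the dimension and whose remaining columns
# are measures. Everything below is toy data, and a live MySQL/Mongo setup is assumed.
if (FALSE) {
  initializeSystem()                        # mongo/mysql handles, logging globals
  series <- list(source      = "Demo source",
                 category    = "Demo",
                 subcategory = "Examples",
                 desc        = "A two-column toy series",
                 tags        = "demo,examples",
                 pagetag     = "demo",
                 name        = "toy.series",
                 country     = "Nowhere",
                 title       = "Toy Series (2000-2002)")
  toy <- data.frame(year = 2000:2002, value = c(10, 12, 15))
  pad.id <- padify(series, toy)             # writes the cache, a MySQL row and a Mongo doc
  cleaupSystem()                            # close DB connections and dump logs
}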
13 | 14 | require("RJSONIO") 15 | require("plyr") 16 | require("data.table") 17 | require("stringr") 18 | require("lubridate") 19 | require("zoo") 20 | 21 | source("UtilPADS.R") 22 | source("ClassifyData.R") 23 | source("MySQLFunctions.R") 24 | source("MongoFns.R") 25 | source("DefaultChartForPAD.R") 26 | 27 | # initializeSystem <- function(stack=0){ 28 | # 29 | # # turn off scientific formatting - getOption("scipen") 30 | # options(scipen=999) 31 | # 32 | # #initialize solr docs 33 | # #assign("solr.index.docs", list(), envir=.GlobalEnv) 34 | # 35 | # #mongo 36 | # mongo <- getDefaultMongoDBCon() 37 | # assign("mongo", mongo, envir=.GlobalEnv) 38 | # 39 | # #initialize stack 40 | # mysql.db <- list(user="ddfin_dev", pass="BfNdW87Ym9FmcYj7", name="ddfin_dev", host="localhost") 41 | # if(stack == 1) { 42 | # #prod 43 | # mysql.db <- list(user="ddfin_prod", pass="8MQ5CRDzQufHKXTx", name="ddfin_prod", host="localhost") 44 | # } 45 | # assign("mysql.db", mysql.db, envir=.GlobalEnv) 46 | # 47 | # #mysql 48 | # mysql <- getMSDBCon() 49 | # assign("mysql", mysql, envir=.GlobalEnv) 50 | # 51 | # #error counter 52 | # assign("error.count", 0, envir=.GlobalEnv) 53 | # 54 | # #initialize log file 55 | # assign("vec.log", vector(), envir=.GlobalEnv) 56 | # 57 | # #verbose mode 58 | # assign("verbose", F, envir=.GlobalEnv) 59 | # 60 | # #min log 61 | # assign("log.all", F, envir=.GlobalEnv) 62 | # } 63 | 64 | # 65 | # loadStates 66 | # 67 | loadStates <- function(){ 68 | states <- data.table(readFile("./pads/raw-data/states_names.csv")) 69 | setkey(states, state_code) 70 | assign("states", states, envir=.GlobalEnv) 71 | } 72 | 73 | 74 | 75 | # 76 | # Initialize Padification Process 77 | # 78 | initializeSystem <- function(stack=0){ 79 | 80 | # turn off scientific formatting - getOption("scipen") 81 | options(scipen=999) 82 | 83 | # day and months facors 84 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December") 85 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday") 86 | assign("lMonths", lMonths, envir=.GlobalEnv) 87 | assign("lDays", lDays, envir=.GlobalEnv) 88 | 89 | #loadStates 90 | loadStates() 91 | 92 | #initialize solr docs 93 | #assign("solr.index.docs", list(), envir=.GlobalEnv) 94 | 95 | #mongo 96 | mongo <- getDefaultMongoDBCon() 97 | assign("mongo", mongo, envir=.GlobalEnv) 98 | 99 | #initialize stack 100 | mysql.db <- list(user="ddfin_dev", pass="b31nd1$$", name="ddfin_dev", host="localhost") 101 | if(stack == 1) { 102 | #prod 103 | mysql.db <- list(user="ddfin_prod", pass="g0f0r1t", name="ddfin_prod", host="localhost") 104 | } 105 | assign("mysql.db", mysql.db, envir=.GlobalEnv) 106 | 107 | #mysql 108 | mysql <- getMSDBCon() 109 | assign("mysql", mysql, envir=.GlobalEnv) 110 | 111 | #error counter 112 | assign("error.count", 0, envir=.GlobalEnv) 113 | 114 | #initialize log file 115 | assign("vec.log", vector(), envir=.GlobalEnv) 116 | 117 | #verbose mode 118 | assign("verbose", F, envir=.GlobalEnv) 119 | 120 | #min log 121 | assign("log.all", F, envir=.GlobalEnv) 122 | } 123 | 124 | # 125 | #cleanup 126 | # 127 | cleaupSystem <- function() { 128 | 129 | #disconnect Mongo 130 | disconnectMongoDB(mongo) 131 | 132 | # disconnect msdb 133 | disconnectMSDB(mysql) 134 | 135 | #save the logs 136 | dumpLogs() 137 | 138 | # save the solr index to FS for later editing - this is now done through mysql 139 | #persistSolrIndex(solr.index.docs, dataset) 140 | } 141 | 142 | # 143 | # create pad 
- the padification process starts here 144 | # 145 | createPAD <- function(pad, title, desc, category, subcategory, data.source, source.file, 146 | tags, pagetag) { 147 | 148 | if(ncol(pad) == 0) stop('dolphy can not padify a pad that has zero columns') 149 | 150 | #only continue if there is one or more colum 151 | if(verbose) print(paste("Starting the padification process for source file", source.file, sep=":")) 152 | logMessage(paste("Starting the padification process for source file", source.file, sep=":")) 153 | 154 | #Very first task, classify pad 155 | pad.classified <- classifyData(pad) 156 | 157 | #if(verbose) print(pad.classified) 158 | 159 | # Generate extra date columns 160 | #if(length(which(col.classes == "Date")) > 0) { 161 | # pad.classified <- generateExtraDateCols(pad.classified, which(col.classes == "Date")) 162 | # col.classes <- laply(pad.classified, getClass) 163 | #} 164 | 165 | meta.data <- list(id="", "title"=title, "desc"=desc, 166 | "records"=as.integer(0), 167 | "columns"=as.integer(0), 168 | "category"=category, 169 | "subcategory"=subcategory, 170 | "src"=data.source, 171 | "src_file"= source.file, 172 | "cache_location"="", 173 | "dList"=list(), "mList"=list(), 174 | "tags"=tags, 175 | "pagetag"=pagetag 176 | ) 177 | 178 | #Start filling up the meta data 179 | meta.data$records <- nrow(pad.classified) 180 | meta.data$columns <- ncol(pad.classified) 181 | 182 | #Replace all meta chars from column names including spaces 183 | colnames(pad.classified) <- replaceMetaChars(colnames(pad.classified)) 184 | 185 | col.classes <- laply(pad.classified, getClass) 186 | col.names <- colnames(pad.classified) 187 | 188 | #Create a dict (name=value pair) 189 | cols <- list() 190 | cols[col.names]=col.classes 191 | meta.data$dList <- cols[which(unlist(cols)!="numeric")] 192 | meta.data$mList <- cols[which(unlist(cols)=="numeric")] 193 | #meta.data$mList <- names(which(sapply(pad.classified, is.numeric))) 194 | #meta.data$dList <- setdiff(colnames(pad.classified), meta.data$mList) 195 | 196 | # Get unique id for pad 197 | padID <- getPadUID() 198 | #print(paste("Assing pad id of ", padID, sep="::")) 199 | logMessage(paste("Assigning pad id of ", padID, sep="::")) 200 | meta.data$id <- padID 201 | #meta.data$meta_location <- getMetaDataFileURL(padID) 202 | meta.data$cache_location <- getCacheDataFileURL(padID) 203 | #meta.data$data_location <- getDataFileURL(padID) 204 | 205 | #Save transformed dataset as data.frame for later reading 206 | assign(padID, pad.classified) 207 | names <- c(eval(padID)) 208 | 209 | #get default chart info.... 
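  # getDefaultChartInfo() comes from DefaultChartForPAD.R (sourced at the top of this file);
  # it is expected to attach the default chart block that padify() may later extend with
  # x/y plot bands.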
210 | meta.data <- getDefaultChartInfo(pad.classified, meta.data) 211 | 212 | #Save the cahce 213 | if(verbose) print("saving cache") 214 | if(log.all) logMessage("Saving cache...") 215 | save(list=names, file=meta.data$cache_location) 216 | 217 | #Save meta data 218 | #jsonSt <- toJSON(meta.data) 219 | #saveMetaData(meta.data$meta_location, jsonSt) 220 | 221 | #Save underlying data into csv 222 | #saveData(pad.classified, meta.data$data_location) 223 | 224 | if(verbose) print("Padification Completed!") 225 | logMessage("Padification Completed!") 226 | return(meta.data) 227 | } 228 | 229 | # 230 | # padify 231 | # 232 | padify <- function(series, series.data, x.plot.band=NULL, y.plot.band=NULL){ 233 | #Remove rows that have NAs 234 | series.data <- series.data[rowSums(is.na(series.data)) != ncol(series.data),] 235 | 236 | # Remove rows that have at least 1 NA 237 | series.data <- series.data[complete.cases(series.data),] 238 | 239 | if("data.table" %in% class(series.data)) 240 | series.data <- as.data.frame(series.data, , stringsAsFactors=F) 241 | #create pad 242 | pmd <- try(createPAD(series.data, series$title, 243 | series$desc, series$category, series$subcategory, 244 | series$source, paste(series$name, series$country, sep="."), 245 | series$tags, series$pagetag), silent=T) 246 | 247 | #assign("pmd", pmd, envir=.GlobalEnv) 248 | # 249 | # Add this pad to system pads table in the database, to mongo db and save the cache 250 | # 251 | #check to see if the padification went through 252 | if(class(pmd) %in% c("try-error")) { 253 | if(verbose) print("couldn't padify process.") 254 | logMessage("couldn't padify the pad...") 255 | assign("error.count", error.count+1, envir=.GlobalEnv) 256 | } else { 257 | if(!is.null(x.plot.band)) 258 | pmd$default$xaxis$plotbands <- x.plot.band 259 | 260 | if(!is.null(y.plot.band)) 261 | pmd$default$yaxis$plotbands <- y.plot.band 262 | 263 | if(verbose) print("adding the pad to mysql") 264 | if(log.all) logMessage("Saving PAD meta data in the database...") 265 | insertPadToMySQL(pmd) 266 | 267 | # insert into mongo 268 | if(verbose) print("adding the pad to mongo db") 269 | if(log.all) print("adding the pad to mongo db") 270 | pad <- fromJSON(toJSON(pmd)) 271 | 272 | insertPadToMongo(pmd$id, pad) 273 | return (pmd$id) 274 | # #generate solr doc 275 | # if(verbose) print("adding the pad to solr vector") 276 | # if(log.all) print("adding the pad to mongo") 277 | # doc <- list("type"="pad", "id" = pmd$id, "title" = pmd$title, "desc"=pmd$desc, "category"=pmd$category, 278 | # "subcategory"=pmd$subcategory, "tags"=pmd$tags, "author"="system") 279 | # solr.index.docs[[length(solr.index.docs)+1]] <<- doc 280 | } 281 | } -------------------------------------------------------------------------------- /ClassifyData.R: -------------------------------------------------------------------------------- 1 | # Classify Data 2 | # Author: Jitender Aswani, Co-Founder @datadolph.in 3 | # Date: 3/15/2013 4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 6 | # All rights reserved. 
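# Illustrative sketch, not part of the original script: getDateFormat() below walks the
# regex/format pairs built by getAllowedDateFormats() and returns the first strptime format
# string that matches every sample value, or NULL when none does.
if (FALSE) {
  getDateFormat(c("2012-03-01", "2013-07-15"))   # "%Y-%m-%d"
  getDateFormat(c("March 2010", "Jan 2013"))     # "%B %Y"
  getDateFormat(c("hello", "world"))             # NULL
}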
7 | 8 | 9 | 10 | # 11 | # Generate extra date cols 12 | # 13 | generateExtraDateCols <- function(data, col.index) { 14 | lMonths <- c("January","February","March", "April","May","June","July","August","September", "October","November","December") 15 | lDays <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday") 16 | #data$Day_generated=factor(weekdays(data[,col.index]), levels=lDays, ordered=TRUE) 17 | data$Day_g=factor(weekdays(data[,col.index]), levels=lDays) 18 | #data$Month_generated=factor(months(data[,col.index]), levels=lMonths, ordered=TRUE) 19 | data$Month_g=factor(months(data[,col.index]), levels=lMonths) 20 | data$Quarter_g=as.factor(quarters(data[,col.index])) 21 | data$Year_g=as.factor(years(data[,col.index])) 22 | #data$Year=as.numeric(format(data[,col.index], "%Y")) 23 | #as.numeric(format(date1, "%m")) 24 | #format(date1, "%b") ## Month, char, abbreviated 25 | return(data) 26 | } 27 | # 28 | # Define all supported date formats 29 | # 30 | getAllowedDateFormats <- function() { 31 | #d = '([0 ]?[1-9]|[12][0-9]|3[01])' # 1-31 with leading zero or space for single digits 32 | #m = '([0 ]?[1-9]|1[0-2])' # 01-12 with leading zero or space for single digits 33 | d = '([0][1-9]|[12][0-9]|3[01])' # 1-31 with leading zero for single digits 34 | m = '([0][1-9]|1[0-2])' # 01-12 with leading zero for single digit 35 | Y = '(17[0-9][0-9]|18[0-9][0-9]|19[0-9][0-9]|20[0-9][0-9])' # 1700 - 2099 36 | #Y = '(19[0-9][0-9]|20[0-9][0-9])' # 1900 - 2099 37 | y = '([0-9][0-9])' # 00-99 38 | b='([a-zA-Z]{3})' 39 | B='([a-zA-Z]+)' 40 | #q = '([qQ][1-4])' 41 | #patterns <- c("%Y%m%d", "%m%d%y", "%m%d%Y", "%d%m%y", "%d%m%Y", "%y%m%d") 42 | #patterns <- c("Ymd", "mdy", "mdY", "dmy", "dmY", "ymd", "y", "Y", "by", "bY", "By", "BY") 43 | #removed %y from the list to avoid getting numerical column identified, moved %Y at the front to avoid 4 digit year being classified as %d%m%y 44 | patterns <- c("Y","Ymd", "mdy", "mdY", "dmy", "dmY", "ymd","by", "bY", "By", "BY") 45 | #patterns <- c("Y") 46 | separators <- c("", " ", "/", "-", ".") 47 | #strptime {base} /^[a-zA-Z]{3}\s+[0-9]{2}$/ 48 | allowed.date.list <- list() 49 | 50 | for (i in 1:length(patterns)) { 51 | for (j in 1:length(separators)) { 52 | p <- patterns[i] 53 | s <- separators[j] 54 | pattern.length <- nchar(p) 55 | chars <- substring(p,seq(1, pattern.length), seq(1, pattern.length)) 56 | m.chars <- paste("%", chars, sep="") 57 | format <- paste(m.chars,collapse=s) 58 | if(!s %in% c("", " ")){ 59 | s <- paste("\\", s, "", sep="" ) 60 | } 61 | regex <- '' 62 | if( pattern.length == 3) { 63 | regex <- paste('^', eval(get(chars[1])), s, eval(get(chars[2])), s, eval(get(chars[3])), '$', sep="") 64 | } else if ( pattern.length == 2) { 65 | regex <- paste('^', eval(get(chars[1])), s, eval(get(chars[2])),'$', sep="") 66 | } else if ( pattern.length == 1) { 67 | regex <- paste('^', eval(get(chars[1])),'$', sep="") 68 | } 69 | allowed.date.list[[regex]] = format 70 | } 71 | } 72 | return (allowed.date.list) 73 | } 74 | 75 | # 76 | # get date format for sample of values 77 | # 78 | getDateFormat <- function(sample.dates) { 79 | allowed.date.list <- getAllowedDateFormats() 80 | #sample.dates <- sample.dates[!is.na(sample.dates)] 81 | for(i in 1:length(allowed.date.list)) { 82 | #Check how many items matced a date pattern 83 | matched <- sum(grepl(names(allowed.date.list)[i], sample.dates)) 84 | if(matched == length(sample.dates)) { 85 | #print(names(allowed.date.list)[i]) 86 | return(allowed.date.list[[i]]) 87 | } 88 | } 89 | 
return(NULL) 90 | } 91 | 92 | #Test this function 93 | testDates <- function() { 94 | #dates <- c("March 2010","December 2011", "February 2012", "March 2014", "Jan 2013") 95 | dates <- c("1962", "1949", "1932", "1963", "1925")#, 1937, 1912, 1944, 1930, 1958, 1939, 1919, 1915) 96 | #dates <- seq(as.Date("2000/1/1"), by="month", length.out=1000) 97 | #Remove NA: dates <- dates[!is.na(dates)] 98 | date.format <- getDateFormat(dates) 99 | #d <- as.Date(dates, date.format) 100 | if(date.format %in% c("%y", "%Y")) { 101 | dates = as.Date(paste("1 1 ", dates, sep=""), format=paste("%d %m ", date.format, sep="")) 102 | } else if (date.format %in% c("%b%y","%b %y", "%b/%y", "%b-%y", "%b.%y", 103 | "%b%Y", "%b %Y", "%b/%Y", "%b-%Y", "%b.%Y", 104 | "%B%y", "%B %y", "%B/%y", "%B-%y", "%B.%y", 105 | "%B%Y", "%B %Y", "%B/%Y", "%B-%Y", "%B.%Y")) { 106 | sep <- gsub("%[a-zA-Z]{1}(.*)%[a-zA-Z]{1}", "\\1", date.format) 107 | dates = as.Date(paste("1", dates, sep=sep), format=paste("%d", date.format, sep=sep)) 108 | 109 | } else { 110 | dates = as.Date(dates, format=date.format) 111 | } 112 | } 113 | 114 | # 115 | # Test if date 116 | # 117 | isDate <- function(col, colname="date") { 118 | allowed.date.headers <- c("date", "time", "index") 119 | #if(tolower(colname) %in% allowed.date.headers || grepl("[date]")) { 120 | # print("date found in the header") 121 | #} 122 | return (getDateFormat(col)) 123 | } 124 | 125 | # 126 | # Test if logical 127 | # 128 | isColLogical <- function(col) { 129 | if(length(levels(factor(col))) == 2) { 130 | bool.types <- c('true', 'false', 't', 'f', '1', '0', 'y', 'n', 'yes', 'no') 131 | countBool <- sum(sapply(tolower(col), function(x) x %in% bool.types), na.rm=TRUE) 132 | if (countBool == length(col)) 133 | return(TRUE) 134 | else 135 | return(FALSE) 136 | } else 137 | return (FALSE) 138 | } 139 | 140 | # 141 | #test if number 142 | # 143 | isNumber <- function(col) { 144 | countBool <- sum(sapply(col, is.numeric), na.rm=TRUE) 145 | if (countBool == length(col)) 146 | return(TRUE) 147 | else 148 | return(FALSE) 149 | } 150 | 151 | # 152 | #Main funciton which takes a column (or a vector) and attempts to classify it... 
153 | # 154 | classifyData <- function(data) { 155 | #Default sample size 156 | sample.size <- 30 157 | if(nrow(data) <= 30) 158 | sample.size <- nrow(data) 159 | 160 | #Remove rows that have NA 161 | if(ncol(data) == 1) { # this is an intersting one - if pad has only one column, the above line returns a vector rather than a data.frame hence this loop 162 | col.names <- colnames(data) 163 | data <- as.data.frame(data[rowSums(is.na(data)) != ncol(data),]) 164 | 165 | #Take a small sample 166 | data.sample <- as.data.frame(data[sample(1:nrow(data)[1], size=sample.size, replace=FALSE),]) 167 | colnames(data) <- col.names 168 | } else { 169 | data <- data[rowSums(is.na(data)) != ncol(data),] 170 | #Take a small sample 171 | data.sample <- data[sample(1:nrow(data)[1], size=sample.size, replace=FALSE),] 172 | } 173 | cls <- sapply(data.sample, class) 174 | #print(cls) 175 | classified <- list() 176 | col.names <- colnames(data.sample) 177 | for (i in 1:ncol(data.sample)) { 178 | 179 | col <- data.sample[,i] 180 | col <- col[!is.na(col)] #remove NA 181 | 182 | #check for boolean 183 | if(isColLogical(col)) { 184 | #storage.mode(data[,i]) = "logical" 185 | data[,i] = as.factor(data[,i]) 186 | #classes.classified[[i]] = "logical" 187 | next 188 | } 189 | 190 | #check for date names (in date, time) 191 | date.format <- isDate(col) 192 | #print(date.format) 193 | if(!is.null(date.format)) { 194 | if(date.format %in% c("%y", "%Y")) { 195 | data[,i] = as.Date(paste("1 1 ", data[,i], sep=""), format=paste("%d %m ", date.format, sep="")) 196 | } else if (date.format %in% c("%b%y","%b %y", "%b/%y", "%b-%y", "%b.%y", 197 | "%b%Y", "%b %Y", "%b/%Y", "%b-%Y", "%b.%Y", 198 | "%B%y", "%B %y", "%B/%y", "%B-%y", "%B.%y", 199 | "%B%Y", "%B %Y", "%B/%Y", "%B-%Y", "%B.%Y")) { 200 | sep <- gsub("%[a-zA-Z]{1}(.*)%[a-zA-Z]{1}", "\\1", date.format) 201 | data[,i] = as.Date(paste("1", data[,i], sep=sep), format=paste("%d", date.format, sep=sep)) 202 | } else { 203 | data[,i] = as.Date(data[,i], format=date.format) 204 | generateExtraDateCols(data, i) 205 | } 206 | next 207 | } 208 | 209 | #check for int 210 | if (isNumber(col)){ 211 | data[,i] = as.numeric(data[,i]) 212 | next 213 | } 214 | #check for factor 215 | 216 | } 217 | if(verbose) print("Classification Done!") 218 | logMessage("Classification Done!") 219 | 220 | return(data) 221 | } 222 | 223 | 224 | testClassification <- function() { 225 | 226 | #l <- c(1,0,1,0,0,0,1,0) 227 | #z <- sample(c(TRUE,FALSE),1000000,rep=TRUE) 228 | #z <- tolower(z) 229 | #Geneates a matrix 230 | #date.patterns <- unlist( 231 | # lapply(separtors, function(y) 232 | # lapply(patterns, function(x) paste(substring(x,seq(1,nchar(x), 2), seq(2,nchar(x), 2)), 233 | # collapse=y) 234 | # ))) 235 | 236 | #sample.df <- data.frame(bigmove=sample(c("y","n"),1000,rep=TRUE), 237 | # date=seq(as.Date("2000/1/1"), by="month", length.out=1000), 238 | # value=rep(1:100,100), stringsAsFactors = FALSE) 239 | 240 | sample.df <- data.frame(dates=c("1-31-2012", "1-1-2011", "1-1-2010", "1-1-2009"), pass=c(1,0,1,0), name=c("a", "bsd", "adaads", "dafds"), stringsAsFactors = FALSE) 241 | print(sapply(sample.df, class)) 242 | data <- classifyData(sample.df) 243 | print(sapply(data, class)) 244 | } 245 | 246 | if(FALSE) 247 | { 248 | #Symbol Meaning Example 249 | #%d day as a number (0-31) 01-31 250 | #%a 251 | #%A abbreviated weekday 252 | #unabbreviated weekday Mon 253 | #Monday 254 | #%m month (00-12) 00-12 255 | #%b 256 | #%B abbreviated month 257 | #unabbreviated month Jan 258 | #January 259 | #%y 260 | 
#%Y 2-digit year 261 | #4-digit year 07 262 | #2007 263 | # metaChar = c("$","*","+",".","?","[","^","{","|","(","\\") 264 | # 265 | #`~!@#$%^&*()_|+\-=?;:'",.<>\{\}\[\]\\\/] 266 | 267 | } -------------------------------------------------------------------------------- /FetchData/GenerateFlightStats.R: -------------------------------------------------------------------------------- 1 | # 2 | # Generate Flight Stats 3 | # 4 | initForStats <- function(){ 5 | assign("flights.folder.path", "./pads/raw-data/flights/", envir=.GlobalEnv) 6 | assign("verbose", T, envir=.GlobalEnv) 7 | flights.files <- list.files(path=flights.folder.path, pattern="*.csv.gz") 8 | assign("flights.files", flights.files, envir=.GlobalEnv) 9 | assign("period", length(flights.files), envir=.GlobalEnv) 10 | } 11 | 12 | 13 | getFlightsStatusByAirlines <- function(flights, yr){ 14 | # 15 | # get flights stats By airlines 16 | # 17 | if(verbose) cat("Getting stats for airlines:", '\n') 18 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)), 19 | flights=length(origin), 20 | flights_cancelled=sum(cancelled, na.rm=T), 21 | flights_diverted=sum(diverted, na.rm=T), 22 | flights_departed_late=length(which(depdelay > 0)), 23 | flights_arrived_late=length(which(arrdelay > 0)), 24 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 25 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 26 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])), 27 | miles_traveled=sum(distance, na.rm=T) 28 | ), 29 | by=uniquecarrier][, year:=yr] 30 | #change col order 31 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)])) 32 | #save this data 33 | saveData(airlines.stats, paste(flights.folder.path, "stats/5/airlines_stats_", yr, ".csv", sep="")) 34 | #clear up space 35 | rm(airlines.stats) 36 | 37 | # 38 | # get flights stats By airlines by month 39 | # 40 | if(verbose) cat("Getting stats for airlines by month:", '\n') 41 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)), 42 | flights=length(origin), 43 | flights_cancelled=sum(cancelled, na.rm=T), 44 | flights_diverted=sum(diverted, na.rm=T), 45 | flights_departed_late=length(which(depdelay > 0)), 46 | flights_arrived_late=length(which(arrdelay > 0)), 47 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 48 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 49 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])), 50 | miles_traveled=sum(distance, na.rm=T) 51 | ), 52 | by=list(uniquecarrier, month)][, year:=yr] 53 | #change col order 54 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)])) 55 | #save this data 56 | saveData(airlines.stats, paste(flights.folder.path, "stats/6/airlines_stats_monthly_", yr, ".csv", sep="")) 57 | #clear up space 58 | rm(airlines.stats) 59 | 60 | } 61 | 62 | getFlightsStatsByAirport <- function(flights, yr){ 63 | # 64 | # get flights stats By airport 65 | # 66 | if(verbose) cat("Getting stats for airport:", '\n') 67 | airport.stats <- flights[, list(flights=length(uniquecarrier), 68 | flights_cancelled=sum(cancelled, na.rm=T), 69 | flights_departed_late=length(which(depdelay > 0)), 70 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 71 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 72 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])) 73 | ), 74 | by=origin][, year:=yr] 75 | #change col order 76 | 
setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)])) 77 | #save this data 78 | saveData(airport.stats, paste(flights.folder.path, "stats/3/airport_stats_", yr, ".csv", sep="")) 79 | #clear up space 80 | rm(airport.stats) 81 | 82 | # 83 | # get flights stats By airport by month 84 | # 85 | if(verbose) cat("Getting stats for airport by month:", '\n') 86 | airport.stats <- flights[, list(flights=length(uniquecarrier), 87 | flights_cancelled=sum(cancelled, na.rm=T), 88 | flights_departed_late=length(which(depdelay > 0)), 89 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 90 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 91 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])) 92 | ), by=list(origin, month)][, year:=yr] 93 | #change col order 94 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)])) 95 | #save this data 96 | saveData(airport.stats, paste(flights.folder.path, "stats/4/airport_stats_monthly_", yr, ".csv", sep="")) 97 | #clear up space 98 | rm(airport.stats) 99 | } 100 | 101 | getFlightStatsForYear <- function(flights, yr){ 102 | # 103 | # starting with flights data at aggregate level for every year 104 | # 105 | if(verbose) cat("Getting flight stats: ", '\n') 106 | flights.stats <- flights[, list(airlines = length(unique(uniquecarrier)), 107 | flights=length(uniquecarrier), 108 | flights_cancelled=sum(cancelled, na.rm=T), 109 | flights_diverted=sum(diverted, na.rm=T), 110 | flights_departed_late=length(which(depdelay > 0)), 111 | flights_arrived_late=length(which(arrdelay > 0)), 112 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 113 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 114 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])), 115 | miles_traveled=sum(distance, na.rm=T), 116 | dep_airports=length(unique(origin)), 117 | arr_airports=length(unique(dest)), 118 | all_airports=length(union(unique(origin), unique(dest))), 119 | flights_delayed_reason_carrier=sum(!is.na(carrierdelay)), 120 | flights_delayed_reason_weather=sum(!is.na(weatherdelay)), 121 | flights_delayed_reason_security=sum(!is.na(securitydelay)) 122 | )][, year:=yr] 123 | #save this data 124 | saveData(flights.stats, paste(flights.folder.path, "stats/1/flights_stats_", yr, ".csv", sep="")) 125 | #clear up space 126 | rm(flights.stats) 127 | 128 | # 129 | # get flights stats By month for every year 130 | # 131 | if(verbose) cat("Getting flight stats by month: ", '\n') 132 | flights.stats.month <- flights[, list(airlines = length(unique(uniquecarrier)), 133 | flights=length(uniquecarrier), 134 | flights_cancelled=sum(cancelled, na.rm=T), 135 | flights_diverted=sum(diverted, na.rm=T), 136 | flights_departed_late=length(which(depdelay > 0)), 137 | flights_arrived_late=length(which(arrdelay > 0)), 138 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 139 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 140 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])), 141 | miles_traveled=sum(distance, na.rm=T) 142 | ), 143 | by=month][, year:=yr] 144 | #change col order 145 | setcolorder(flights.stats.month, c("year", colnames(flights.stats.month)[-ncol(flights.stats.month)])) 146 | #save this data 147 | saveData(flights.stats.month, paste(flights.folder.path, "stats/2/flights_stats_by_month_", yr, ".csv", sep="")) 148 | #clear up space 149 | rm(flights.stats.month) 150 | 151 | } 152 | 153 | mapFlightStats 
<- function(){ 154 | for(j in 1:period) { 155 | if( j > 2) { 156 | yr <- as.integer(gsub("[^0-9]", "", gsub("(.*)(\\.csv)", "\\1", flights.files[j]))) 157 | flights.data.file <- paste(flights.folder.path, flights.files[j], sep="") 158 | if(verbose) cat(yr, ": Reading : ", flights.data.file, "\n") 159 | #flights.data.file <- "./pads/raw-data/flights/1987.csv.gz" 160 | flights <- data.table(read.csv(flights.data.file, stringsAsFactors=F)) 161 | col.names <- colnames(flights) 162 | setnames(flights, col.names, tolower(col.names)) 163 | flights <- flights[, list(year, month, uniquecarrier, origin, 164 | dest, cancelled, diverted, depdelay, 165 | arrdelay, distance, carrierdelay, 166 | weatherdelay,securitydelay)] 167 | setkeyv(flights, c("year", "uniquecarrier", "dest", "origin", "month")) 168 | if(verbose) cat("Starting analysis on: ", yr, "\n") 169 | getFlightStatsForYear(flights, yr) 170 | getFlightsStatusByAirlines(flights, yr) 171 | getFlightsStatsByAirport(flights, yr) 172 | } 173 | } 174 | } 175 | 176 | 177 | 178 | # 179 | #reduce all results 180 | # 181 | reduceFlightStats <- function(){ 182 | n <- 1:6 183 | folder.path <- paste("./pads/raw-data/flights/stats/", n, "/", sep="") 184 | print(folder.path) 185 | for(i in n){ 186 | filenames <- paste(folder.path[i], list.files(path=folder.path[i], pattern="*.csv"), sep="") 187 | dt <- do.call("rbind", lapply(filenames, read.csv, stringsAsFactors=F)) 188 | print(nrow(dt)) 189 | saveData(dt, paste("./pads/raw-data/flights/stats/", i, ".csv", sep="")) 190 | } 191 | } 192 | 193 | # 194 | # Run this job - initialize, generate stats for individiual years and then aggregate them together 195 | # to get single file for fligths, airports and airlines 196 | # 197 | runJob <- function(){ 198 | initForStats() 199 | #mapFlightStats() 200 | #reduceFlightStats() 201 | } -------------------------------------------------------------------------------- /flights_map_reduce.R: -------------------------------------------------------------------------------- 1 | # Authors: Jitender Aswani, Co-Founder @ datadolph.in 2 | # Date: 2013-15-5 3 | # Description: The Airline data set consists of flight arrival and departure details for all commercial flights from 1987 to 2008. 4 | # The approximately 120MM records (CSV format) occupy 12GB space. Data can be downloaded from here: http://stat-computing.org/dataexpo/2009/ 5 | # This R code simulates Map-Reduce functionality to analyze 22 years of historical data on flights. 6 | # Packages Used: data.table & plyr 7 | # Blog Reference: http://blog.datadolph.in/2013/06/big-data-analysis-performance-story-of-chicago-ohare-airport/ 8 | # Blog Reference: http://allthingsr.blogspot.com/2013/06/simulating-map-reduce-in-r-for-big-data.html 9 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 10 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 11 | # All rights reserved. 
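#
# Prerequisites (sketch): the functions below rely on data.table (data.table(), setnames(),
# setcolorder(), setkeyv(), :=) and on saveData() from Util.R, and they expect the yearly
# .csv.gz files under ./raw-data/flights/ plus the stats/1 ... stats/6 output folders to exist.
# A typical run would then be:
#   require(data.table)
#   source("Util.R")
#   runJob()   # initForStats() -> mapFlightStats() -> reduceFlightStats()
#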
12 | 13 | # 14 | # initialize and read all 22 compressed CSV files 15 | # 16 | initForStats <- function(){ 17 | assign("flights.folder.path", "./raw-data/flights/", envir=.GlobalEnv) 18 | assign("verbose", T, envir=.GlobalEnv) 19 | flights.files <- list.files(path=flights.folder.path, pattern="*.csv.gz") 20 | assign("flights.files", flights.files, envir=.GlobalEnv) 21 | assign("period", length(flights.files), envir=.GlobalEnv) 22 | } 23 | 24 | # 25 | # get flights stats By airlines 26 | # 27 | getFlightsStatusByAirlines <- function(flights, yr){ 28 | # 29 | # by Year 30 | # 31 | if(verbose) cat("Getting stats for airlines:", '\n') 32 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)), 33 | flights=length(origin), 34 | flights_cancelled=sum(cancelled, na.rm=T), 35 | flights_diverted=sum(diverted, na.rm=T), 36 | flights_departed_late=length(which(depdelay > 0)), 37 | flights_arrived_late=length(which(arrdelay > 0)), 38 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 39 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 40 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])), 41 | miles_traveled=sum(distance, na.rm=T) 42 | ), 43 | by=uniquecarrier][, year:=yr] 44 | #change col order 45 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)])) 46 | #save this data 47 | saveData(airlines.stats, paste(flights.folder.path, "stats/5/airlines_stats_", yr, ".csv", sep="")) 48 | #clear up space 49 | rm(airlines.stats) 50 | 51 | # 52 | # by month 53 | # 54 | if(verbose) cat("Getting stats for airlines by month:", '\n') 55 | airlines.stats <- flights[, list(dep_airports=length(unique(origin)), 56 | flights=length(origin), 57 | flights_cancelled=sum(cancelled, na.rm=T), 58 | flights_diverted=sum(diverted, na.rm=T), 59 | flights_departed_late=length(which(depdelay > 0)), 60 | flights_arrived_late=length(which(arrdelay > 0)), 61 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 62 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 63 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])), 64 | miles_traveled=sum(distance, na.rm=T) 65 | ), 66 | by=list(uniquecarrier, month)][, year:=yr] 67 | #change col order 68 | setcolorder(airlines.stats, c("year", colnames(airlines.stats)[-ncol(airlines.stats)])) 69 | #save this data 70 | saveData(airlines.stats, paste(flights.folder.path, "stats/6/airlines_stats_monthly_", yr, ".csv", sep="")) 71 | #clear up space 72 | rm(airlines.stats) 73 | 74 | } 75 | 76 | # 77 | # get flights stats By airport 78 | # 79 | getFlightsStatsByAirport <- function(flights, yr){ 80 | # 81 | # by year 82 | # 83 | if(verbose) cat("Getting stats for airport:", '\n') 84 | airport.stats <- flights[, list(flights=length(uniquecarrier), 85 | flights_cancelled=sum(cancelled, na.rm=T), 86 | flights_departed_late=length(which(depdelay > 0)), 87 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 88 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 89 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])) 90 | ), 91 | by=origin][, year:=yr] 92 | #change col order 93 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)])) 94 | #save this data 95 | saveData(airport.stats, paste(flights.folder.path, "stats/3/airport_stats_", yr, ".csv", sep="")) 96 | #clear up space 97 | rm(airport.stats) 98 | 99 | # 100 | # by month 101 | # 102 | if(verbose) cat("Getting stats for airport by 
month:", '\n') 103 | airport.stats <- flights[, list(flights=length(uniquecarrier), 104 | flights_cancelled=sum(cancelled, na.rm=T), 105 | flights_departed_late=length(which(depdelay > 0)), 106 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 107 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 108 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])) 109 | ), by=list(origin, month)][, year:=yr] 110 | #change col order 111 | setcolorder(airport.stats, c("year", colnames(airport.stats)[-ncol(airport.stats)])) 112 | #save this data 113 | saveData(airport.stats, paste(flights.folder.path, "stats/4/airport_stats_monthly_", yr, ".csv", sep="")) 114 | #clear up space 115 | rm(airport.stats) 116 | } 117 | 118 | # 119 | # get flights stats 120 | # 121 | getFlightStatsForYear <- function(flights, yr){ 122 | # 123 | # for every year 124 | # 125 | if(verbose) cat("Getting flight stats: ", '\n') 126 | flights.stats <- flights[, list(airlines = length(unique(uniquecarrier)), 127 | flights=length(uniquecarrier), 128 | flights_cancelled=sum(cancelled, na.rm=T), 129 | flights_diverted=sum(diverted, na.rm=T), 130 | flights_departed_late=length(which(depdelay > 0)), 131 | flights_arrived_late=length(which(arrdelay > 0)), 132 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 133 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 134 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])), 135 | miles_traveled=sum(distance, na.rm=T), 136 | dep_airports=length(unique(origin)), 137 | arr_airports=length(unique(dest)), 138 | all_airports=length(union(unique(origin), unique(dest))), 139 | flights_delayed_reason_carrier=sum(!is.na(carrierdelay)), 140 | flights_delayed_reason_weather=sum(!is.na(weatherdelay)), 141 | flights_delayed_reason_security=sum(!is.na(securitydelay)) 142 | )][, year:=yr] 143 | #save this data 144 | saveData(flights.stats, paste(flights.folder.path, "stats/1/flights_stats_", yr, ".csv", sep="")) 145 | #clear up space 146 | rm(flights.stats) 147 | 148 | # 149 | # by month for every year 150 | # 151 | if(verbose) cat("Getting flight stats by month: ", '\n') 152 | flights.stats.month <- flights[, list(airlines = length(unique(uniquecarrier)), 153 | flights=length(uniquecarrier), 154 | flights_cancelled=sum(cancelled, na.rm=T), 155 | flights_diverted=sum(diverted, na.rm=T), 156 | flights_departed_late=length(which(depdelay > 0)), 157 | flights_arrived_late=length(which(arrdelay > 0)), 158 | total_dep_delay_in_mins=sum(depdelay[which(depdelay > 0)]), 159 | avg_dep_delay_in_mins=round(mean(depdelay[which(depdelay > 0)])), 160 | median_dep_delay_in_mins=round(median(depdelay[which(depdelay > 0)])), 161 | miles_traveled=sum(distance, na.rm=T) 162 | ), 163 | by=month][, year:=yr] 164 | #change col order 165 | setcolorder(flights.stats.month, c("year", colnames(flights.stats.month)[-ncol(flights.stats.month)])) 166 | #save this data 167 | saveData(flights.stats.month, paste(flights.folder.path, "stats/2/flights_stats_by_month_", yr, ".csv", sep="")) 168 | #clear up space 169 | rm(flights.stats.month) 170 | 171 | } 172 | 173 | # 174 | #map all calculations 175 | # 176 | mapFlightStats <- function(){ 177 | for(j in 1:period) { 178 | if( j > 2) { 179 | yr <- as.integer(gsub("[^0-9]", "", gsub("(.*)(\\.csv)", "\\1", flights.files[j]))) 180 | flights.data.file <- paste(flights.folder.path, flights.files[j], sep="") 181 | if(verbose) cat(yr, ": Reading : ", flights.data.file, "\n") 182 | flights <- 
data.table(read.csv(flights.data.file, stringsAsFactors=F)) 183 | col.names <- colnames(flights) 184 | setnames(flights, col.names, tolower(col.names)) 185 | flights <- flights[, list(year, month, uniquecarrier, origin, 186 | dest, cancelled, diverted, depdelay, 187 | arrdelay, distance, carrierdelay, 188 | weatherdelay,securitydelay)] 189 | setkeyv(flights, c("year", "uniquecarrier", "dest", "origin", "month")) 190 | if(verbose) cat("Starting analysis on: ", yr, "\n") 191 | getFlightStatsForYear(flights, yr) 192 | getFlightsStatusByAirlines(flights, yr) 193 | getFlightsStatsByAirport(flights, yr) 194 | } 195 | } 196 | } 197 | 198 | 199 | # 200 | #reduce all results 201 | # 202 | reduceFlightStats <- function(){ 203 | n <- 1:6 204 | folder.path <- paste("./raw-data/flights/stats/", n, "/", sep="") 205 | print(folder.path) 206 | for(i in n){ 207 | filenames <- paste(folder.path[i], list.files(path=folder.path[i], pattern="*.csv"), sep="") 208 | dt <- do.call("rbind", lapply(filenames, read.csv, stringsAsFactors=F)) 209 | print(nrow(dt)) 210 | saveData(dt, paste("./raw-data/flights/stats/", i, ".csv", sep="")) 211 | } 212 | } 213 | 214 | # 215 | # Run this job - initialize, generate stats for individual years and then aggregate them together 216 | # to get single file for flights, airports and airlines 217 | # 218 | runJob <- function(){ 219 | initForStats() 220 | mapFlightStats() 221 | reduceFlightStats() 222 | } -------------------------------------------------------------------------------- /Util.R: -------------------------------------------------------------------------------- 1 | # Utility Functions 2 | # Author: Jitender Aswani, Co-Founder @datadolph.in 3 | # Date: 3/15/2013 4 | # Copyright (c) 2011, under the Creative Commons Attribution-NonCommercial 3.0 Unported (CC BY-NC 3.0) License 5 | # For more information see: https://creativecommons.org/licenses/by-nc/3.0/ 6 | # All rights reserved.
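#
# Quick illustrations of a few helpers defined below (hypothetical inputs):
#   replaceMetaChars(c("Total Sales ($)", "Region/Area"))   # -> "Total_Sales_" "Region_Area"
#   removeMetaFromNumeric(c("$1,200.50", "45%"))            # -> 1200.5 45
#   tocamel("failed banks list")                            # -> "Failed Banks List"
#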
7 | 8 | 9 | 10 | # 11 | #open a log file with today's date 12 | # 13 | openLogfile <- function(){ 14 | file.log <- file( paste("logs/pads_status.log",dataset, Sys.Date(), sep="_"), 'w') 15 | #initialize log file 16 | assign("file.log", file.log, envir=.GlobalEnv)} 17 | 18 | # 19 | #close the log file 20 | # 21 | closeLogfile <- function(){ 22 | close(file.log) 23 | } 24 | 25 | # 26 | # log a message in a vector 27 | # 28 | logMessage <- function(mesg){ 29 | #tryCatch(cat(paste(Sys.Date(), mesg, sep=":"), file = file.log), 30 | # error=function(e){ 31 | # openLogfile() 32 | # cat(paste(Sys.Date(), mesg, sep=":"), file = file.log) 33 | # }, 34 | # finally={ 35 | # } 36 | #) 37 | #cat("\n", file = file.log) 38 | vec.log[length(vec.log)+1] <<- paste(Sys.Date(), mesg, sep=":") 39 | } 40 | 41 | # 42 | # dump logs to the file system 43 | # 44 | dumpLogs <- function(){ 45 | openLogfile() 46 | cat(paste(vec.log, collpase="\n"), file = file.log) 47 | closeLogfile() 48 | } 49 | 50 | # 51 | # Clean Meta files folder 52 | # 53 | cleanMetaFiles <- function(){ 54 | folder.meta <- "./pads/meta/" 55 | meta.files <- list.files(path=folder.meta, pattern="*.json") 56 | for(i in meta.files) { 57 | meta.file.name <- paste(folder.meta, i, sep="") 58 | #delete the file 59 | unlink(meta.file.name) 60 | } 61 | } 62 | 63 | # 64 | # Clean csv files folder 65 | # 66 | cleanDataFiles <- function(){ 67 | folder.data <- "./pads/data/" 68 | data.files <- list.files(path=folder.data, pattern="*.csv") 69 | for(i in data.files) { 70 | data.file.name <- paste(folder.data, i, sep="") 71 | #delete the file 72 | unlink(data.file.name) 73 | } 74 | } 75 | 76 | # 77 | # Clean cache files folder 78 | # 79 | cleanCacheFiles <- function(){ 80 | folder.cache <- "./pads/cache/" 81 | cache.files <- list.files(path=folder.cache, pattern="*.rdata") 82 | for(i in cache.files) { 83 | cache.file.name <- paste(folder.cache, i, sep="") 84 | #delete the file 85 | unlink(cache.file.name) 86 | } 87 | } 88 | 89 | # 90 | #read a csv file 91 | # 92 | readFile <- function(file.location) { 93 | out <- tryCatch( 94 | read.csv(file.location, as.is=TRUE, header=TRUE, stringsAsFactors=FALSE, strip.white=TRUE), 95 | error=function(e) { 96 | message(paste("Unable to read the file:", pad.location)) 97 | message("Error message:") 98 | message(e) 99 | # Choose a return value in case of error 100 | return(NA) 101 | }, 102 | finally={ 103 | #message(paste("Processed URL:", url)) 104 | #message("Some other message at the end") 105 | } 106 | ) 107 | return(out) 108 | } 109 | 110 | # 111 | #write a csv file 112 | # 113 | saveData <- function(data, file.location){ 114 | write.csv(data, file.location, row.names=F) 115 | } 116 | 117 | 118 | # 119 | # remove leading/trailing spaces from a data.frame 120 | # 121 | trimData <- function(data){ 122 | return(data.frame(lapply(data, function(x) gsub("(^ +)|( +$)", "", x)), stringsAsFactors=F)) 123 | } 124 | 125 | # 126 | # replace NA in every column with zeros 127 | # 128 | replaceNAWithZeros <- function(data) { 129 | data[is.na(data)] <- 0 # replace NA with 0s 130 | return(data) 131 | } 132 | 133 | # 134 | # replace NULL in every cell with zeros 135 | # 136 | replaceNULLWithZeros <- function(data) { 137 | data[is.null(data)] <- 0 # replace null with 0s 138 | return(data) 139 | } 140 | 141 | # 142 | # replace NULL in every cell with NA 143 | # 144 | replaceNULLWithNA <- function(data) { 145 | data[is.null(data)] <- NA # replace null with 0s 146 | return(data) 147 | } 148 | 149 | # 150 | #get row wise json 151 | # 152 | 
getRowWiseJson <- function (jsonDT) { 153 | require("RJSONIO") 154 | row.json <- apply(jsonDT, 1, toJSON) 155 | json.st <- paste('[', paste(row.json, collapse=', '), ']') 156 | return (json.st) 157 | } 158 | 159 | # 160 | #get unique ID for pads 161 | # 162 | getPadUID <- function(){ 163 | systime <- as.character(unclass(Sys.time())) 164 | return(paste("pad", substr(systime, 1,10), substr(systime, 12,16), sep="")) 165 | } 166 | 167 | # 168 | #get GUID 169 | # 170 | getGUID <- function() { 171 | baseuuid <- paste(sample(c(letters[1:6],0:9),30,replace=TRUE),collapse="") 172 | 173 | paste( 174 | substr(baseuuid,1,8), 175 | "-", 176 | substr(baseuuid,9,12), 177 | "-", 178 | "4", 179 | substr(baseuuid,13,15), 180 | "-", 181 | sample(c("8","9","a","b"),1), 182 | substr(baseuuid,16,18), 183 | "-", 184 | substr(baseuuid,19,30), 185 | sep="", 186 | collapse="" 187 | ) 188 | } 189 | 190 | # 191 | #get csv data file url 192 | # 193 | getDataFileURL <- function(id) { 194 | return(paste("./pads/data/",id, ".csv", sep="")) 195 | } 196 | 197 | # 198 | #get JSON data file url 199 | # 200 | getMetaDataFileURL <- function(id) { 201 | return(paste("./pads/meta/",id, ".json", sep="")) 202 | } 203 | 204 | # 205 | #get R.data file url 206 | # 207 | getCacheDataFileURL <- function(id) { 208 | return(paste("./pads/cache/",id, ".rdata", sep="")) 209 | } 210 | 211 | # 212 | # Save the JSON Meta data for a PAD 213 | # 214 | saveMetaData <- function(out.file, jsonSt){ 215 | file.out <- file(out.file, 'wt') 216 | cat(jsonSt, file=file.out, fill = TRUE) 217 | close(file.out) 218 | } 219 | 220 | # 221 | #replace meta chars from an array of col names 222 | # 223 | replaceMetaChars <- function(col.names) { 224 | return(gsub("[^A-Za-z0-9]+", "_", col.names)) 225 | } 226 | 227 | # 228 | #remove meta chars from an array of values 229 | # 230 | removeMetaChars <- function(values) { 231 | return(gsub("[^A-Za-z0-9 ]+", "", values)) 232 | } 233 | 234 | # 235 | #remove non-numeric chars from an array of values 236 | # 237 | removeNonNumericChars <- function(values) { 238 | return(gsub("[^0-9.]+", "", values, fixed=T)) 239 | } 240 | 241 | 242 | # 243 | #remove meta from numeric columns 244 | # 245 | removeMetaFromNumeric <- function(values) { 246 | #as.numeric( gsub("(,+)|(%+)|(\\$+)","",x) ) 247 | return(as.numeric(gsub("[^0-9.]","",values))) 248 | } 249 | 250 | # 251 | #camel case 252 | # 253 | tocamel <- function(s, strict = FALSE) { 254 | cap <- function(s) paste(toupper(substring(s,1,1)), 255 | {s <- substring(s,2); if(strict) tolower(s) else s}, 256 | sep = "", collapse = " " ) 257 | sapply(strsplit(s, split = " "), cap, USE.NAMES = !is.null(names(s))) 258 | } 259 | 260 | 261 | # 262 | # clean a name 263 | # 264 | cleanName <- function(name) { 265 | clean1 <-gsub("[,]*[ ]*[&]*[ ]+(the|inc|llc|llp|ltd|limited|lp|ll|affiliates|incorporated|incorporation|corp|co|corp|corporation|stores|international|n\\.a|u\\.s\\.a)*[\\.]*[,]*$", "", name, ignore.case = T) 266 | clean2 <- gsub("[,]*[ ]*[&]*[ ]+(the|inc|llc|llp|ltd|limited|lp|affiliates|incorporated|incorporation|corp|co|corp|corporation|stores|international|n\\.a|u\\.s\\.a)*[\\.]*[,]*$", "", clean1, ignore.case = T) 267 | clean3 <- gsub("[&]", "and", clean2) 268 | clean4 <- gsub("-", "", clean3) 269 | #clean5 <- gsub("[^a-zA-Z]", "_" , clean4) 270 | #clean5 <- gsub("(_)+", "_" , unique(clean4)) 271 | return (clean4) 272 | } 273 | 274 | # 275 | # Get Class function to return a single class for every object 276 | # 277 | getClass <- function(obj) { 278 | obj.class <- class(obj) 
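  # e.g. class() on a data.table is c("data.table", "data.frame") and on a POSIXct timestamp
  # it is c("POSIXct", "POSIXt"); returning the second element collapses such multi-class
  # objects to a single, more generic class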
279 | if(length(obj.class) > 1) 280 | return(obj.class[2]) 281 | else 282 | return(obj.class) 283 | } 284 | 285 | # 286 | # Generate solr doc from pad meta 287 | # 288 | generateSolrDocFromPADMeta <- function(meta.data) { 289 | generateSolrDoc(meta.data$id, meta.data$title, meta.data$category, 290 | meta.data$subcategory,meta.data$tags) 291 | } 292 | 293 | # 294 | # Generate solr doc and append it to a solr file for later processing 295 | # 296 | generateSolrDoc <- function(id, title, category, subcategory, tags) { 297 | file.out <- file("./pads/solr-docs.json", "r+") 298 | lines <- readLines(file.out) 299 | if(length(lines) != 0) 300 | solr.index.docs <- fromJSON(paste(lines, collapse="")) 301 | doc <- list("type"="pad", "id" = id, "title" = title, "category"=category, 302 | "subcategory"=subcategory, "tags"=tags) 303 | solr.index.docs[[length(solr.index.docs)+1]] <- doc 304 | cat(toJSON(solr.index.docs), file=file.out, fill = TRUE) 305 | close(file.out) 306 | } 307 | 308 | # 309 | # Save solr.index.doc to file system for later processing 310 | # 311 | persistSolrIndex <- function(solr.index.docs, source.name="") { 312 | file.out <- file(paste("./pads/solr-docs-", source.name, ".json", sep=""), "w") 313 | cat(toJSON(solr.index.docs), file=file.out, fill = TRUE) 314 | close(file.out) 315 | } 316 | 317 | # 318 | # Update multiple pads in a System PADS table 319 | # 320 | updateSystemPads <- function(file="./pads/QualifiedPADS.csv") { 321 | pads <- read.csv(file, stringsAsFactors=FALSE) 322 | for(i in 1:nrow(pads)) { 323 | print(paste("#", i, "Updating...", pads$id[i], sep=" ")) 324 | updateSystemPad(pads$id[i], pads$title[i], pads$desc[i], pads$category[i], 325 | pads$subcategory[i], pads$src[i], pads$tags[i]) 326 | 327 | ###### Update solr document 328 | generateSolrDoc(pads$id[i], pads$title[i], pads$category[i], pads$subcategory[i], 329 | pads$src[i], pads$tags[i]) 330 | } 331 | } 332 | 333 | # 334 | # get age 335 | # 336 | getAge <- function(dob, as.of.when, units){ 337 | switch(units, 338 | w = {# weeks 339 | return(difftime(strptime(as.of.when, format = "%Y/%m/%d"), 340 | strptime(dob, format = "%Y/%m/%d"),units="weeks")) 341 | }, 342 | m = { 343 | # months 344 | return((as.yearmon(strptime(as.of.when, format = "%Y/%m/%d"))- 345 | as.yearmon(strptime(dob, format = "%Y/%m/%d")))*12) 346 | }, 347 | y = { 348 | return(year(strptime(as.of.when, format = "%Y/%m/%d"))- 349 | year(strptime(dob, format = "%Y/%m/%d"))) 350 | }, 351 | ) 352 | } 353 | # 354 | # get time period for a give vector of dates 355 | # 356 | getTimePeriod <- function (date.vector){ 357 | yr <- year(date.vector) 358 | min.year <- min(yr, na.rm=T) 359 | max.year <- max(yr, na.rm=T) 360 | time.period <- paste(min.year, max.year, sep="-") 361 | return(time.period) 362 | } 363 | 364 | # 365 | # escpae meta chars 366 | # 367 | escapeMetaChars <- function(values){ 368 | return(gsub("[^A-Za-z0-9 ]+", "", values)) 369 | } 370 | 371 | # 372 | # remove special chars 373 | # 374 | removeSpecialChars <- function (node) { 375 | #val = gsub("[[:space:]]", "", xmlValue(node)) 376 | val = gsub("[\xc2\xa0]", "", xmlValue(node)) 377 | } 378 | 379 | # 380 | # set class num.with.dollar 381 | # 382 | setClass("num.with.dollar") 383 | setAs("character", "num.with.dollar", 384 | function(from) as.numeric(gsub("$","",from, fixed=TRUE))) 385 | 386 | # 387 | # set class num.with.commas 388 | # 389 | setClass("num.with.commas") 390 | setAs("character", "num.with.commas", 391 | function(from) as.numeric(gsub(",", "", from, fixed=TRUE))) 392 | 393 | 
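# Typical use of the coercion classes above (hypothetical file and column names): list them in
# read.csv()'s colClasses so dollar amounts and comma-grouped figures load directly as numerics:
#   read.csv("revenue.csv", colClasses=c(company="character", price="num.with.dollar",
#                                        volume="num.with.commas"))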
# 394 | # set class mix.dates 395 | # an example - 02009-09-25 September 25, 2009 396 | # 397 | setClass("mixed.dates") 398 | setAs("character", "mixed.dates", 399 | function(from) as.Date(gsub("^0", "", str_extract(from, "^[0-9-]+")), "%Y-%m-%d")) 400 | 401 | # 402 | # revenrse string 403 | # strReverse(c("abc", "Statistics")) 404 | # 405 | strReverse <- function(x) 406 | sapply(lapply(strsplit(x, NULL), rev), paste, collapse="") 407 | 408 | # 409 | # get older data 410 | # 411 | # x <- mdy(stats$effective_date) 412 | getOlderDate <- function(x, year=1930){ 413 | m <- year(x) %% 100 414 | year(x) <- ifelse(m > year %% 100, 1900+m, 2000+m) 415 | x 416 | } 417 | -------------------------------------------------------------------------------- /learning/TwitterMap.R: -------------------------------------------------------------------------------- 1 | ######################################################################################### 2 | # An R function to make a personalized map of people you follow and who follow you on twitter. 3 | # R functions Copyright (C) 2011 Jeff Leek (jtleek@gmail.com), and the Simply Statistics Blog 4 | # (http://simplystatistics.tumblr.com, http://twitter.com/simplystats) 5 | # 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details, see . 15 | # 16 | # 17 | # These functions depend on the packages: twitteR, maps, geosphere, and RColorBrewer. It will 18 | # attempt to install them if they are not installed when you source this function. Care 19 | # should be used when using this function since the twitteR API has rate limiting in place. 20 | # If you have a large number of followers, or run the function many times, you may be 21 | # rate limited. 22 | # 23 | # 24 | # How to use: 25 | # # Source the function 26 | # source("http://biostat.jhsph.edu/~jleek/code/twitterMap.R") 27 | # 28 | # # Make your twittermap 29 | # twitterMap("simplystats") 30 | # 31 | # #If your location can't be found or latitude longitude can't be calculated 32 | # #choose a bigger city near you. 
The list of cities used by twitterMap 33 | # #can be found like so: 34 | # data(world.cities) 35 | # grep("Baltimore",world.cities[,1]) 36 | # 37 | # # Then make the map using that big city 38 | # twitterMap("simplystats",userLocation="Baltimore") 39 | # 40 | # #If you want both your followers and people you follow in a plot you can do: 41 | # twitterMap("simplystats",plotType="both") 42 | # 43 | ######################################################################################## 44 | getPckg <- function(pckg) install.packages(pckg, repos = "http://cran.r-project.org") 45 | 46 | pckg = try(require(twitteR)) 47 | if(!pckg) { 48 | cat("Installing 'twitteR' from CRAN\n") 49 | getPckg("twitteR") 50 | require("twitteR") 51 | } 52 | 53 | pckg = try(require(maps)) 54 | if(!pckg) { 55 | cat("Installing 'maps' from CRAN\n") 56 | getPckg("maps") 57 | require("maps") 58 | } 59 | 60 | pckg = try(require(geosphere)) 61 | if(!pckg) { 62 | cat("Installing 'geosphere' from CRAN\n") 63 | getPckg("geosphere") 64 | require("geosphere") 65 | } 66 | 67 | 68 | pckg = try(require(RColorBrewer)) 69 | if(!pckg) { 70 | cat("Installing 'RColorBrewer' from CRAN\n") 71 | getPckg("RColorBrewer") 72 | require("RColorBrewer") 73 | } 74 | 75 | 76 | twitterMap <- function(userName,userLocation=NULL,fileName="twitterMap.pdf",nMax = 1000,plotType=c("followers","both","following")){ 77 | 78 | # Get location data 79 | cat("Getting data from Twitter, this may take a moment.\n") 80 | tmp = getUser(userName) 81 | if(is.null(userLocation)){ 82 | userLocation = location(tmp) 83 | userLocation = trim(userLocation) 84 | if(nchar(userLocation) < 2){stop("We can not find your location from Twitter")} 85 | } 86 | 87 | followers=tmp$getFollowers(n=nMax) 88 | followersLocation = sapply(followers,function(x){location(x)}) 89 | following = tmp$getFriends(n=nMax) 90 | followingLocation = sapply(following,function(x){location(x)}) 91 | 92 | 93 | # Load the geographic data 94 | data(world.cities) 95 | data(us.cities) 96 | data(canada.cities) 97 | 98 | # Find the latitude and longitude of the user 99 | cat("Getting geographic (latitude/longitude) of Twitter users.\n") 100 | userLL <- findLatLon(userLocation)$latlon 101 | if(any(is.na(userLL))){stop("We can't find the latitude and longitude of your location from Twitter")} 102 | 103 | 104 | # Find the latitude and longitude of each of the followers/following 105 | # and calcualte the distance to the user 106 | 107 | followersLL = matrix(NA,nrow=length(followers),ncol=4) 108 | followingLL = matrix(NA,nrow=length(following),ncol=4) 109 | 110 | for(i in 1:length(followers)){ 111 | if(length(followersLocation[[i]]) > 0){ 112 | tmpLL = findLatLon(trim(followersLocation[[i]])) 113 | if(any(!is.na(tmpLL$latlon))){ 114 | followersLL[i,] = c(unlist(tmpLL$latlon),distCosine(userLL,tmpLL$latlon),unlist(tmpLL$cont)) 115 | } 116 | } 117 | } 118 | 119 | for(i in 1:length(following)){ 120 | if(length(followingLocation[[i]]) > 0){ 121 | tmpLL = findLatLon(trim(followingLocation[[i]])) 122 | if(any(!is.na(tmpLL$latlon))){ 123 | followingLL[i,] = c(unlist(tmpLL$latlon),distCosine(userLL,tmpLL$latlon),unlist(tmpLL$cont)) 124 | } 125 | } 126 | } 127 | 128 | followingLL = followingLL[order(-followingLL[,3]),] 129 | followersLL = followersLL[order(-followersLL[,3]),] 130 | 131 | followingLL = followingLL[!is.na(followingLL[,1]),] 132 | followersLL = followersLL[!is.na(followersLL[,1]),] 133 | 134 | 135 | cat("Plotting results.\n") 136 | # Set up the colors 137 | cols = brewer.pal(7,"Set2") 138 | 139 | # Both 
followers and following 140 | if(plotType=="both"){ 141 | pdf(fileName,height=12,width=10) 142 | data(worldMapEnv) 143 | par(mfrow=c(2,1),mar=rep(0,4)) 144 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0) 145 | 146 | mtext(paste("@",userName," Follower Map",sep=""),col="lightgrey") 147 | nFollowers = dim(followersLL)[1] 148 | for(i in 1:nFollowers){ 149 | greatC = getGreatCircle(userLL,followersLL[i,1:2]) 150 | lines(greatC,col=cols[followersLL[i,4]],lwd=0.8) 151 | } 152 | 153 | legend(-180,0,legend = c(paste("Asia",sum(followersLL[,4]==1)),paste("Africa",sum(followersLL[,4]==2)),paste("N. America",sum(followersLL[,4]==3)),paste("S. America",sum(followersLL[,4]==4)),paste("Australia/N.Z.",sum(followersLL[,4]==5)),paste("Europe",sum(followersLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75) 154 | 155 | 156 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0) 157 | mtext(paste("@",userName," Following Map",sep=""),col="lightgrey") 158 | nFollowing = dim(followingLL)[1] 159 | for(i in 1:nFollowing){ 160 | greatC = getGreatCircle(userLL,followingLL[i,1:2]) 161 | lines(greatC,col=cols[followingLL[i,4]],lwd=0.8) 162 | } 163 | 164 | legend(-180,0,legend = c(paste("Asia",sum(followingLL[,4]==1)),paste("Africa",sum(followingLL[,4]==2)),paste("N. America",sum(followingLL[,4]==3)),paste("S. America",sum(followingLL[,4]==4)),paste("Australia/N.Z.",sum(followingLL[,4]==5)),paste("Europe",sum(followingLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75) 165 | 166 | mtext("Created by @simplystats twitterMap",side=1,adj=1,cex=0.8,col="grey") 167 | dev.off() 168 | } 169 | 170 | ## Just followers 171 | if(plotType=="followers"){ 172 | pdf(fileName,height=6,width=10) 173 | data(worldMapEnv) 174 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0) 175 | 176 | mtext(paste("@",userName," Follower Map",sep=""),col="lightgrey") 177 | nFollowers = dim(followersLL)[1] 178 | for(i in 1:nFollowers){ 179 | greatC = getGreatCircle(userLL,followersLL[i,1:2]) 180 | lines(greatC,col=cols[followersLL[i,4]],lwd=0.8) 181 | } 182 | 183 | legend(-180,0,legend = c(paste("Asia",sum(followersLL[,4]==1)),paste("Africa",sum(followersLL[,4]==2)),paste("N. America",sum(followersLL[,4]==3)),paste("S. America",sum(followersLL[,4]==4)),paste("Australia/N.Z.",sum(followersLL[,4]==5)),paste("Europe",sum(followersLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75) 184 | mtext("Created by @simplystats twitterMap",side=1,adj=1,cex=0.8,col="grey") 185 | dev.off() 186 | 187 | } 188 | 189 | ## Just following 190 | if(plotType=="following"){ 191 | pdf(fileName,height=6,width=10) 192 | data(worldMapEnv) 193 | map('world',col="#191919",bg="black",fill=T,mar=rep(0,4),border=0) 194 | mtext(paste("@",userName," Following Map",sep=""),col="lightgrey") 195 | nFollowing = dim(followingLL)[1] 196 | for(i in 1:nFollowing){ 197 | greatC = getGreatCircle(userLL,followingLL[i,1:2]) 198 | lines(greatC,col=cols[followingLL[i,4]],lwd=0.8) 199 | } 200 | 201 | legend(-180,0,legend = c(paste("Asia",sum(followingLL[,4]==1)),paste("Africa",sum(followingLL[,4]==2)),paste("N. America",sum(followingLL[,4]==3)),paste("S. 
America",sum(followingLL[,4]==4)),paste("Australia/N.Z.",sum(followingLL[,4]==5)),paste("Europe",sum(followingLL[,4]==6))),text.col=cols[1:6],bg="black",cex=0.75) 202 | 203 | mtext("Created by @simplystats twitterMap",side=1,adj=1,cex=0.8,col="grey") 204 | dev.off() 205 | 206 | } 207 | 208 | } 209 | 210 | 211 | findLatLon <- function(loc){ 212 | latlon = NA 213 | cont = NA 214 | 215 | # Asia = 1, Africa = 2, North America = 3, South America = 4, Australia/New Zealand = 5, Europe = 6 216 | continents = matrix(NA,nrow=length(unique(world.cities[,2])),ncol=2) 217 | continents[,1] = unique(world.cities[,2]) 218 | continents[1:10,2] = c(1,1,1,2,1,1,1,1,1,1) 219 | continents[11:20,2]= c(1,1,2,1,1,2,1,2,2,2) 220 | continents[21:30,2] = c(2,1,6,6,6,6,6,6,6,6) 221 | continents[31:40,2] = c(6,6,6,6,2,4,4,1,2,1) 222 | continents[41:50,2] = c(4,6,1,4,6,1,3,1,6,6) 223 | continents[51:60,2] = c(3,2,4,2,6,1,6,1,3,2) 224 | continents[61:70,2] = c(1,2,2,2,3,6,3,3,6,6) 225 | continents[71:80,2] = c(1,1,2,6,3,4,3,4,6,1) 226 | continents[81:90,2] = c(3,3,3,2,2,6,6,6,6,4) 227 | continents[91:100,2] = c(2,5,2,2,3,1,1,1,1,1) 228 | continents[101:110,2] = c(1,2,1,1,1,3,2,5,1,6) 229 | continents[111:120,2] = c(1,6,1,1,2,6,1,1,6,2) 230 | continents[121:130,2] = c(6,6,6,1,1,3,4,3,4,2) 231 | continents[131:140,2] = c(6,6,2,2,1,1,1,4,1,1) 232 | continents[141:150,2] = c(1,2,2,1,1,1,4,6,6,2) 233 | continents[151:160,2] = c(4,1,1,1,1,2,4,6,2,2) 234 | continents[161:170,2] = c(1,2,2,1,6,2,1,1,6,1) 235 | continents[171:180,2] = c(1,1,1,2,6,2,2,6,1,1) 236 | continents[181:190,2] = c(2,6,2,1,6,6,3,3,3,3) 237 | continents[191:200,2] = c(2,2,2,2,3,2,3,2,3,1) 238 | continents[201:210,2] = c(3,2,2,2,2,2,2,1,6,2) 239 | continents[211:220,2] = c(1,3,1,6,2,4,3,6,3,4) 240 | continents[221:230,2] = c(1,1,1,3,2,3,3,6,1,6) 241 | continents[231:232,2] = c(2,1) 242 | 243 | 244 | # Get the first element of the location 245 | # firstElement = strsplit(loc,"[^[:alnum:]]")[[1]][1] 246 | firstElement = strsplit(loc,",")[[1]][1] 247 | if(is.na(firstElement)){firstElement="zzzzzzzzz"} 248 | 249 | # See if it is a city 250 | tmp = grep(firstElement,world.cities[,1],fixed=TRUE) 251 | tmp2 = grep(firstElement,state.name,fixed=TRUE) 252 | tmp3 = grep(firstElement,world.cities[,2],fixed=TRUE) 253 | 254 | if(length(tmp) == 1){ 255 | latlon = world.cities[tmp,c(5,4)] 256 | cont = continents[which(world.cities[tmp,2]==continents[,1]),2] 257 | }else if(length(tmp) > 1){ 258 | tmpCities = world.cities[tmp,] 259 | latlon = tmpCities[which.max(tmpCities$pop),c(5,4)] 260 | cont = continents[which(tmpCities[which.max(tmpCities$pop),2]==continents[,1]),2] 261 | }else if(length(tmp2) == 1){ 262 | latlon = c(state.center$x[tmp2],state.center$y[tmp2]) 263 | cont = 3 264 | }else if(length(tmp3) > 0){ 265 | tmpCities = world.cities[tmp3,] 266 | latlon = tmpCities[which.max(tmpCities$pop),c(5,4)] 267 | cont = continents[which(tmpCities[which.max(tmpCities$pop),2]==continents[,1]),2] 268 | } 269 | 270 | return(list(latlon=latlon,cont=as.numeric(cont))) 271 | 272 | } 273 | 274 | 275 | getGreatCircle = function(userLL,relationLL){ 276 | tmpCircle = greatCircle(userLL,relationLL) 277 | start = which.min(abs(tmpCircle[,1] - userLL[1,1])) 278 | end = which.min(abs(tmpCircle[,1] - relationLL[1])) 279 | greatC = tmpCircle[start:end,] 280 | return(greatC) 281 | } 282 | 283 | trim <- function (x) gsub("^\\s+|\\s+$", "", x) --------------------------------------------------------------------------------