# Repository layout:
# ├── DataScienceBook.txt
# ├── MappingScripts.R
# ├── MyMode.R
# ├── README.md
# └── twitterSupport.R
#
# /MappingScripts.R:
# --------------------------------------------------------------------------------
# Mapping scripts

# EnsurePackage(x) - Installs and loads a package if necessary
# x - package name (coerced to a character string)
# The package is attached with require(); if that fails it is installed
# from CRAN and require() is attempted once more.
EnsurePackage <- function(x) {
  x <- as.character(x)
  if (!require(x, character.only = TRUE)) {
    install.packages(pkgs = x, repos = "http://cran.r-project.org")
    require(x, character.only = TRUE)
  }
}


# MakeGeoURL(address) - Format a URL for the Google Geocode API
# address - free-text address
# Returns - the request URL as a character string
MakeGeoURL <- function(address) {
  root <- "http://maps.google.com/maps/api/geocode/"

  # paste() turns the input into text the same way the original string
  # assembly did (e.g. NA becomes "NA" instead of raising an error).
  addressText <- paste(address)

  # Encode only the address, including reserved characters. Encoding the
  # assembled URL leaves "&", "#" and "+" untouched, so an address that
  # contains one of them would spill into the rest of the query string.
  encodedAddress <- URLencode(addressText, reserved = TRUE)

  paste0(root, "json?address=", encodedAddress, "&sensor=false")
}

# Addr2latlng(address) - Look up the latitude and longitude of an address
# address - free-text address, passed to MakeGeoURL()
# Returns - c(lat, lng) taken from the first result of the response;
#           a coordinate that is absent from the response is NA, so the
#           result always has two elements
# NOTE(review): getURL() and fromJSON() are not defined in this file;
# presumably they come from RCurl and RJSONIO and must already be loaded.
Addr2latlng <- function(address) {
  url <- MakeGeoURL(address)
  apiResult <- getURL(url)
  geoStruct <- fromJSON(apiResult, simplify = FALSE)

  # Indexing an empty result list raises an error and a missing field
  # yields NULL; both cases are mapped to NA.
  pickCoordinate <- function(field) {
    value <- tryCatch(
      geoStruct$results[[1]]$geometry$location[[field]],
      error = function(e) NULL
    )
    if (is.null(value)) NA else value
  }

  c(pickCoordinate("lat"), pickCoordinate("lng"))
}

# ProcessAddrList(addrList) - Process a whole list of addresses
# addrList - the addresses to look up, one Addr2latlng() call each
# Returns - a data frame with one row per address and the columns
#           atext (the address), X (longitude), Y (latitude) and
#           EID (running number starting at 1)
ProcessAddrList <- function(addrList) {
  # Rows are collected in a preallocated list and bound once at the end,
  # instead of growing the data frame with rbind() on every iteration.
  rows <- vector("list", length(addrList))
  i <- 0

  for (addr in addrList) {
    i <- i + 1
    latlng <- Addr2latlng(addr)
    rows[[i]] <- data.frame(atext = addr, X = latlng[[2]], Y = latlng[[1]],
                            EID = i)
  }

  if (i == 0) {
    # No addresses: return the empty, typed frame
    return(data.frame(atext = character(), X = numeric(), Y = numeric(),
                      EID = numeric()))
  }

  do.call(rbind, rows)
}
# --------------------------------------------------------------------------------
# /MyMode.R:
# --------------------------------------------------------------------------------
# MyMode(myVector) - Return the most frequent value of a vector
# myVector - the vector to examine
# Returns - the value that occurs most often; when several values share
#           the highest count, the one that appears first in myVector
MyMode <- function(myVector) {
  uniqueValues <- unique(myVector)

  # match() maps every element to its position in uniqueValues and
  # tabulate() counts how often each position occurs
  uniqueCounts <- tabulate(match(myVector, uniqueValues))

  uniqueValues[which.max(uniqueCounts)]
}
# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# data-science-r
# ==============
#
# R code and documentation for "Introduction to Data Science" by Jeffrey Stanton
# --------------------------------------------------------------------------------
# /twitterSupport.R:
# --------------------------------------------------------------------------------
# EnsurePackage(x) - Installs and loads a package if necessary
# x - package name (coerced to a character string)
# The package is attached with require(); if that fails it is installed
# from CRAN and require() is attempted once more.
EnsurePackage <- function(x) {
  x <- as.character(x)
  if (!require(x, character.only = TRUE)) {
    install.packages(pkgs = x, repos = "http://cran.r-project.org")
    require(x, character.only = TRUE)
  }
}

# PrepareTwitter() - Load packages for working with twitteR
PrepareTwitter <- function() {
  EnsurePackage("bitops")
  EnsurePackage("RCurl")
  EnsurePackage("RJSONIO")
  EnsurePackage("twitteR")
}

# TweetFrame() - Return a dataframe based on a search of Twitter
# searchTerm - the search term handed to searchTwitter()
# maxTweets - passed to searchTwitter() as n
# Returns - a data frame with one row per tweet, ordered by the created
#           column; an empty data frame when the search returns nothing
# NOTE(review): searchTwitter() is not defined in this file; presumably
# it comes from the twitteR package loaded by PrepareTwitter().
TweetFrame <- function(searchTerm, maxTweets) {
  tweetList <- searchTwitter(searchTerm, n = maxTweets)

  # With no tweets rbind() would return NULL and the row indexing below
  # would fail, so return an empty frame instead.
  if (length(tweetList) == 0) {
    return(data.frame())
  }

  # as.data.frame() coerces each list element into a row
  # lapply() applies this to all of the elements in tweetList
  # rbind() takes all of the rows and puts them together
  # do.call() gives rbind() all the rows as individual elements
  tweetDF <- do.call("rbind", lapply(tweetList, as.data.frame))

  # This last step sorts the tweets in arrival order
  tweetDF[order(as.integer(tweetDF$created)), ]
}

# CleanTweets() - Takes the junk out of a vector of tweet texts
# tweets - character vector of tweet texts
# Returns - a character vector of the same length with redundant spaces,
#           t.co links, the retweet header, hashtags and user mentions
#           removed
# Only base R functions are used, so no extra package has to be loaded.
CleanTweets <- function(tweets) {
  # Remove redundant spaces: collapse every run of spaces into one
  tweets <- gsub(" {2,}", " ", tweets)
  # Get rid of URLs (http or https, any length of link id)
  tweets <- gsub("https?://t\\.co/[A-Za-z0-9]+", "", tweets)
  # Take out retweet header, there is only one
  tweets <- sub("RT @[A-Za-z0-9_]*: ", "", tweets)
  # Hashtags and user mentions; commas are not part of either, so they
  # are no longer swallowed together with the tag or handle
  tweets <- gsub("#[A-Za-z0-9_]*", "", tweets)
  tweets <- gsub("@[A-Za-z0-9_]*", "", tweets)
  tweets
}

# ArrivalProbability - Given a list of arrival times
# calculates the delays between them with lagged differences
# then computes a list of cumulative probabilties of arrival
# for a list of time increments
# times - A sorted, ascending list of arrival times in POSIXct
# increment - the time increment for each new probability
# max - the highest time increment
#
# Returns - an ordered list of probabilities in a numeric vector
# suitable for plotting with plot()
#
# NOTE(review): the rest of this file is truncated and garbled in this
# view (the source numbering jumps from line 76 to line 96 and the text
# ends inside a loop body), so everything below is reproduced untouched.
ArrivalProbability<-function(times, increment, max)
{
  # Initialize an empty vector
  plist <- NULL

  # Probability is defined over the size of this sample
  # of arrival times
  timeLen <- length(times)

  # May not be necessary, but checks for input mistake
  if (increment>max) {return(NULL)}

  for (i in seq(increment, max, by=increment))
  {
    # diff() requires a sorted list of times
    # diff() calculates the delays between neighboring times
    # the logical test max) {return(NULL)}

  for (i in seq(increment, max, by=increment))
  {
    # the logical test