├── DataScienceBook.txt
├── MappingScripts.R
├── MyMode.R
├── README.md
└── twitterSupport.R


/MappingScripts.R:
--------------------------------------------------------------------------------
 1 | # Mapping scripts
 2 | # EnsurePackage(x) - Installs and loads a package if necessary
 3 | EnsurePackage<-function(x)
 4 | {
 5 |   x <- as.character(x)
 6 |   if (!require(x,character.only=TRUE))
 7 |   {
 8 |     install.packages(pkgs=x,repos="http://cran.r-project.org")
 9 |     require(x,character.only=TRUE)
10 |   }
11 | }
12 | 
13 | 
# MakeGeoURL(address) - Format a URL for the Google Geocode API
# address - free-form address text
#
# Returns - the request URL as a string, with the address percent-encoded
MakeGeoURL <- function(address) {
  root <- "http://maps.google.com/maps/api/geocode/"

  # Encode only the address, reserved characters included: an "&", "#" or
  # "+" inside the address would otherwise be read as URL syntax and
  # truncate or corrupt the query string.
  encodedAddress <- URLencode(as.character(address), reserved = TRUE)

  paste0(root, "json?address=", encodedAddress, "&sensor=false")
}
24 | 
# Addr2latlng(address) - Look up the coordinates of an address
# address - free-form address text, passed on to MakeGeoURL()
#
# Returns - c(lat, lng) taken from the first geocoding result; a
# coordinate that cannot be found in the response is returned as NA
#
# NOTE(review): getURL() and fromJSON() must already be attached,
# presumably from RCurl and RJSONIO - confirm against the calling script.
Addr2latlng <- function(address) {
  url <- MakeGeoURL(address)
  apiResult <- getURL(url)
  geoStruct <- fromJSON(apiResult, simplify = FALSE)

  # Extract one coordinate of the first result. An empty result list makes
  # the [[1]] lookup fail, and an absent field comes back as NULL; both
  # are mapped to NA so that c() below keeps one slot per coordinate
  # instead of silently dropping the missing one.
  firstCoordinate <- function(field) {
    value <- tryCatch(
      geoStruct$results[[1]]$geometry$location[[field]],
      error = function(e) NULL
    )
    if (length(value) == 0) NA else value
  }

  c(firstCoordinate("lat"), firstCoordinate("lng"))
}
42 | 
# ProcessAddrList(addrList) - Geocode a whole list of addresses
# addrList - vector or list of address strings
#
# Returns - a data frame with one row per address, in input order:
#   atext - the address text
#   X     - second element returned by Addr2latlng() (longitude)
#   Y     - first element returned by Addr2latlng() (latitude)
#   EID   - running row number, starting at 1
ProcessAddrList <- function(addrList) {
  emptyDF <- data.frame(
    atext = character(), X = numeric(), Y = numeric(), EID = numeric()
  )

  # for() iterates a factor as character strings; keep that behaviour now
  # that elements are reached by index.
  if (is.factor(addrList)) {
    addrList <- as.character(addrList)
  }

  if (length(addrList) == 0) {
    return(emptyDF)
  }

  rows <- lapply(seq_along(addrList), function(i) {
    addr <- addrList[[i]]
    latlng <- Addr2latlng(addr)

    # A lookup that yields fewer than two values cannot be split into
    # latitude and longitude, so record the address with unknown position
    # rather than failing the whole batch.
    if (length(latlng) < 2) {
      latlng <- c(NA, NA)
    }

    data.frame(
      atext = addr, X = latlng[[2]], Y = latlng[[1]], EID = as.numeric(i)
    )
  })

  # Bind all rows in a single call; growing the data frame with rbind()
  # inside the loop copies every earlier row on each iteration.
  do.call(rbind, c(list(emptyDF), rows))
}


--------------------------------------------------------------------------------
/MyMode.R:
--------------------------------------------------------------------------------
# MyMode(myVector) - Most frequently occurring value in a vector
# myVector - the vector to examine
#
# Returns - the value with the highest count; when several values share
# the highest count, the one that appears first in myVector
MyMode <- function(myVector) {
  candidates <- unique(myVector)

  # Position of each element's value within candidates, then a count of
  # how often each position occurs.
  slots <- match(myVector, candidates)
  tallies <- tabulate(slots)

  winner <- which.max(tallies)
  candidates[winner]
}


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | data-science-r
2 | ==============
3 | 
4 | R code and documentation for "Introduction to Data Science" by Jeffrey Stanton


--------------------------------------------------------------------------------
/twitterSupport.R:
--------------------------------------------------------------------------------
  1 | # EnsurePackage(x) - Installs and loads a package if necessary
  2 | EnsurePackage<-function(x)
  3 | {
  4 |   x <- as.character(x)
  5 |   if (!require(x,character.only=TRUE))
  6 |   {
  7 |     install.packages(pkgs=x,repos="http://cran.r-project.org")
  8 |     require(x,character.only=TRUE)
  9 |   }
 10 | }
 11 | 
 12 | # PrepareTwitter() - Load packages for working with twitteR
 13 | PrepareTwitter<-function()
 14 | {
 15 |   EnsurePackage("bitops")
 16 |   EnsurePackage("RCurl")
 17 |   EnsurePackage("RJSONIO")
 18 |   EnsurePackage("twitteR")
 19 | }
 20 | 
 21 | # TweetFrame() - Return a dataframe based on a search of Twitter
 22 | TweetFrame<-function(searchTerm, maxTweets)
 23 | {
 24 |   tweetList <- searchTwitter(searchTerm, n=maxTweets)
 25 |   
 26 |   # as.data.frame() coerces each list element into a row
 27 |   # lapply() applies this to all of the elements in twtList
 28 |   # rbind() takes all of the rows and puts them together
 29 |   # do.call() gives rbind() all the rows as individual elements
 30 |   tweetDF <- do.call("rbind", lapply(tweetList,as.data.frame))
 31 |   
 32 |   # This last step sorts the tweets in arrival order
 33 |   return(tweetDF[order(as.integer(tweetDF$created)), ])
 34 | }
 35 | 
 36 | # CleanTweets() - Takes the junk out of a vector of tweet texts
 37 | CleanTweets<-function(tweets)
 38 | {
 39 |   # Remove redundant spaces
 40 |   tweets <- str_replace_all(tweets,"  "," ")
 41 |   # Get rid of URLs
 42 |   tweets <- str_replace_all(tweets, "http://t.co/[a-z,A-Z,0-9]{8}","")
 43 |   # Take out retweet header, there is only one
 44 |   tweets <- str_replace(tweets,"RT @[a-z,A-Z]*: ","")
 45 |   tweets <- str_replace_all(tweets,"#[a-z,A-Z]*","")
 46 |   tweets <- str_replace_all(tweets,"@[a-z,A-Z]*","")
 47 |   return(tweets)
 48 | }
 49 | 
 50 | # ArrivalProbability - Given a list of arrival times
 51 | # calculates the delays between them with lagged differences
 52 | # then computes a list of cumulative probabilties of arrival
 53 | # for a list of time increments
 54 | # times - A sorted, ascending list of arrival times in POSIXct
 55 | # increment - the time increment for each new probability
 56 | # max - the highest time increment
 57 | #
 58 | # Returns - an ordered list of probabilities in a numeric vector
 59 | # suitable for plotting with plot()
 60 | ArrivalProbability<-function(times, increment, max)
 61 | {
 62 |   # Initialize an empty vector
 63 |   plist <- NULL
 64 |   
 65 |   # Probability is defined over the size of this sample
 66 |   # of arrival times
 67 |   timeLen <- length(times)
 68 |   
 69 |   # May not be necessary, but checks for input mistake
 70 |   if (increment>max) {return(NULL)}
 71 |   
 72 |   for (i in seq(increment, max, by=increment))
 73 |   {
 74 |     # diff() requires a sorted list of times
 75 |     # diff() calculates the delays between neighboring times
 76 |     # the logical test <i provides a list of TRUEs and FALSEs
 77 |     # of length = timeLen, then sum() counts the TRUEs
 78 |     plist<-c(plist,(sum(as.integer(diff(times))<i))/timeLen)
 79 |   }
 80 |   return(plist)
 81 | }
 82 | 
 83 | # Like ArrivalProbability, but works with an unsorted list
 84 | # of delay times
 85 | DelayProbability<-function(delays, increment, max)
 86 | {
 87 |   # Initialize an empty vector
 88 |   plist <- NULL
 89 |   
 90 |   # Probability is defined over the size of this sample
 91 |   # of arrival times
 92 |   delayLen <- length(delays)
 93 |   
 94 |   # May not be necessary, but checks for input mistake
 95 |   if (increment>max) {return(NULL)}
 96 |   
 97 |   for (i in seq(increment, max, by=increment))
 98 |   {
 99 |     # the logical test <i provides a list of TRUEs and FALSEs
100 |     # of length = timeLen, then sum() counts the TRUEs
101 |     plist<-c(plist,(sum(delays<=i)/delayLen))
102 |   }
103 |   return(plist)
104 | }
105 | 
# Compare tweets - Run poisson.test() on rate ratio for two tweet streams
# search1 - the first hashtag or search term to look for
# search2 - the second search term or hashtag to look for
# numEvents - the number of events to sample for each search
#
# Returns - the result of poisson.test() comparing, for each stream, the
# number of delays no longer than the first stream's rounded mean delay
CompareTweets <- function(search1, search2, numEvents) {
  # Delays between consecutive tweets for one search term.
  # Converting created to numeric before diff() keeps both streams in the
  # same unit: diff() on date-times picks secs, mins or hours from the
  # data, so the two streams could otherwise be measured differently and
  # then compared against the same threshold.
  # NOTE(review): assumes created is a date-time column (seconds once
  # converted to numeric) - confirm against what TweetFrame() returns.
  tweetDelays <- function(searchTerm) {
    tweetDF <- TweetFrame(searchTerm, numEvents)
    arrivalOrder <- order(as.integer(tweetDF$created))
    arrivalTimes <- tweetDF$created[arrivalOrder]
    as.integer(diff(as.numeric(arrivalTimes)))
  }

  eventDelays1 <- tweetDelays(search1)
  meanDelays1 <- round(mean(eventDelays1))

  eventDelays2 <- tweetDelays(search2)

  # Both streams are judged against the first stream's mean delay
  eventCount1 <- sum(eventDelays1 <= meanDelays1)
  eventCount2 <- sum(eventDelays2 <= meanDelays1)

  poisson.test(c(eventCount1, eventCount2), c(numEvents, numEvents))
}


--------------------------------------------------------------------------------