├── 01 - Hello, Data.R
├── 02 - Getting Data from the Web.R
├── 03 - Filtering and Summarizing Data.R
├── 04 - Restructuring Data.R
├── 05 - Building Models.R
├── 06 - Beyond the Linear Trend Line.R
├── 07 - Unstructured Data.R
├── 08 - Polishing Data.R
├── 09 - From Big to Smaller Data.R
├── 10 - Classification and Clustering.R
├── 11 - Social Network Analysis of the R ecosystem.R
├── 12 - Analysing Time-series.R
├── 13 - Data Around Us.R
├── 14 - Analyzing the R Community.R
├── README.md
└── data
    └── SMART_2013.RData

/01 - Hello, Data.R:
--------------------------------------------------------------------------------

## Extracted code chunks from
##
## Gergely Daróczi (2015): Mastering Data Analysis with R.
##
## Chapter #1: Hello, Data! pp. 1-36.
##
##
## This file includes the code chunks from the above-mentioned
## chapter except for the leading ">" and "+" characters, which
## stand for the prompt in the R console. The prompt was
## intentionally removed here, along with arbitrary line breaks,
## so that you can copy and paste the R expressions into the
## R console in a more convenient and seamless way.
##
## Code chunks are grouped here by the printed pages of the book.
## Two hash signs at the beginning of a line stand for a page
## break, while an extra empty line between the code chunks
## represents one or more paragraphs of the original book between
## the examples, for easier navigation.
##
## Sometimes extra instructions, starting with a double hash, are
## also provided on how to run the expressions below.
##
##
## Find more information on the book at http://bit.ly/mastering-R,
## contact me on Twitter or GitHub via the @daroczig handle,
## or e-mail me at daroczig@rapporter.net
##

library('hflights')
write.csv(hflights, 'hflights.csv', row.names = FALSE)

##

str(hflights)

##

system.time(read.csv('hflights.csv'))

colClasses <- sapply(hflights, class)
system.time(read.csv('hflights.csv', colClasses = colClasses))

library(microbenchmark)
f <- function() read.csv('hflights.csv')
g <- function() read.csv('hflights.csv', colClasses = colClasses, nrows = 227496, comment.char = '')
res <- microbenchmark(f(), g())
res

## p. 3 infobox
library(R.utils)
countLines('hflights.csv')

##

boxplot(res, xlab = '', main = expression(paste('Benchmarking ', italic('read.table'))))

library(sqldf)
system.time(read.csv.sql('hflights.csv'))

##

library(ff)
system.time(read.csv.ffdf(file = 'hflights.csv'))

library(bigmemory)
system.time(read.big.matrix('hflights.csv', header = TRUE))

##

library(data.table)
system.time(dt <- fread('hflights.csv'))

df <- as.data.frame(dt)

##

is.data.frame(dt)
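
## Not in the book: recent versions of data.table can also convert in
## place via setDF(), avoiding the copy made by as.data.frame above:
setDF(dt)
is.data.frame(dt)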

## p. 6
.read.csv.orig <- function() read.csv('hflights.csv')
.read.csv.opt <- function() read.csv('hflights.csv', colClasses = colClasses, nrows = 227496, comment.char = '', stringsAsFactors = FALSE)
.read.csv.sql <- function() read.csv.sql('hflights.csv')
.read.csv.ffdf <- function() read.csv.ffdf(file = 'hflights.csv')
.read.big.matrix <- function() read.big.matrix('hflights.csv', header = TRUE)
.fread <- function() fread('hflights.csv')

res <- microbenchmark(.read.csv.orig(), .read.csv.opt(), .read.csv.sql(), .read.csv.ffdf(), .read.big.matrix(), .fread(), times = 10)

print(res, digits = 6)

##

df <- read.csv.sql('hflights.csv', "select * from file where Dest = '\"BNA\"'")

##

df <- read.csv.sql('hflights.csv', "select * from file where Dest = 'BNA'", filter = 'tr -d ^\\" ')

str(df)

system.time(read.csv.sql('hflights.csv', "select * from file"))
system.time(read.csv.sql('hflights.csv', "select * from file where Dest = '\"BNA\"'"))

system.time(system('cat hflights.csv | grep BNA', intern = TRUE))

##

sqldf("attach 'hflights_db' as new")

read.csv.sql('hflights.csv', sql = 'create table hflights as select * from file', dbname = 'hflights_db')

system.time(df <- sqldf("select * from hflights where Dest = '\"BNA\"'", dbname = "hflights_db"))

##

## to be pasted in the MySQL command-line:
create database hflights_db;
grant all privileges on hflights_db.* to 'user'@'localhost' identified by 'password';
flush privileges;
exit;

##

library(RMySQL)
con <- dbConnect(dbDriver('MySQL'), user = 'user', password = 'password', dbname = 'hflights_db')

dbWriteTable(con, 'hflights', hflights)
dbListTables(con)

system.time(dbReadTable(con, 'hflights'))

system.time(dbGetQuery(con, 'select * from hflights'))

##

options('sqldf.connection' = con)
system.time(sqldf('select * from hflights'))

system.time(sqldf('SELECT * FROM hflights WHERE Dest = "BNA"'))

##

dbSendQuery(con, 'CREATE INDEX Dest_idx ON hflights (Dest(3));')

system.time(sqldf('SELECT * FROM hflights WHERE Dest = "BNA"'))

options(sqldf.driver = 'SQLite')
sqldf("CREATE INDEX Dest_idx ON hflights (Dest);", dbname = "hflights_db")
system.time(sqldf("select * from hflights where Dest = '\"BNA\"'", dbname = "hflights_db"))

##

## to be pasted in the command-line:
createuser --pwprompt user
createdb hflights_db
psql

## to be pasted in the PostgreSQL command-line:
\du
\list
grant all privileges on hflights_db to user
\q

##

library(RPostgreSQL)

##

con <- dbConnect(dbDriver('PostgreSQL'), user = 'user', password = 'password', dbname = 'hflights_db')

dbListTables(con)
dbExistsTable(con, 'hflights')

dbWriteTable(con, 'hflights', hflights)
system.time(dbReadTable(con, 'hflights'))

system.time(dbGetQuery(con, "SELECT * FROM hflights WHERE \"Dest\" = 'BNA';"))
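
## Not in the book: it is good practice to close the PostgreSQL
## connection opened above once you are done with it:
dbDisconnect(con)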

##

## to be pasted in bash to install ROracle on Arch Linux
R CMD INSTALL --configure-args='--with-oci-lib=/usr/include/ --with-oci-inc=/usr/share/licenses/oracle-instantclient-basic' ROracle_1.1-11.tar.gz

library(ROracle)
con <- dbConnect(dbDriver('Oracle'), user = 'pmuser', password = 'oracle', dbname = '//192.168.0.16:1521/PDB1')
summary(con)

dbListTables(con)

##

dbWriteTable(con, 'hflights', hflights)
system.time(dbReadTable(con, 'hflights'))

system.time(dbGetQuery(con, "SELECT * FROM \"hflights\" WHERE \"Dest\" = 'BNA'"))

dbSendQuery(con, 'CREATE INDEX Dest_idx ON "hflights" ("Dest")')

##

system.time(dbGetQuery(con, "SELECT * FROM \"hflights\" WHERE \"Dest\" = 'BNA'"))

##

## the content of /etc/odbcinst.ini on Linux
[MySQL]
Description = ODBC Driver for MySQL
Driver      = /usr/lib/libmyodbc.so
Setup       = /usr/lib/libodbcmyS.so
FileUsage   = 1

## the content of /etc/odbc.ini on Linux
[hflights]
Description = MySQL hflights test
Driver      = MySQL
Server      = localhost
Database    = hflights_db
Port        = 3306
Socket      = /var/run/mysqld/mysqld.sock

##

library(RODBC)
con <- odbcConnect("hflights", uid = "user", pwd = "password")

system.time(hflights <- sqlQuery(con, "select * from hflights"))

sqlDrop(con, 'hflights')
sqlSave(con, hflights, 'hflights')

##

close(con)

library(dbConnect)
DatabaseConnect()

##

library(devtools)
install_github('bigrquery', 'hadley')
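
## Not in the book: remove the demo files created by the examples above
unlink(c('hflights.csv', 'hflights_db'))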

--------------------------------------------------------------------------------
/02 - Getting Data from the Web.R:
--------------------------------------------------------------------------------

## Extracted code chunks from
##
## Gergely Daróczi (2015): Mastering Data Analysis with R.
##
## Chapter #2: Getting Data from the Web. pp. 37-64.
##
##
## This file includes the code chunks from the above-mentioned
## chapter except for the leading ">" and "+" characters, which
## stand for the prompt in the R console. The prompt was
## intentionally removed here, along with arbitrary line breaks,
## so that you can copy and paste the R expressions into the
## R console in a more convenient and seamless way.
##
## Code chunks are grouped here by the printed pages of the book.
## Two hash signs at the beginning of a line stand for a page
## break, while an extra empty line between the code chunks
## represents one or more paragraphs of the original book between
## the examples, for easier navigation.
##
## Sometimes extra instructions, starting with a double hash, are
## also provided on how to run the expressions below.
##
##
## Find more information on the book at http://bit.ly/mastering-R,
## contact me on Twitter or GitHub via the @daroczig handle,
## or e-mail me at daroczig@rapporter.net
##

str(read.csv('http://opengeocode.org/download/CCurls.txt'))

##

library(RCurl)
df <- read.csv(text = getURL('https://data.consumerfinance.gov/api/views/x94z-ydhh/rows.csv?accessType=DOWNLOAD'))
str(df)
sort(table(df$Product))

##

library(rjson)
u <- 'http://data.consumerfinance.gov/api/views'
fromJSON(file = u)

##

res <- fromJSON(file = paste0(u, '/25ei-6bcr/rows.json?max_rows=5'))
names(res)

res <- res$data
class(res)

df <- as.data.frame(t(sapply(res, function(x) unlist(x[-13]))))
str(df)

##

library(plyr)
df <- ldply(res, function(x) unlist(x[-13]))

## NB: 'res' holds only the 'data' element at this point, so the metadata
## is fetched again for the column names (the original chunk referenced
## the no longer existing res$meta)
meta <- fromJSON(file = paste0(u, '/25ei-6bcr/rows.json?max_rows=5'))$meta
names(df) <- sapply(meta$view$columns, `[`, 'name')[-13]

##

library(XML)
doc <- xmlParse(paste0(u, '/25ei-6bcr/rows.xml?max_rows=5'))
df <- xmlToDataFrame(nodes = getNodeSet(doc, '//response/row/row'))
str(df)

is.number <- function(x)
    all(!is.na(suppressWarnings(as.numeric(as.character(x)))))
for (n in names(df))
    if (is.number(df[, n]))
        df[, n] <- as.numeric(as.character(df[, n]))

##

doc <- getURL(paste0(u, '/25ei-6bcr/rows?max_rows=5'), httpheader = c(Accept = 'text/html'))

res <- readHTMLTable(doc)

df <- res[[1]]
df <- readHTMLTable(doc, which = 1)

res <- readHTMLTable('http://cran.r-project.org/web/packages/available_packages_by_name.html')
str(res)

##

library(wordcloud)
wordcloud(res[[1]][, 2])

##

page <- htmlParse('http://cran.r-project.org/web/views/WebTechnologies.html')
res <- unlist(xpathApply(page, "//h3[text()='CRAN packages:']/following-sibling::ul[1]/li", xmlValue))
str(res)

##

res <- xpathSApply(page, "//h3[text()='CRAN packages:']/following-sibling::ul[1]/li", xmlValue)

xpathSApply(page, "//h3[text()='CRAN packages:']/following-sibling::ul[1]/li/a", xmlAttrs, 'href')

##

detach('package:rjson')
library(RSocrata)

##

df <- read.socrata(paste0(u, '/25ei-6bcr'))
str(df)

##

library(quantmod)
tail(getSymbols('A', env = NULL))

getFX('USD/EUR')

tail(USDEUR)

##

methods(getSymbols)

str(stockSymbols())

##

library(Quandl)
Quandl('SEC/DIV_A')

##

attr(Quandl('SEC/DIV_A', meta = TRUE), 'meta')$frequency

##

library(devtools)
install_bitbucket('GTrendsR', 'persican')

##

library(GTrendsR)

conn <- gconnect('username', 'password')
df <- gtrends(conn, query = 'how to install R')
tail(df)
plot(df)

##

library(weatherData)
getWeatherForDate('London', start_date = Sys.Date() - 7, end_date = Sys.Date())
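
## Not in the book: RCurl can also check whether a URL is reachable
## before downloading anything, which is handy in unattended scripts:
url.exists('http://cran.r-project.org')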

--------------------------------------------------------------------------------
/03 - Filtering and Summarizing Data.R:
--------------------------------------------------------------------------------

## Extracted code chunks from
##
## Gergely Daróczi (2015): Mastering Data Analysis with R.
##
## Chapter #3: Filtering and Summarizing Data. pp. 65-84.
##
##
## This file includes the code chunks from the above-mentioned
## chapter except for the leading ">" and "+" characters, which
## stand for the prompt in the R console. The prompt was
## intentionally removed here, along with arbitrary line breaks,
## so that you can copy and paste the R expressions into the
## R console in a more convenient and seamless way.
##
## Code chunks are grouped here by the printed pages of the book.
## Two hash signs at the beginning of a line stand for a page
## break, while an extra empty line between the code chunks
## represents one or more paragraphs of the original book between
## the examples, for easier navigation.
##
## Sometimes extra instructions, starting with a double hash, are
## also provided on how to run the expressions below.
##
##
## Find more information on the book at http://bit.ly/mastering-R,
## contact me on Twitter or GitHub via the @daroczig handle,
## or e-mail me at daroczig@rapporter.net
##

library(sqldf)
sqldf('SELECT * FROM mtcars WHERE am=1 AND vs=1')

##

subset(mtcars, am == 1 & vs == 1)
sqldf('SELECT * FROM mtcars WHERE am=1 AND vs=1', row.names = TRUE)

identical(
    sqldf('SELECT * FROM mtcars WHERE am=1 AND vs=1', row.names = TRUE),
    subset(mtcars, am == 1 & vs == 1)
)

subset(mtcars, am == 1 & vs == 1, select = hp:wt)

##

library(hflights)
system.time(sqldf("SELECT * FROM hflights WHERE Dest == 'BNA'", row.names = TRUE))
system.time(subset(hflights, Dest == 'BNA'))

##

library(dplyr)
system.time(filter(hflights, Dest == 'BNA'))

str(select(filter(hflights, Dest == 'BNA'), DepTime:ArrTime))

mtcars$rownames <- rownames(mtcars)
select(filter(mtcars, hp > 300), c(rownames, hp))

##

library(data.table)
hflights_dt <- data.table(hflights)
hflights_dt[, rownames := rownames(hflights)]
system.time(hflights_dt[Dest == 'BNA'])

##

str(hflights_dt[Dest == 'BNA', list(DepTime, ArrTime)])

str(hflights_dt[Dest == 'BNA', c('DepTime', 'ArrTime'), with = FALSE])

aggregate(hflights$Diverted, by = list(hflights$DayOfWeek), FUN = mean)

##

with(hflights, aggregate(Diverted, by = list(DayOfWeek), FUN = mean))

aggregate(Diverted ~ DayOfWeek, data = hflights, FUN = mean)

##

tapply(hflights$Diverted, hflights$DayOfWeek, mean)

##

library(plyr)
ddply(hflights, .(DayOfWeek), function(x) mean(x$Diverted))

ddply(hflights, .(DayOfWeek), summarise, Diverted = mean(Diverted))

##

hflights_DayOfWeek <- group_by(hflights, DayOfWeek)

str(attributes(hflights_DayOfWeek))

##

dplyr::summarise(hflights_DayOfWeek, mean(Diverted))

hflights_dt[, mean(Diverted), by = DayOfWeek]

##

setkey(hflights_dt, 'DayOfWeek')
hflights_dt[, mean(Diverted), by = DayOfWeek]

hflights_dt[, list('mean(Diverted)' = mean(Diverted)), by = DayOfWeek]
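
## Not in the book: the same grouped aggregation written with dplyr's
## pipe operator:
hflights %>% group_by(DayOfWeek) %>% summarise(mean(Diverted))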

##

AGGR1 <- function() aggregate(hflights$Diverted, by = list(hflights$DayOfWeek), FUN = mean)
AGGR2 <- function() with(hflights, aggregate(Diverted, by = list(DayOfWeek), FUN = mean))
AGGR3 <- function() aggregate(Diverted ~ DayOfWeek, data = hflights, FUN = mean)
TAPPLY <- function() tapply(hflights$Diverted, hflights$DayOfWeek, mean)
PLYR1 <- function() ddply(hflights, .(DayOfWeek), function(x) mean(x$Diverted))
PLYR2 <- function() ddply(hflights, .(DayOfWeek), summarise, Diverted = mean(Diverted))
DPLYR <- function() dplyr::summarise(hflights_DayOfWeek, mean(Diverted))

DPLYR_ALL <- function() {
    hflights_DayOfWeek <- group_by(hflights, DayOfWeek)
    dplyr::summarise(hflights_DayOfWeek, mean(Diverted))
}

hflights_dt_nokey <- data.table(hflights)

key(hflights_dt_nokey)

##

## NB: the original chunks computed mean(FlightNum) here; fixed to
## mean(Diverted) so that all benchmarked expressions do the same work
DT <- function() hflights_dt_nokey[, mean(Diverted), by = DayOfWeek]
DT_KEY <- function() hflights_dt[, mean(Diverted), by = key(hflights_dt)]
DT_ALL <- function() {
    hflights_dt_nokey <- data.table(hflights)
    setkey(hflights_dt_nokey, 'DayOfWeek')
    hflights_dt_nokey[, mean(Diverted), by = DayOfWeek]
}

library(microbenchmark)
res <- microbenchmark(AGGR1(), AGGR2(), AGGR3(), TAPPLY(), PLYR1(), PLYR2(), DPLYR(), DPLYR_ALL(), DT(), DT_KEY(), DT_ALL(), times = 10)
print(res, digits = 3)

##

library(ggplot2)
autoplot(res)

dplyr::summarise(group_by(hflights_dt, DayOfWeek), mean(Diverted))

##

ddply(hflights, .(DayOfWeek), summarise, n = length(Diverted))

##

ddply(hflights, .(DayOfWeek), nrow)

table(hflights$DayOfWeek)

count(hflights, 'DayOfWeek')

##

dplyr::summarise(hflights_DayOfWeek, n())

attr(hflights_DayOfWeek, 'group_sizes')

hflights_dt[, .N, by = list(DayOfWeek)]
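
## Not in the book: data.table queries can be chained, for example to
## order the above frequency counts in one step:
hflights_dt[, .N, by = DayOfWeek][order(-N)]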

--------------------------------------------------------------------------------
/04 - Restructuring Data.R:
--------------------------------------------------------------------------------

## Extracted code chunks from
##
## Gergely Daróczi (2015): Mastering Data Analysis with R.
##
## Chapter #4: Restructuring Data. pp. 85-106.
##
##
## This file includes the code chunks from the above-mentioned
## chapter except for the leading ">" and "+" characters, which
## stand for the prompt in the R console. The prompt was
## intentionally removed here, along with arbitrary line breaks,
## so that you can copy and paste the R expressions into the
## R console in a more convenient and seamless way.
##
## Code chunks are grouped here by the printed pages of the book.
## Two hash signs at the beginning of a line stand for a page
## break, while an extra empty line between the code chunks
## represents one or more paragraphs of the original book between
## the examples, for easier navigation.
##
## Sometimes extra instructions, starting with a double hash, are
## also provided on how to run the expressions below.
##
##
## Find more information on the book at http://bit.ly/mastering-R,
## contact me on Twitter or GitHub via the @daroczig handle,
## or e-mail me at daroczig@rapporter.net
##

(m <- matrix(1:9, 3))

##

t(m)

##

library(dplyr)
library(hflights)
str(select(hflights, ends_with('delay')))

str(select(hflights, contains('T', ignore.case = FALSE)))

##

str(select(hflights, matches('^[[:alpha:]]{5,6}$')))

table(nchar(names(hflights)))

names(select(hflights, -matches('^[[:alpha:]]{7,8}$')))

##

str(arrange(hflights, ActualElapsedTime))

##

hflights %>% arrange(ActualElapsedTime) %>% str

hflights %>%
    arrange(ActualElapsedTime) %>%
    select(ActualElapsedTime, Dest) %>%
    subset(Dest != 'AUS') %>%
    head %>%
    str

##

library(data.table)
str(head(data.table(hflights, key = 'ActualElapsedTime')[Dest != 'AUS', c('ActualElapsedTime', 'Dest'), with = FALSE]))

str(head(na.omit(data.table(hflights, key = 'ActualElapsedTime'))[Dest != 'AUS', .(ActualElapsedTime, Dest)]))

system.time(str(head(data.table(na.omit(hflights), key = 'ActualElapsedTime')[Dest != 'AUS', c('ActualElapsedTime', 'Dest'), with = FALSE])))
system.time(str(head(na.omit(data.table(hflights, key = 'ActualElapsedTime'))[Dest != 'AUS', c('ActualElapsedTime', 'Dest'), with = FALSE])))

##

hflights_dt <- data.table(hflights)
hflights_dt[, DistanceKMs := Distance / 0.62137]

system.time(hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137)
system.time(hflights_dt[, DistanceKMs := Distance / 0.62137])

##

library(pryr)
hflights_dt <- data.table(hflights)
address(hflights_dt)

hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137
address(hflights_dt)

##

hflights_dt <- data.table(hflights)
address(hflights_dt)
hflights_dt[, DistanceKMs := Distance / 0.62137]
address(hflights_dt)

system.time(within(hflights_dt, DistanceKMs <- Distance / 0.62137))

hflights_dt[, c('DistanceKMs', 'DistanceFeet') := list(Distance / 0.62137, Distance * 5280)]

##

carriers <- unique(hflights_dt$UniqueCarrier)
hflights_dt[, paste('carrier', carriers, sep = '_') := lapply(carriers, function(x) as.numeric(UniqueCarrier == x))]
str(hflights_dt[, grep('^carrier', names(hflights_dt)), with = FALSE])

###

hflights <- hflights %>%
    mutate(DistanceKMs = Distance / 0.62137)

hflights <- mutate(hflights, DistanceKMs = Distance / 0.62137)

##

(wdays <- data.frame(
    DayOfWeek = 1:7,
    DayOfWeekString = c('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday')
))

system.time(merge(hflights, wdays))
system.time(merge(hflights_dt, wdays, by = 'DayOfWeek'))

##

weekdays(as.Date(with(hflights, paste(Year, Month, DayofMonth, sep = '-'))))

##

library(reshape2)

##

head(melt(hflights))

hflights_melted <- melt(hflights, id.vars = 0, measure.vars = c('ActualElapsedTime', 'AirTime'))
str(hflights_melted)

##

library(ggplot2)
ggplot(hflights_melted, aes(x = variable, y = value)) + geom_boxplot()

##

hflights_melted <- melt(hflights, id.vars = 'Month', measure.vars = c('ActualElapsedTime', 'AirTime'))
(df <- dcast(hflights_melted, Month ~ variable, fun.aggregate = mean, na.rm = TRUE))

ggplot(melt(df, id.vars = 'Month')) + geom_line(aes(x = Month, y = value, color = variable)) + scale_x_continuous(breaks = 1:12) + theme_bw() + theme(legend.position = 'top')

##

hflights_melted <- melt(add_margins(hflights, 'Month'), id.vars = 'Month', measure.vars = c('ActualElapsedTime', 'AirTime'))
(df <- dcast(hflights_melted, Month ~ variable, fun.aggregate = mean, na.rm = TRUE))

##

library(tidyr)
str(gather(hflights[, c('Month', 'ActualElapsedTime', 'AirTime')], variable, value, -Month))

--------------------------------------------------------------------------------
/05 - Building Models.R:
--------------------------------------------------------------------------------

## Extracted code chunks from
##
## Gergely Daróczi (2015): Mastering Data Analysis with R.
##
## Chapter #5: Building Models. pp. 107-126.
## -- written by Renata Nemeth and Gergely Toth
##
##
## This file includes the code chunks from the above-mentioned
## chapter except for the leading ">" and "+" characters, which
## stand for the prompt in the R console. The prompt was
## intentionally removed here, along with arbitrary line breaks,
## so that you can copy and paste the R expressions into the
## R console in a more convenient and seamless way.
##
## Code chunks are grouped here by the printed pages of the book.
## Two hash signs at the beginning of a line stand for a page
## break, while an extra empty line between the code chunks
## represents one or more paragraphs of the original book between
## the examples, for easier navigation.
##
## Sometimes extra instructions, starting with a double hash, are
## also provided on how to run the expressions below.
##
##
## Find more information on the book at http://bit.ly/mastering-R,
## contact me on Twitter or GitHub via the @daroczig handle,
## or e-mail me at daroczig@rapporter.net
##

library(gamlss.data)
data(usair)

model.0 <- lm(y ~ x3, data = usair)
summary(model.0)

##

plot(y ~ x3, data = usair, cex.lab = 1.5)
abline(model.0, col = 'red', lwd = 2.5)
legend('bottomright', legend = 'y ~ x3', lty = 1, col = 'red', lwd = 2.5, title = 'Regression line')

##

usair$prediction <- predict(model.0)
usair$residual <- resid(model.0)
plot(y ~ x3, data = usair, cex.lab = 1.5)
abline(model.0, col = 'red', lwd = 2.5)
segments(usair$x3, usair$y, usair$x3, usair$prediction, col = 'blue', lty = 2)
legend('bottomright', legend = c('y ~ x3', 'residuals'), lty = c(1, 2), col = c('red', 'blue'), lwd = 2.5, title = 'Regression line')
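
## Not in the book: for simple OLS, squaring the correlation of the
## observed and fitted values reproduces the R-squared of summary(model.0):
cor(usair$y, usair$prediction)^2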

##

model.1 <- update(model.0, . ~ . + x2)
summary(model.1)

##

as.numeric(predict(model.1, data.frame(x2 = 150, x3 = 400)))

-0.05661 * 400 + 0.08243 * 150 + 26.32508

library(scatterplot3d)
plot3d <- scatterplot3d(usair$x3, usair$x2, usair$y, pch = 19, type = 'h', highlight.3d = TRUE, main = '3-D Scatterplot')

##

plot3d$plane3d(model.1, lty = 'solid', col = 'red')

par(mfrow = c(1, 2))
plot(y ~ x3, data = usair, cex.lab = 1.5, main = '2D projection for x3')
abline(model.1, col = 'red', lwd = 2.5)
plot(y ~ x2, data = usair, cex.lab = 1.5, main = '2D projection for x2')
abline(lm(y ~ x2 + x3, data = usair), col = 'red', lwd = 2.5)

##

library(Hmisc)
library(ggplot2)
library(gridExtra)
set.seed(7)
x <- sort(rnorm(1000, 10, 100))[26:975]
y <- x * 500 + rnorm(950, 5000, 20000)
df <- data.frame(x = x, y = y, cuts = factor(cut2(x, g = 5)), resid = resid(lm(y ~ x)))
scatterPl <- ggplot(df, aes(x = x, y = y)) + geom_point(aes(colour = cuts, fill = cuts), shape = 1, show_guide = FALSE) + geom_smooth(method = lm, level = 0.99) + theme_bw() + theme(axis.text.x = element_blank(), axis.text.y = element_blank(), text = element_text(size = 20))

##

plot_left <- ggplot(df, aes(x = y, fill = cuts)) + geom_density(alpha = .5) + coord_flip() + scale_y_reverse() + xlab('Y values by groups') + theme_bw() + theme(legend.position = 'none') + theme(axis.text.x = element_blank(), axis.text.y = element_blank(), text = element_text(size = 20))
plot_right <- ggplot(data = df, aes(x = resid, fill = cuts)) + geom_density(alpha = .5) + coord_flip() + theme_bw() + xlab('Residuals by groups') + theme(legend.position = 'none') + theme(axis.text.x = element_blank(), axis.text.y = element_blank(), text = element_text(size = 20))
grid.arrange(plot_left, scatterPl, plot_right, ncol = 3, nrow = 1, widths = c(1, 3, 1))

library(gvlma)
gvlma(model.1)

##

model.2 <- update(model.1, data = usair[-31, ])
gvlma(model.2)

##

model.0 <- update(model.0, data = usair[-31, ])
summary(model.0)[c('r.squared', 'adj.r.squared')]

summary(model.2)[c('r.squared', 'adj.r.squared')]

##

summary(model.3 <- update(model.2, .~. -x2 + x1))$coefficients
summary(model.4 <- update(model.2, .~. -x3 + x1))$coefficients
AIC(model.3, model.4)

##

plot(y ~ x5, data = usair, cex.lab = 1.5)
abline(lm(y ~ x5, data = usair), col = 'red', lwd = 2.5, lty = 1)
abline(lm(y ~ x5, data = usair[usair$x5 <= 45, ]), col = 'red', lwd = 2.5, lty = 3)
abline(lm(y ~ x5, data = usair[usair$x5 >= 30, ]), col = 'red', lwd = 2.5, lty = 2)
abline(v = c(30, 45), col = 'blue', lwd = 2.5)
legend('topleft', legend = c('y ~ x5', 'y ~ x5 | x5<=45', 'y ~ x5 | x5>=30', 'Critical zone'), lty = c(1, 3, 2, 1), col = c('red', 'red', 'red', 'blue'), lwd = rep(2.5, 4), title = NULL)

##

library(partykit)
library(rpart)
plot(as.party(rpart(y ~ x5, data = usair)))

##

usair$x5_3 <- cut2(usair$x5, c(30, 45))
plot(y ~ as.numeric(x5_3), data = usair, cex.lab = 1.5, xlab = 'Categorized annual rainfall (x5)', xaxt = 'n')
axis(1, at = 1:3, labels = levels(usair$x5_3))
lines(tapply(usair$y, usair$x5_3, mean), col = 'red', lwd = 2.5, lty = 1)
legend('topright', legend = 'Linear prediction', lty = 1, col = 'red', lwd = 2.5, title = NULL)

summary(glmmodel.1 <- glm(y ~ x2 + x3 + x5_3, data = usair[-31, ]))
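
## Not in the book: the Gaussian GLM is equivalent to OLS, so its fit can
## be compared via AIC with the earlier model fitted on the same data:
AIC(model.2, glmmodel.1)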

--------------------------------------------------------------------------------
/06 - Beyond the Linear Trend Line.R:
--------------------------------------------------------------------------------

## Extracted code chunks from
##
## Gergely Daróczi (2015): Mastering Data Analysis with R.
##
## Chapter #6: Beyond the Linear Trend Line. pp. 127-152.
## -- written by Renata Nemeth and Gergely Toth
##
##
## This file includes the code chunks from the above-mentioned
## chapter except for the leading ">" and "+" characters, which
## stand for the prompt in the R console. The prompt was
## intentionally removed here, along with arbitrary line breaks,
## so that you can copy and paste the R expressions into the
## R console in a more convenient and seamless way.
##
## Code chunks are grouped here by the printed pages of the book.
## Two hash signs at the beginning of a line stand for a page
## break, while an extra empty line between the code chunks
## represents one or more paragraphs of the original book between
## the examples, for easier navigation.
##
## Sometimes extra instructions, starting with a double hash, are
## also provided on how to run the expressions below.
##
##
## Find more information on the book at http://bit.ly/mastering-R,
## contact me on Twitter or GitHub via the @daroczig handle,
## or e-mail me at daroczig@rapporter.net
##

library(catdata)
data(deathpenalty)
library(vcdExtra)
deathpenalty.expand <- expand.dft(deathpenalty)
binom.model.0 <- glm(DeathPenalty ~ DefendantRace, data = deathpenalty.expand, family = binomial)
summary(binom.model.0)

##

exp(cbind(OR = coef(binom.model.0), confint(binom.model.0)))

##

binom.model.1 <- update(binom.model.0, .~.+VictimRace)
summary(binom.model.1)
exp(cbind(OR = coef(binom.model.1), confint(binom.model.1)))

##

prop.table(table(factor(deathpenalty.expand$VictimRace, labels = c('VictimRace=0', 'VictimRace=1')), factor(deathpenalty.expand$DefendantRace, labels = c('DefendantRace=0', 'DefendantRace=1'))), 1)

##

library(lmtest)
lrtest(binom.model.1)

##

library(BaylorEdPsych)
PseudoR2(binom.model.1)

##

lrtest(binom.model.0, binom.model.1)

##

## NB: in this repository, the data file lives in the data/ subfolder
dfa <- readRDS('data/SMART_2013.RData')

(ct <- xtabs(~model + failure, data = dfa))

##

dfa <- dfa[dfa$model %in% names(which(rowSums(ct) - ct[, 1] > 0)), ]

library(ggplot2)
ggplot(rbind(dfa, data.frame(model = 'All', dfa[, -1])), aes(failure)) + geom_histogram(binwidth = 1, drop = TRUE, origin = -0.5) + scale_y_log10() + scale_x_continuous(breaks = c(0:10)) + ylab('log(count)') + facet_wrap( ~ model, ncol = 3) + ggtitle('Histograms by manufacturer') + theme_bw()

##

## NB: the offset is included in the formula here; the original chunk
## passed offset(log(freq)) as an unnamed argument, where glm() would
## have matched it to the weights argument
poiss.base <- glm(failure ~ model + offset(log(freq)), family = 'poisson', data = dfa)
summary(poiss.base)

##

contrasts(dfa$model, sparse = TRUE)

##

exp(1.7666)

lrtest(poiss.base)

##

library(MASS)
model.negbin.0 <- glm.nb(failure ~ model + offset(log(freq)), data = dfa)

lrtest(poiss.base, model.negbin.0)

##

model.negbin.1 <- update(model.negbin.0, .~. + capacity_bytes + age_month + temperature)
model.negbin.2 <- update(model.negbin.1, .~. + PendingSector)
lrtest(model.negbin.0, model.negbin.1, model.negbin.2)

summary(model.negbin.2)

##

exp(data.frame(exp_coef = coef(model.negbin.2)))

##

dfa$model <- relevel(dfa$model, 'WDC')

model.negbin.3 <- update(model.negbin.2, data = dfa)
library(broom)
format(tidy(model.negbin.3), digits = 4)

##

library(data.table)
library(Hmisc)
dfa <- data.table(dfa)
dfa[, temp6 := cut2(temperature, g = 6)]
temperature.weighted.mean <- dfa[, .(wfailure = weighted.mean(failure, freq)), by = temp6]
ggplot(temperature.weighted.mean, aes(x = temp6, y = wfailure)) + geom_bar(stat = 'identity') + xlab('Categorized temperature') + ylab('Weighted mean of disk faults') + theme_bw()

##

model.negbin.4 <- update(model.negbin.0, .~. + capacity_bytes + age_month + temp6 + PendingSector, data = dfa)
AIC(model.negbin.3, model.negbin.4)

weighted.means <- rbind(
    dfa[, .(var = 'capacity', wfailure = weighted.mean(failure, freq)), by = .(value = capacity_bytes)],
    dfa[, .(var = 'age', wfailure = weighted.mean(failure, freq)), by = .(value = age_month)])

ggplot(weighted.means, aes(x = value, y = wfailure)) + geom_step() + facet_grid(. ~ var, scales = 'free_x') + ylab('Weighted mean of disk faults') + xlab('') + theme_bw()

##

dfa[, capacity_bytes := as.factor(capacity_bytes)]
dfa[, age8 := cut2(age_month, g = 8)]
model.negbin.5 <- update(model.negbin.0, .~. + capacity_bytes + age8 + temp6 + PendingSector, data = dfa)

AIC(model.negbin.5, model.negbin.4)

format(tidy(model.negbin.5), digits = 3)

##

tmnb5 <- tidy(model.negbin.5)
str(terms <- tmnb5$term[tmnb5$p.value < 0.05][-1])

library(plyr)
ci <- ldply(terms, function(t) confint(model.negbin.5, t))

##

names(ci) <- c('min', 'max')
ci$term <- terms
ci$variable <- sub('[A-Z0-9\\]\\[,() ]*$', '', terms, perl = TRUE)

ggplot(ci, aes(x = factor(term), color = variable, ymin = min, ymax = max)) + geom_errorbar(size = 1.3) + ylab('Coefficients (95% conf.int)') + xlab('') + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3), legend.position = 'top')

##

PseudoR2(model.negbin.5)
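
## Not in the book: the expected number of failures implied by the final
## model can be extracted with predict() on the response scale:
head(predict(model.negbin.5, type = 'response'))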

--------------------------------------------------------------------------------
/07 - Unstructured Data.R:
--------------------------------------------------------------------------------

## Extracted code chunks from
##
## Gergely Daróczi (2015): Mastering Data Analysis with R.
##
## Chapter #7: Unstructured Data. pp. 153-168.
##
##
## This file includes the code chunks from the above-mentioned
## chapter except for the leading ">" and "+" characters, which
## stand for the prompt in the R console. The prompt was
## intentionally removed here, along with arbitrary line breaks,
## so that you can copy and paste the R expressions into the
## R console in a more convenient and seamless way.
##
## Code chunks are grouped here by the printed pages of the book.
## Two hash signs at the beginning of a line stand for a page
## break, while an extra empty line between the code chunks
## represents one or more paragraphs of the original book between
## the examples, for easier navigation.
##
## Sometimes extra instructions, starting with a double hash, are
## also provided on how to run the expressions below.
##
##
## Find more information on the book at http://bit.ly/mastering-R,
## contact me on Twitter or GitHub via the @daroczig handle,
## or e-mail me at daroczig@rapporter.net
##

library(tm)
getSources()

##

getReaders()

res <- XML::readHTMLTable('http://cran.r-project.org/web/packages/available_packages_by_name.html', which = 1)

v <- Corpus(VectorSource(res$V2))

##

v

inspect(head(v, 3))

getTransformations()

##

stopwords('english')

##

removeWords('to be or not to be', stopwords('english'))

v <- tm_map(v, removeWords, stopwords('english'))

inspect(head(v, 3))

##

removeWords('To be or not to be.', stopwords('english'))

v <- tm_map(v, content_transformer(tolower))
v <- tm_map(v, removePunctuation)
v <- tm_map(v, stripWhitespace)
inspect(head(v, 3))

##

library(wordcloud)
wordcloud::wordcloud(v)

##

v <- tm_map(v, removeNumbers)

tdm <- TermDocumentMatrix(v)

inspect(tdm[1:5, 1:20])

##

findFreqTerms(tdm, lowfreq = 100)

myStopwords <- c('package', 'based', 'using')
v <- tm_map(v, removeWords, myStopwords)

library(SnowballC)
wordStem(c('cats', 'mastering', 'modelling', 'models', 'model'))

wordStem(c('are', 'analyst', 'analyze', 'analysis'))

##

d <- v

v <- tm_map(v, stemDocument, language = 'english')

v <- tm_map(v, content_transformer(function(x, d) paste(stemCompletion(strsplit(stemDocument(x), ' ')[[1]], d), collapse = ' ')), d)

tdm <- TermDocumentMatrix(v)
findFreqTerms(tdm, lowfreq = 100)

##

tdm

##

findAssocs(tdm, 'data', 0.1)

findAssocs(tdm, 'big', 0.1)

##

vnchar <- sapply(v, function(x) nchar(x$content))
summary(vnchar)

(vm <- which.min(vnchar))

v[[vm]]
res[vm, ]

##

hist(vnchar, main = 'Length of R package descriptions', xlab = 'Number of characters')

##

hadleyverse <- c('ggplot2', 'dplyr', 'reshape2', 'lubridate', 'stringr', 'devtools', 'roxygen2', 'tidyr')

(w <- which(res$V1 %in% hadleyverse))

plot(hclust(dist(DocumentTermMatrix(v[w]))), xlab = 'Hadleyverse packages')

##

sapply(v[w], function(x) structure(content(x), .Names = meta(x, 'id')))

library(topicmodels)
fit <- LDA(DocumentTermMatrix(v[w]), k = 3)
topics(fit)
terms(fit, 10)

fit <- CTM(DocumentTermMatrix(v[w]), k = 3)
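
## Not in the book: a wordcloud of the cleaned corpus weighted by the term
## frequencies of the term-document matrix (can be slow on the full corpus):
freqs <- rowSums(as.matrix(tdm))
wordcloud(names(freqs), freqs, min.freq = 25)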

--------------------------------------------------------------------------------
/08 - Polishing Data.R:
--------------------------------------------------------------------------------

## Extracted code chunks from
##
## Gergely Daróczi (2015): Mastering Data Analysis with R.
##
## Chapter #8: Polishing Data. pp. 169-192.
##
##
## This file includes the code chunks from the above-mentioned
## chapter except for the leading ">" and "+" characters, which
## stand for the prompt in the R console. The prompt was
## intentionally removed here, along with arbitrary line breaks,
## so that you can copy and paste the R expressions into the
## R console in a more convenient and seamless way.
##
## Code chunks are grouped here by the printed pages of the book.
## Two hash signs at the beginning of a line stand for a page
## break, while an extra empty line between the code chunks
## represents one or more paragraphs of the original book between
## the examples, for easier navigation.
##
## Sometimes extra instructions, starting with a double hash, are
## also provided on how to run the expressions below.
##
##
## Find more information on the book at http://bit.ly/mastering-R,
## contact me on Twitter or GitHub via the @daroczig handle,
## or e-mail me at daroczig@rapporter.net
##

library(hflights)
table(complete.cases(hflights))

##

prop.table(table(complete.cases(hflights))) * 100

sort(sapply(hflights, function(x) sum(is.na(x))))

mean(cor(apply(hflights, 2, function(x) as.numeric(is.na(x)))), na.rm = TRUE)

##

Funs <- Filter(is.function, sapply(ls(baseenv()), get, baseenv()))
names(Filter(function(x) any(names(formals(args(x))) %in% 'na.rm'), Funs))

##

names(Filter(function(x) any(names(formals(args(x))) %in% 'na.rm'), Filter(is.function, sapply(ls('package:stats'), get, 'package:stats'))))

myMean <- function(...) mean(..., na.rm = TRUE)
mean(c(1:5, NA))
myMean(c(1:5, NA))

##

library(rapportools)
mean(c(1:5, NA))

detach('package:rapportools')
mean(c(1:5, NA))

library(Defaults)
setDefaults(mean.default, na.rm = TRUE)
mean(c(1:5, NA))

setDefaults(mean, na.rm = TRUE)

##

mean
formals(mean)

unDefaults(ls)

##

na.omit(c(1:5, NA))
na.exclude(c(1:5, NA))

x <- rnorm(10); y <- rnorm(10)
x[1] <- NA; y[2] <- NA
exclude <- lm(y ~ x, na.action = 'na.exclude')
omit <- lm(y ~ x, na.action = 'na.omit')
round(residuals(exclude), 2)
round(residuals(omit), 2)

##

m <- matrix(1:9, 3)
m[which(m %% 4 == 0, arr.ind = TRUE)] <- NA
m
na.omit(m)

mean(hflights$ActualElapsedTime)

mean(hflights$ActualElapsedTime, na.rm = TRUE)
mean(na.omit(hflights$ActualElapsedTime))

##

library(microbenchmark)
NA.RM <- function() mean(hflights$ActualElapsedTime, na.rm = TRUE)
NA.OMIT <- function() mean(na.omit(hflights$ActualElapsedTime))
microbenchmark(NA.RM(), NA.OMIT())

##

m[which(is.na(m), arr.ind = TRUE)] <- 0
m

ActualElapsedTime <- hflights$ActualElapsedTime
mean(ActualElapsedTime, na.rm = TRUE)
ActualElapsedTime[which(is.na(ActualElapsedTime))] <- mean(ActualElapsedTime, na.rm = TRUE)
mean(ActualElapsedTime)

library(Hmisc)
mean(impute(hflights$ActualElapsedTime, mean))

##

sd(hflights$ActualElapsedTime, na.rm = TRUE)
sd(ActualElapsedTime)

##

summary(iris)

library(missForest)
set.seed(81)
miris <- prodNA(iris, noNA = 0.2)
summary(miris)

##

iiris <- missForest(miris, xtrue = iris, verbose = TRUE)

##

str(iiris)

##

miris <- miris[, 1:4]

##

iris_mean <- impute(miris, fun = mean)
iris_forest <- missForest(miris)

diag(cor(iris[, -5], iris_mean))
diag(cor(iris[, -5], iris_forest$ximp))
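
## Not in the book: the RMSE of the imputed values offers another view on
## the accuracy of the two imputation methods (lower is better):
sqrt(colMeans((data.matrix(iris[, -5]) - data.matrix(iris_mean))^2))
sqrt(colMeans((data.matrix(iris[, -5]) - data.matrix(iris_forest$ximp))^2))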

##

detach('package:missForest')
detach('package:randomForest')

##

library(outliers)
outlier(hflights$DepDelay)

summary(hflights$DepDelay)

library(lattice)
bwplot(hflights$DepDelay)

IQR(hflights$DepDelay, na.rm = TRUE)

##

set.seed(83)
dixon.test(c(runif(10), pi))

model <- lm(hflights$DepDelay ~ 1)

model$coefficients
mean(hflights$DepDelay, na.rm = TRUE)

##

a <- 0.1
(n <- length(hflights$DepDelay))
(F <- qf(1 - (a/n), 1, n-2, lower.tail = TRUE))
(L <- ((n - 1) * F / (n - 2 + F))^0.5)

sum(abs(rstandard(model)) > L)

summary(lm(Sepal.Length ~ Petal.Length, data = miris))

##

lm(Sepal.Length ~ Petal.Length, data = iris)$coefficients

library(MASS)
summary(rlm(Sepal.Length ~ Petal.Length, data = miris))

##

f <- formula(Sepal.Length ~ Petal.Length)
cbind(
    orig = lm(f, data = iris)$coefficients,
    lm = lm(f, data = miris)$coefficients,
    rlm = rlm(f, data = miris)$coefficients)

miris$Sepal.Length[1] <- 14
cbind(
    orig = lm(f, data = iris)$coefficients,
    lm = lm(f, data = miris)$coefficients,
    rlm = rlm(f, data = miris)$coefficients)
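
## Not in the book: Cook's distance is another common way to rank the most
## influential observations of the DepDelay model fitted above:
head(sort(cooks.distance(model), decreasing = TRUE))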

--------------------------------------------------------------------------------
/09 - From Big to Smaller Data.R:
--------------------------------------------------------------------------------

## Extracted code chunks from
##
## Gergely Daróczi (2015): Mastering Data Analysis with R.
##
## Chapter #9: From Big to Smaller Data. pp. 192-234.
##
##
## This file includes the code chunks from the above-mentioned
## chapter except for the leading ">" and "+" characters, which
## stand for the prompt in the R console. The prompt was
## intentionally removed here, along with arbitrary line breaks,
## so that you can copy and paste the R expressions into the
## R console in a more convenient and seamless way.
##
## Code chunks are grouped here by the printed pages of the book.
## Two hash signs at the beginning of a line stand for a page
## break, while an extra empty line between the code chunks
## represents one or more paragraphs of the original book between
## the examples, for easier navigation.
##
## Sometimes extra instructions, starting with a double hash, are
## also provided on how to run the expressions below.
##
##
## Find more information on the book at http://bit.ly/mastering-R,
## contact me on Twitter or GitHub via the @daroczig handle,
## or e-mail me at daroczig@rapporter.net
##

library(hflights)

JFK <- hflights[which(hflights$Dest == 'JFK'), c('TaxiIn', 'TaxiOut')]
JFK <- subset(hflights, Dest == 'JFK', select = c(TaxiIn, TaxiOut))

##

par(mfrow = c(1, 2))
qqnorm(JFK$TaxiIn, ylab = 'TaxiIn'); qqline(JFK$TaxiIn)
qqnorm(JFK$TaxiOut, ylab = 'TaxiOut'); qqline(JFK$TaxiOut)

shapiro.test(JFK$TaxiIn)

##

JFK <- na.omit(JFK)
library(MVN)
mardiaTest(JFK)

##

hzTest(JFK)

roystonTest(JFK)

##

mvt <- roystonTest(JFK, qqplot = TRUE)

par(mfrow = c(1, 2))
mvnPlot(mvt, type = 'contour', default = TRUE)
mvnPlot(mvt, type = 'persp', default = TRUE)

##

set.seed(42)
mvt <- roystonTest(MASS::mvrnorm(100, mu = c(0, 0), Sigma = matrix(c(10, 3, 3, 2), 2)))
par(mfrow = c(1, 2), mar = c(0, 0, 0, 0))
mvnPlot(mvt, type = 'contour', default = TRUE)
mvnPlot(mvt, type = 'persp', default = TRUE)

##

hflights_numeric <- hflights[, which(sapply(hflights, is.numeric))]
cor(hflights_numeric, use = 'pairwise.complete.obs')

str(cor(hflights_numeric, use = 'pairwise.complete.obs'))

hflights_numeric <- hflights[, which(
    sapply(hflights, function(x)
        is.numeric(x) && var(x, na.rm = TRUE) != 0))]

table(is.na(cor(hflights_numeric, use = 'pairwise.complete.obs')))

##

library(ellipse)

plotcorr(cor(hflights_numeric, use = 'pairwise.complete.obs'))

##

plotcorr(cor(data.frame(
    1:10,
    1:10 + runif(10),
    1:10 + runif(10) * 5,
    runif(10),
    10:1,
    check.names = FALSE)))

cor(hflights$FlightNum, hflights$Month)

##

##

library(psych)
KMO(cor(hflights_numeric, use = 'pairwise.complete.obs'))

##

cor(hflights_numeric[, c('Cancelled', 'AirTime')])

table(hflights_numeric$AirTime[which(hflights_numeric$Cancelled == 1)], exclude = NULL)

table(hflights_numeric$Cancelled)

hflights_numeric <- subset(hflights_numeric, select = -Cancelled)

which(is.na(cor(hflights_numeric, use = 'pairwise.complete.obs')), arr.ind = TRUE)

##

hflights_numeric <- subset(hflights_numeric, select = -Diverted)
KMO(cor(hflights_numeric[, -c(14)], use = 'pairwise.complete.obs'))

##

KMO(mtcars)

bartlett.test(mtcars)
cortest.bartlett(cor(mtcars))

##

prcomp(mtcars, scale = TRUE)

##

summary(prcomp(mtcars, scale = TRUE))

##

sum(prcomp(scale(mtcars))$sdev^2)

prcomp(scale(mtcars))$sdev^2

(6.6 + 2.65) / 11

##

VSS.scree(cor(mtcars))

##

scree(cor(mtcars))

##

fa.parallel(mtcars)

###

pc <- prcomp(mtcars, scale = TRUE)
head(pc$x[, 1:2])

head(scale(mtcars) %*% pc$rotation[, 1:2])

summary(pc$x[, 1:2])

##

apply(pc$x[, 1:2], 2, sd)
pc$sdev[1:2]

round(cor(pc$x))

pc$rotation[, 1:2]
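
## Not in the book: the cumulative proportions of variance reported by
## summary(pc) can also be computed directly from the standard deviations:
cumsum(pc$sdev^2) / sum(pc$sdev^2)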

##

biplot(pc, cex = c(0.8, 1.2))
abline(h = 0, v = 0, lty = 'dashed')

##

cor(mtcars, pc$x[, 1:2])

##

varimax(pc$rotation[, 1:2])

##

pcv <- varimax(pc$rotation[, 1:2])$loadings
plot(scale(mtcars) %*% pcv, type = 'n', xlab = 'Transmission', ylab = 'Power')
text(scale(mtcars) %*% pcv, labels = rownames(mtcars))

##

library(GPArotation)
promax(pc$rotation[, 1:2])
cor(promax(pc$rotation[, 1:2])$loadings)

##

library(jpeg)
t <- tempfile()
download.file('http://bit.ly/nasa-img', t)
img <- readJPEG(t)
str(img)

h <- dim(img)[1]
w <- dim(img)[2]
m <- matrix(img, h * w)
str(m)

pca <- prcomp(m)

##

summary(pca)
pca$rotation

extractColors <- function(x) rgb(x[1], x[2], x[3])

(colors <- apply(abs(pca$rotation), 2, extractColors))

##

pie(pca$sdev, col = colors, labels = colors)

par(mfrow = c(1, 2), mar = rep(0, 4))
image(matrix(pca$x[, 1], h), col = gray.colors(100))
image(matrix(pca$x[, 2], h), col = gray.colors(100), yaxt = 'n')

##

m <- subset(mtcars, select = c(mpg, cyl, hp, carb))

(f <- fa(m))

##

fa.diagram(f)

cor(f$scores, mtcars$disp)

##

as.matrix(eurodist)[1:5, 1:5]

(mds <- cmdscale(eurodist))

##

plot(mds)

##

plot(mds, type = 'n')
text(mds[, 1], mds[, 2], labels(eurodist), cex = 0.7)

##

mds <- cmdscale(dist(mtcars))
plot(mds, type = 'n')
text(mds[, 1], mds[, 2], rownames(mds), cex = 0.7)
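
## Not in the book: with Euclidean distances, classical MDS is equivalent
## to PCA on the centered data, so the first coordinates agree up to sign:
abs(cor(mds[, 1], prcomp(mtcars)$x[, 1]))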

--------------------------------------------------------------------------------
/10 - Classification and Clustering.R:
--------------------------------------------------------------------------------

## Extracted code chunks from
##
## Gergely Daróczi (2015): Mastering Data Analysis with R.
##
## Chapter #10: Classification and Clustering. pp. 235-268.
##
##
## This file includes the code chunks from the above-mentioned
## chapter except for the leading ">" and "+" characters, which
## stand for the prompt in the R console. The prompt was
## intentionally removed here, along with arbitrary line breaks,
## so that you can copy and paste the R expressions into the
## R console in a more convenient and seamless way.
##
## Code chunks are grouped here by the printed pages of the book.
## Two hash signs at the beginning of a line stand for a page
## break, while an extra empty line between the code chunks
## represents one or more paragraphs of the original book between
## the examples, for easier navigation.
##
## Sometimes extra instructions, starting with a double hash, are
## also provided on how to run the expressions below.
##
##
## Find more information on the book at http://bit.ly/mastering-R,
## contact me on Twitter or GitHub via the @daroczig handle,
## or e-mail me at daroczig@rapporter.net
##

d <- dist(mtcars)
h <- hclust(d)
h

##

plot(h)

##

plot(h)
rect.hclust(h, k = 3, border = 'red')

(cn <- cutree(h, k = 3))

##

table(cn)

round(aggregate(mtcars, FUN = mean, by = list(cn)), 1)

round(aggregate(mtcars, FUN = sd, by = list(cn)), 1)

##

round(sapply(mtcars, sd), 1)

round(apply(aggregate(mtcars, FUN = mean, by = list(cn)), 2, sd), 1)

##

library(NbClust)
NbClust(mtcars, method = 'complete', index = 'dindex')

NbClust(mtcars, method = 'complete', index = 'hartigan')$Best.nc

##

NbClust(mtcars, method = 'complete', index = 'kl')$Best.nc

NbClust(iris[, -5], method = 'complete', index = 'all')$Best.nc[1, ]

##

(k <- kmeans(mtcars, 3))

##

all(cn == k$cluster)

cbind(cn, k$cluster)

##

library(cluster)
clusplot(mtcars, k$cluster, color = TRUE, shade = TRUE, labels = 2, cex = 0.7)

##

factors <- c('cyl', 'vs', 'am', 'carb', 'gear')
mtcars[, factors] <- lapply(mtcars[, factors], factor)

##

library(poLCA)
p <- poLCA(cbind(cyl, vs, am, carb, gear) ~ 1, data = mtcars, graphs = TRUE, nclass = 3)

p$probs

##

plot(p)

p$P

##

rm(mtcars)
mtcars$gear <- factor(mtcars$gear)

library(MASS)
d <- lda(gear ~ ., data = mtcars, CV = TRUE)

(tab <- table(mtcars$gear, d$class))

tab / rowSums(tab)

sum(diag(tab)) / sum(tab)

##

round(d$posterior, 4)

##

d <- lda(gear ~ ., data = mtcars)
plot(d)

##

plot(d, dimen = 1, type = 'both')

##

lr <- glm(am ~ hp + wt, data = mtcars, family = binomial)
summary(lr)

##

table(mtcars$am, round(predict(lr, type = 'response')))

library(nnet)
(mlr <- multinom(factor(gear) ~ ., data = mtcars))

##

table(mtcars$gear, predict(mlr))

rm(mtcars)

##

set.seed(42)
n <- nrow(mtcars)
train <- mtcars[sample(n, n/2), ]

library(dplyr)
train <- sample_n(mtcars, n / 2)

##

test <- mtcars[setdiff(row.names(mtcars), row.names(train)), ]

library(class)
(cm <- knn(
    train = subset(train, select = -gear),
    test = subset(test, select = -gear),
    cl = train$gear,
    k = 5))

cor(test$gear, as.numeric(as.character(cm)))

table(test$gear, as.numeric(as.character(cm)))

##

table(train$gear)

##

library(rpart)
ct <- rpart(factor(gear) ~ ., data = train, minsplit = 3)
summary(ct)

##

plot(ct); text(ct)

table(test$gear, predict(ct, newdata = test, type = 'class'))
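
## Not in the book: the overall accuracy computed from the above confusion
## matrix:
conf <- table(test$gear, predict(ct, newdata = test, type = 'class'))
sum(diag(conf)) / sum(conf)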
Tree") 201 | 202 | table(test$gear, predict(ct, newdata = test, type = 'node')) 203 | 204 | ## 205 | 206 | library(randomForest) 207 | (rf <- randomForest(factor(gear) ~ ., data = train, ntree = 250)) 208 | 209 | table(test$gear, predict(rf, test)) 210 | 211 | ## 212 | 213 | plot(rf) 214 | legend('topright', colnames(rf$err.rate), col = 1:4, fill = 1:4, bty = 'n') 215 | 216 | library(caret) 217 | 218 | ## 219 | 220 | library(C50) 221 | C50 <- train(factor(gear) ~ ., data = train, method = 'C5.0') 222 | summary(C50) 223 | 224 | ## 225 | 226 | table(test$gear, predict(C50, test)) 227 | -------------------------------------------------------------------------------- /11 - Social Network Analysis of the R ecosystem.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #11: Social Network Analysis 6 | ## of the R Ecosystem. pp. 269-280. 7 | ## 8 | ## 9 | ## This file includes the code chunks from the above mentioned 10 | ## chapter except for the leading ">" and "+" characters, which 11 | ## stand for the prompt in the R console. The prompt was 12 | ## intentionally removed here along with arbitrary line-breaks, 13 | ## so that you copy and paste the R expressions to the R console 14 | ## in a more convenient and seamless way. 15 | ## 16 | ## Code chunks are grouped here by the printed pages of the book. 17 | ## Two hash signs at the beginning of a line stands for a page 18 | ## break, while an extra empty line between the code chunks 19 | ## represents one or more paragraphs in the original book between 20 | ## the examples for easier navigation. 21 | ## 22 | ## Sometimes extra instructions starting with a double hash are 23 | ## also provided on how to run the below expressions. 
24 | ## 25 | ## 26 | ## Find more information on the book at http://bit.ly/mastering-R 27 | ## and you can contact me on Twitter and GitHub by the @daroczig 28 | ## handle, or mail me at daroczig@rapporter.net 29 | ## 30 | 31 | library(tools) 32 | pkgs <- available.packages() 33 | str(pkgs) 34 | head(package.dependencies(pkgs), 2) 35 | 36 | ## 37 | 38 | library(plyr) 39 | edges <- ldply(c('Depends', 'Imports', 'Suggests'), function(depLevel) { 40 | deps <- package.dependencies(pkgs, depLevel = depLevel) 41 | ldply(names(deps), function(pkg) 42 | if (!identical(deps[[pkg]], NA)) 43 | data.frame( 44 | src = pkg, 45 | dep = deps[[pkg]][, 1], 46 | label = depLevel, 47 | stringsAsFactors = FALSE)) 48 | }) 49 | str(edges) 50 | 51 | nrow(edges) / (nrow(pkgs) * (nrow(pkgs) - 1)) 52 | 53 | ## 54 | 55 | head(sort(table(edges$dep), decreasing = TRUE)) 56 | 57 | edges <- edges[edges$dep != 'R', ] 58 | 59 | 60 | ## note: the dependency graph is directed 61 | 62 | 63 | ## table(edges$label) 64 | ## edges <- subset(edges, label %in% c('Depends', 'Imports')) 65 | 66 | library(igraph) 67 | g <- graph.data.frame(edges) 68 | summary(g) 69 | 70 | graph.density(g) 71 | head(sort(degree(g), decreasing = TRUE)) 72 | 73 | ## 74 | 75 | head(sort(degree(g), decreasing = TRUE)) 76 | head(sort(closeness(g))) 77 | head(sort(betweenness(g), decreasing = TRUE)) 78 | 79 | plot(degree(g), betweenness(g), type = 'n', main = 'Centrality of R package dependencies') 80 | text(degree(g), betweenness(g), labels = V(g)$name) 81 | 82 | ## 83 | 84 | edges <- edges[edges$label != 'Suggests', ] 85 | deptree <- edges$dep[edges$src == 'igraph'] 86 | while (!all(edges$dep[edges$src %in% deptree] %in% deptree)) 87 | deptree <- union(deptree, edges$dep[edges$src %in% deptree]) 88 | 89 | g <- graph.data.frame(edges[edges$src %in% c('igraph', deptree), ]) 90 | plot(g) 91 | 92 | ## 93 | 94 | V(g)$label.color <- 'orange' 95 | V(g)$label.color[V(g)$name == 'igraph'] <- 'darkred' 96 | V(g)$label.color[V(g)$name %in% edges$dep[edges$src == 'igraph']] <- 'orangered' 97 | E(g)$color <- c('blue', 'green')[factor(E(g)$label)] 98 | plot(g, vertex.shape = 'none', edge.label = NA) 99 | 100 | ## 101 | 102 | tkplot(g, edge.label = NA) 103 | rglplot(g) 104 | 105 | ## 106 | 107 | library(visNetwork) 108 | nodes <- get.data.frame(g, 'vertices') 109 | names(nodes) <- c('id', 'color') 110 | 111 | ## 112 | 113 | edges <- get.data.frame(g) 114 | visNetwork(nodes, edges) 115 | 116 | g <- dominator.tree(g, root = 'igraph')$domtree 117 | plot(g, layout = layout.reingold.tilford(g, root = 'igraph'), vertex.shape = 'none') 118 | 119 | ## 120 | 121 | library(miniCRAN) 122 | pkgs <- pkgAvail() 123 | pkgDep('igraph', availPkgs = pkgs, suggests = FALSE, includeBasePkgs = TRUE) 124 | plot(makeDepGraph('igraph', pkgs, suggests = FALSE, includeBasePkgs = TRUE)) 125 | -------------------------------------------------------------------------------- /12 - Analysing Time-series.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #12: Analyzing Time-series. pp. 281-296. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. 
The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library(hflights) 31 | library(data.table) 32 | 33 | dt <- data.table(hflights) 34 | dt[, date := ISOdate(Year, Month, DayofMonth)] 35 | 36 | daily <- dt[, list( 37 | N = .N, 38 | Delays = sum(ArrDelay, na.rm = TRUE), 39 | Cancelled = sum(Cancelled), 40 | Distance = mean(Distance) 41 | ), by = date] 42 | 43 | str(daily) 44 | 45 | ## 46 | 47 | plot(ts(daily)) 48 | 49 | ## 50 | 51 | setorder(daily, date) 52 | plot(ts(daily)) 53 | 54 | plot(ts(daily$N, start = 2011, frequency = 365), main = 'Number of flights from Houston in 2011') 55 | 56 | ## 57 | 58 | plot(decompose(ts(daily$N, frequency = 7))) 59 | 60 | ## 61 | 62 | setNames(decompose(ts(daily$N, frequency = 7))$figure, weekdays(daily$date[1:7])) 63 | 64 | decompose(ts(daily$N, frequency = 365)) 65 | 66 | nts <- ts(daily$N, frequency = 7) 67 | fit <- HoltWinters(nts, beta = FALSE, gamma = FALSE) 68 | plot(fit) 69 | 70 | ## 71 | 72 | fit <- HoltWinters(nts) 73 | plot(fit) 74 | 75 | library(forecast) 76 | forecast(fit) 77 | 78 | ## 79 | 80 | plot(forecast(HoltWinters(nts), 31)) 81 | 82 | ## 83 | 84 | auto.arima(nts) 85 | 86 | ## 87 | 88 | auto.arima(nts, approximation = FALSE) 89 | 90 | plot(forecast(auto.arima(nts, D = 1, approximation = FALSE), 31)) 91 | 92 | ## 93 | 94 | cts <- ts(daily$Cancelled) 95 | auto.arima(cts) 96 | 97 | library(tsoutliers) 98 | outliers <- tso(cts, tsmethod = 'arima', args.tsmethod = list(order = c(1, 1, 2))) 99 | 100 | ## 101 | 102 | plot(outliers) 103 | 104 | plot(tso(ts(daily$Cancelled))) 105 | 106 | ## 107 | 108 | dfc <- as.data.frame(daily[, c('date', 'Cancelled'), with = FALSE]) 109 | 110 | library(AnomalyDetection) 111 | AnomalyDetectionTs(dfc, plot = TRUE)$plot 112 | 113 | library(zoo) 114 | zd <- zoo(daily[, -1, with = FALSE], daily[[1]]) 115 | 116 | ## 117 | 118 | plot(zd) 119 | 120 | ## 121 | 122 | plot(cumsum(zd)) 123 | -------------------------------------------------------------------------------- /13 - Data Around Us.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #13: Data Around Us. pp. 297-322. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 
16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library(hflights) 31 | library(data.table) 32 | dt <- data.table(hflights)[, list( 33 | N = .N, 34 | Cancelled = sum(Cancelled), 35 | Distance = Distance[1], 36 | TimeVar = sd(ActualElapsedTime, na.rm = TRUE), 37 | ArrDelay = mean(ArrDelay, na.rm = TRUE)) , by = Dest] 38 | 39 | ## 40 | 41 | str(dt) 42 | 43 | library(ggmap) 44 | (h <- geocode('Houston, TX')) 45 | 46 | ## 47 | 48 | h <- data.frame(lat = 29.7630556, lon = -95.36306556) 49 | 50 | dt[, c('lon', 'lat') := geocode(Dest)] 51 | 52 | str(setDF(dt)) 53 | 54 | par(mar=(c(0,0,1,0))) 55 | library(maps) 56 | map('state') 57 | title('Flight destinations from Houston,TX') 58 | 59 | ## 60 | 61 | points(h$lon, h$lat, col = 'blue', pch = 13) 62 | points(dt$lon, dt$lat, col = 'red', pch = 19) 63 | 64 | text(dt$lon, dt$lat + 1, labels = dt$Dest, cex = 0.7) 65 | 66 | ## 67 | 68 | map('state') 69 | title('Frequent flight destinations from Houston,TX') 70 | points(h$lon, h$lat, col = 'blue', pch = 13) 71 | points(dt$lon, dt$lat, col = rgb(1, 0, 0, dt$N/max(dt$N)), pch = 19) 72 | legend('bottomright', legend = round(quantile(dt$N)), col = rgb(1, 0, 0, quantile(dt$N)/max(dt$N)), pch = 19, box.col = NA) 73 | 74 | ## 75 | 76 | str(map_data <- map('state', plot = FALSE, fill = TRUE)) 77 | 78 | grep('^washington', map_data$names, value = TRUE) 79 | 80 | states <- sapply(strsplit(map_data$names, ':'), '[[', 1) 81 | 82 | library(maptools) 83 | us <- map2SpatialPolygons(map_data, IDs = states, proj4string = CRS('+proj=longlat +datum=WGS84')) 84 | 85 | ## ## alternative way of getting the data in the right form: 86 | ## library(raster) 87 | ## us <- getData('GADM', country = 'USA', level = 1) 88 | ## library(rgeos) 89 | ## us <- gSimplify(us, tol = 0.01, topologyPreserve = TRUE) 90 | 91 | plot(us) 92 | 93 | library(sp) 94 | dtp <- SpatialPointsDataFrame(dt[, c('lon', 'lat')], dt, proj4string = CRS('+proj=longlat +datum=WGS84')) 95 | str(sp::over(us, dtp)) 96 | 97 | ## 98 | 99 | ## ## alternative way of getting state info 100 | ## geocode('LAX') 101 | ## geocode('LAX', 'more') 102 | ## geocode('LAX', 'all')$results[[1]]$address_components[[6]]$short_name 103 | 104 | str(sapply(sp::over(us, dtp, returnList = TRUE), function(x) sum(x$Cancelled))) 105 | 106 | dtp <- SpatialPointsDataFrame(dt[, c('lon', 'lat')], dt[, 'Cancelled', drop = FALSE], proj4string = CRS('+proj=longlat +datum=WGS84')) 107 | str(cancels <- sp::over(us, dtp, fn = sum)) 108 | 109 | val <- cancels$Cancelled[match(states, row.names(cancels))] 110 | 111 | ## 112 | 113 | val[is.na(val)] <- 0 114 | 115 | map('state', col = rgb(1, 0, 0, sqrt(val/max(val))), fill = TRUE) 116 | title('Number of cancelled flights from Houston to US states') 117 | points(h$lon, h$lat, col = 'blue', pch = 13) 118 | legend('bottomright', legend = round(quantile(val)), fill = rgb(1, 0, 0, sqrt(quantile(val)/max(val))), box.col = NA) 119 | 120 | ## 121 | 122 | library(fields) 123 | out <- as.image(dt$ArrDelay, x = dt[, 
c('lon', 'lat')], nrow = 256, ncol = 256) 124 | 125 | table(is.na(out$z)) 126 | 127 | image(out) 128 | 129 | ## 130 | 131 | image(out, xlim = base::range(map_data$x, na.rm = TRUE), ylim = base::range(map_data$y, na.rm = TRUE)) 132 | 133 | look <- image.smooth(out, theta = .5) 134 | table(is.na(look$z)) 135 | 136 | ## 137 | 138 | image(look) 139 | 140 | out <- as.image(dt$ArrDelay, x = dt[, c('lon', 'lat')], nrow = 512, ncol = 512) 141 | look <- image.smooth(out, theta = 1) 142 | 143 | usa_data <- map('usa', plot = FALSE, region = 'main') 144 | p <- expand.grid(look$x, look$y) 145 | library(sp) 146 | n <- which(point.in.polygon(p$Var1, p$Var2, usa_data$x, usa_data$y) == 0) 147 | look$z[n] <- NA 148 | 149 | ## 150 | 151 | map('usa') 152 | image(look, add = TRUE) 153 | map('state', lwd = 3, add = TRUE) 154 | title('Arrival delays of flights from Houston') 155 | points(dt$lon, dt$lat, pch = 19, cex = .5) 156 | points(h$lon, h$lat, pch = 13) 157 | 158 | library(deldir) 159 | map('usa') 160 | plot(deldir(dt$lon, dt$lat), wlines = 'tess', col = c('red', 'darkgray'), lwd = 2, add = TRUE, pch = 19) 161 | 162 | ## 163 | 164 | library(OpenStreetMap) 165 | map <- openmap(c(max(map_data$y, na.rm = TRUE), 166 | min(map_data$x, na.rm = TRUE)), 167 | c(min(map_data$y, na.rm = TRUE), 168 | max(map_data$x, na.rm = TRUE)), 169 | type = 'stamen-terrain') 170 | 171 | ## 172 | 173 | map <- openproj(map, projection = '+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs') 174 | 175 | plot(map) 176 | plot(deldir(dt$lon, dt$lat), wlines = "tess", col = c('red', 'black'), lwd = 2, add = TRUE, pch = 19, cex = 0.5) 177 | 178 | ## 179 | 180 | map <- openmap(c(max(map_data$y), 181 | min(map_data$x)), 182 | c(min(map_data$y), 183 | max(map_data$x)), 184 | type = 'cloudmade-1960') 185 | plot(map) 186 | 187 | ## 188 | 189 | cancels$state <- rownames(cancels) 190 | cancels$Cancelled[is.na(cancels$Cancelled)] <- 0 191 | 192 | library(googleVis) 193 | plot(gvisGeoChart(cancels, 'state', 'Cancelled', 194 | options = list( 195 | region = 'US', 196 | displayMode = 'regions', 197 | resolution = 'provinces'))) 198 | 199 | 200 | ## 201 | 202 | dt$LatLong <- paste(dt$lat, dt$lon, sep = ':') 203 | dt$tip <- apply(dt, 1, function(x) 204 | paste(names(dt), x, collapse = '
')) 205 | plot(gvisMap(dt, 'LatLong', tipvar = 'tip')) 206 | 207 | ## 208 | 209 | library(rCharts) 210 | map <- Leaflet$new() 211 | map$setView(as.numeric(dt[which(dt$Dest == 'MCI'), c('lat', 'lon')]), zoom = 4) 212 | 213 | ## 214 | 215 | 216 | for (i in 1:nrow(dt)) 217 | map$marker(c(dt$lat[i], dt$lon[i]), bindPopup = dt$tip[i]) 218 | map$show() 219 | 220 | library(leaflet) 221 | 222 | pal <- colorNumeric(palette = 'Blues', domain = countries$gdp_md_est) 223 | 224 | leaflet(us) %>% addProviderTiles("Acetate.terrain") %>% addPolygons( color = ~pal(gdp_md_est)) %>% addMarkers(lng = dt$lon, lat = dt$lat, popup = dt$tip) 225 | 226 | ## 227 | 228 | dt <- dt[point.in.polygon(dt$lon, dt$lat, usa_data$x, usa_data$y) == 1, ] 229 | 230 | ## 231 | 232 | library(diagram) 233 | library(scales) 234 | 235 | map('usa') 236 | title('Number of flights, cancellations and delays from Houston') 237 | image(look, add = TRUE) 238 | map('state', lwd = 3, add = TRUE) 239 | for (i in 1:nrow(dt)) { 240 | curvedarrow( 241 | from = rev(as.numeric(h)), 242 | to = as.numeric(dt[i, c('lon', 'lat')]), 243 | arr.pos = 1, 244 | arr.type = 'circle', 245 | curve = 0.1, 246 | arr.col = alpha('black', dt$N[i] / max(dt$N)), 247 | arr.length = dt$N[i] / max(dt$N), 248 | lwd = dt$Cancelled[i] / max(dt$Cancelled) * 25, 249 | lcol = alpha('black', dt$Cancelled[i] / max(dt$Cancelled))) 250 | } 251 | 252 | ## models 253 | 254 | dm <- dist(dt[, c('lon', 'lat')]) 255 | dm <- as.matrix(dm) 256 | idm <- 1/dm 257 | diag(idm) <- 0 258 | str(idm) 259 | 260 | library(ape) 261 | dt$TimeVar[is.na(dt$TimeVar)] <- 0 262 | Moran.I(dt$TimeVar, idm) 263 | 264 | ## 265 | 266 | library(spdep) 267 | idml <- mat2listw(idm) 268 | moran.test(dt$TimeVar, idml) 269 | 270 | idml <- mat2listw(idm, style = 'W') 271 | moran.test(dt$TimeVar, idml) 272 | -------------------------------------------------------------------------------- /14 - Analyzing the R Community.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #14: Analyzing the R Community. pp. 323-348. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 
23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library(XML) 31 | page <- htmlParse('http://www.r-project.org/foundation/donors.html') 32 | 33 | ## 34 | 35 | list <- unlist(xpathApply(page, "//h3[@id='supporting-members']/following-sibling::ul[1]/li", xmlValue)) 36 | str(list) 37 | 38 | supporterlist <- sub(' \\([a-zA-Z ]*\\)$', '', list) 39 | countrylist <- substr(list, nchar(supporterlist) + 3, nchar(list) - 1) 40 | 41 | tail(sort(prop.table(table(countrylist)) * 100), 5) 42 | 43 | countries <- as.data.frame(table(countrylist)) 44 | 45 | ## 46 | 47 | library(rworldmap) 48 | joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist', verbose = TRUE) 49 | 50 | library(ggmap) 51 | for (fix in c('Brasil', 'CZ', 'Danmark', 'NL')) { 52 | countrylist[which(countrylist == fix)] <- geocode(fix, output = 'more')$country 53 | } 54 | 55 | countries <- as.data.frame(table(countrylist)) 56 | countries <- joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist') 57 | 58 | ## 59 | 60 | mapCountryData(countries, 'Freq', catMethod = 'logFixedWidth', mapTitle = 'Number of R Foundation supporting members') 61 | 62 | ## 63 | 64 | packages <- readHTMLTable('http://cran.r-project.org/web/checks/check_summary.html', which = 2) 65 | 66 | maintainers <- sub('(.*) <(.*)>', '\\1', packages$' Maintainer') 67 | maintainers <- gsub('\u00a0', ' ', maintainers) 68 | str(maintainers) 69 | 70 | tail(sort(table(maintainers)), 8) 71 | 72 | ## 73 | 74 | N <- as.numeric(table(maintainers)) 75 | library(fitdistrplus) 76 | plotdist(N) 77 | 78 | descdist(N, boot = 1e3) 79 | 80 | ## 81 | 82 | (gparams <- fitdist(N, 'gamma')) 83 | 84 | ## 85 | 86 | gshape <- gparams$estimate[['shape']] 87 | grate <- gparams$estimate[['rate']] 88 | sum(rgamma(1e5, shape = gshape, rate = grate)) 89 | hist(rgamma(1e5, shape = gshape, rate = grate)) 90 | 91 | pgamma(2, shape = gshape, rate = grate) 92 | 93 | prop.table(table(N <= 2)) 94 | 95 | ## 96 | 97 | ploc <- min(N) 98 | pshp <- length(N) / sum(log(N) - log(ploc)) 99 | 100 | library(actuar) 101 | ppareto(2, pshp, ploc) 102 | 103 | fg <- fitdist(N, 'gamma') 104 | fw <- fitdist(N, 'weibull') 105 | fl <- fitdist(N, 'lnorm') 106 | fp <- fitdist(N, 'pareto', start = list(shape = 1, scale = 1)) 107 | par(mfrow = c(1, 2)) 108 | denscomp(list(fg, fw, fl, fp), addlegend = FALSE) 109 | qqcomp(list(fg, fw, fl, fp), legendtext = c('gamma', 'Weibull', 'Lognormal', 'Pareto')) 110 | 111 | ## 112 | 113 | length(unique(maintainers)) 114 | 115 | library(RCurl) 116 | url <- getURL('https://stat.ethz.ch/pipermail/r-help/') 117 | 118 | R.help.toc <- htmlParse(url) 119 | R.help.archives <- unlist(xpathApply(R.help.toc, '//table//td[3]/a', xmlAttrs), use.names = FALSE) 120 | 121 | dir.create('help-r') 122 | for (f in R.help.archives) 123 | download.file(url = paste0(url, f), file.path('help-r', f), method = 'curl') 124 | 125 | ## 126 | 127 | lines <- system("zgrep -E '^From: .* at .*' ./help-r/*.txt.gz", intern = TRUE) 128 | length(lines) 129 | length(unique(lines)) 130 | 131 | lines <- sub('.*From: ', '', lines) 132 | Rhelpers <- sub('.*\\((.*)\\)', '\\1', lines) 133 | 134 | tail(sort(table(Rhelpers)), 6) 135 | 136 | ## 137 | 138 | grep('Brian( D)? Ripley', names(table(Rhelpers)), value = TRUE) 139 | 140 | sum(grepl('Brian( D)? Ripley', Rhelpers))
141 | 142 | ## 143 | 144 | lines <- system("zgrep -E '^Date: [A-Za-z]{3}, [0-9]{1,2} [A-Za-z]{3} [0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2} [-+]{1}[0-9]{4}' ./help-r/*.txt.gz", intern = TRUE) 145 | length(lines) 146 | 147 | head(sub('.*Date: ', '', lines[1])) 148 | 149 | ## 150 | 151 | times <- strptime(sub('.*Date: ', '', lines), format = '%a, %d %b %Y %H:%M:%S %z') 152 | 153 | plot(table(format(times, '%Y')), type = 'l') 154 | 155 | library(data.table) 156 | Rhelp <- data.table(time = times) 157 | Rhelp[, H := hour(time)] 158 | Rhelp[, D := wday(time)] 159 | 160 | ## 161 | 162 | library(ggplot2) 163 | ggplot(na.omit(Rhelp[, .N, by = .(H, D)]), 164 | aes(x = factor(H), y = factor(D), size = N)) + geom_point() + 165 | ylab('Day of the week') + xlab('Hour of the day') + 166 | ggtitle('Number of mails posted on [R-help]') + 167 | theme_bw() + theme('legend.position' = 'top') 168 | 169 | tail(sort(prop.table(table(sub('.*([+-][0-9]{4}).*', '\\1', lines)))), 22) 170 | 171 | ## 172 | 173 | Rhelp[, date := as.Date(time)] 174 | Rdaily <- Rhelp[, .N, by = date] 175 | library(zoo) 176 | Rdaily <- zoo(Rdaily$N, Rdaily$date) 177 | 178 | plot(Rdaily) 179 | 180 | library(forecast) 181 | fit <- ets(Rdaily) 182 | 183 | ## 184 | 185 | forecast(fit, 1) 186 | 187 | plot(forecast(fit, 30), include = 365) 188 | 189 | ## 190 | 191 | lists <- rbindlist(list( 192 | data.frame(name = unique(supporterlist), list = 'supporter'), 193 | data.frame(name = unique(maintainers), list = 'maintainer'), 194 | data.frame(name = unique(Rhelpers), list = 'R-help'))) 195 | 196 | t <- table(lists$name, lists$list) 197 | table(rowSums(t)) 198 | 199 | library(Rcapture) 200 | descriptive(t) 201 | 202 | ## 203 | 204 | closedp(t) 205 | 206 | ## 207 | 208 | library(fbRads) 209 | 210 | ## register an application at 211 | ## https://developers.facebook.com/apps/ 212 | ## and get a token e.g. via 213 | ## library(httr) 214 | ## app <- oauth_app('facebook', 'your_app_id', 'your_app_secret') 215 | ## tkn <- oauth2.0_token( 216 | ## oauth_endpoints('facebook'), app, scope = 'ads_management', 217 | ## type = 'application/x-www-form-urlencoded') 218 | ## tkn <- tkn$credentials$access_token 219 | 220 | fbad_init(..., ...) 221 | fbad_get_search(fbacc = fbacc, q = 'rstats', type = 'adinterest') 222 | 223 | fbad_get_search(fbacc = fbacc, q = 'SPSS', type = 'adinterest') 224 | 225 | res <- fbad_get_search(fbacc = fbacc, q = 'programming language', type = 'adinterest') 226 | res <- res[order(res$audience_size, decreasing = TRUE), ] 227 | res[1:10, 1:3] 228 | 229 | ## 230 | 231 | library(twitteR) 232 | ## register an application and make a note of your consumer key & secret, 233 | ## also your access token and secret at 234 | ## https://apps.twitter.com 235 | setup_twitter_oauth(...)
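## a minimal, hypothetical sketch of the above call: these are the actual
## parameter names of twitteR::setup_twitter_oauth(), but the values are
## placeholders that you have to replace with your own app's credentials
## setup_twitter_oauth(
##     consumer_key = 'your_consumer_key',
##     consumer_secret = 'your_consumer_secret',
##     access_token = 'your_access_token',
##     access_secret = 'your_access_secret')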
236 | 237 | str(searchTwitter('#rstats', n = 1, resultType = 'recent')) 238 | 239 | ## 240 | 241 | tweets <- Rtweets(n = 500) 242 | 243 | ## 244 | 245 | length(strip_retweets(tweets)) 246 | 247 | tweets <- twListToDF(tweets) 248 | 249 | library(tm) 250 | corpus <- Corpus(VectorSource(tweets$text)) 251 | 252 | corpus <- tm_map(corpus, removeWords, stopwords('english')) 253 | corpus <- tm_map(corpus, content_transformer(tolower)) 254 | corpus <- tm_map(corpus, removePunctuation) 255 | corpus <- tm_map(corpus, stripWhitespace) 256 | 257 | corpus <- tm_map(corpus, removeWords, 'rstats') 258 | 259 | library(wordcloud) 260 | wordcloud(corpus) 261 | 262 | 263 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## [Mastering Data Analysis with R](http://bit.ly/mastering-R) 2 | 3 | This repository includes the example R source code and data files for the above-referenced book published by Packt Publishing in 2015. 4 | 5 | The above R files are identical to the R code examples found in the book except for the leading `>` and `+` characters, which stand for the prompt in the R console. As the book included both the R commands and output, the prompt was also shown in the examples so that the reader can easily distinguish the calls from the returned values: 6 | 7 | ```r 8 | > set.seed(42) 9 | > data.frame( 10 | + A = runif(2), 11 | + B = sample(letters, 2)) 12 | A B 13 | 1 0.9148060 h 14 | 2 0.9370754 u 15 | ``` 16 | 17 | In this repository, both the output and the prompt were intentionally removed, along with arbitrary line-breaks, so that you can copy and paste the R expressions into the R console in a more convenient and seamless way. 18 | 19 | The code chunks are grouped by the printed pages of the book. Two hash signs at the beginning of a line stand for a page break, while an extra empty line between the code chunks represents one or more paragraphs in the original book between the examples for easier navigation. Sometimes extra instructions starting with a double hash are also provided on how to run the expressions below. 20 | 21 | Please find more information on the book at http://bit.ly/mastering-R and you can contact me on [Twitter](https://twitter.com/daroczig) and [GitHub](https://github.com/daroczig) with any questions or feedback. 22 | 23 | I hope you enjoy this book and find it useful! 24 | -------------------------------------------------------------------------------- /data/SMART_2013.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daroczig/Mastering-Data-Analysis-with-R/668f85080a95169fcecdd8ff92341ff5261036f9/data/SMART_2013.RData --------------------------------------------------------------------------------
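A minimal, hypothetical usage sketch, assuming your working directory is the root of this repository: the bundled dataset can be restored into your R session with `load()`, which recreates the stored objects under their original names:

```r
## restore the bundled dataset and list the restored objects
load('data/SMART_2013.RData')
ls()
```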