├── 01 - Hello, Data.R ├── 02 - Getting Data from the Web.R ├── 03 - Filtering and Summarizing Data.R ├── 04 - Restructuring Data.R ├── 05 - Building Models.R ├── 06 - Beyond the Linear Trend Line.R ├── 07 - Unstructured Data.R ├── 08 - Polishing Data.R ├── 09 - From Big to Smaller Data.R ├── 10 - Classification and Clustering.R ├── 11 - Social Network Analysis of the R ecosystem.R ├── 12 - Analysing Time-series.R ├── 13 - Data Around Us.R ├── 14 - Analyzing the R Community.R ├── README.md └── data └── SMART_2013.RData /01 - Hello, Data.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #1: Hello, Data! pp. 1-36. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library('hflights') 31 | write.csv(hflights, 'hflights.csv', row.names = FALSE) 32 | 33 | ## 34 | 35 | str(hflights) 36 | 37 | ## 38 | 39 | system.time(read.csv('hflights.csv')) 40 | 41 | colClasses <- sapply(hflights, class) 42 | system.time(read.csv('hflights.csv', colClasses = colClasses)) 43 | 44 | library(microbenchmark) 45 | f <- function() read.csv('hflights.csv') 46 | g <- function() read.csv('hflights.csv', colClasses = colClasses, nrows = 227496, comment.char = '') 47 | res <- microbenchmark(f(), g()) 48 | res 49 | 50 | ## p. 3 infobox 51 | library(R.utils) 52 | countLines('hflights.csv') 53 | 54 | ## 55 | 56 | boxplot(res, xlab = '', main = expression(paste('Benchmarking ', italic('read.table')))) 57 | 58 | library(sqldf) 59 | system.time(read.csv.sql('hflights.csv')) 60 | 61 | ## 62 | 63 | library(ff) 64 | system.time(read.csv.ffdf(file = 'hflights.csv')) 65 | 66 | library(bigmemory) 67 | system.time(read.big.matrix('hflights.csv', header = TRUE)) 68 | 69 | ## 70 | 71 | library(data.table) 72 | system.time(dt <- fread('hflights.csv')) 73 | 74 | df <- as.data.frame(dt) 75 | 76 | ## 77 | 78 | is.data.frame(dt) 79 | 80 | ## p. 6 81 | .read.csv.orig <- function() read.csv('hflights.csv') 82 | .read.csv.opt <- function() read.csv('hflights.csv', colClasses = colClasses, nrows = 227496, comment.char = '', stringsAsFactors = FALSE) 83 | .read.csv.sql <- function() read.csv.sql('hflights.csv') 84 | .read.csv.ffdf <- function() read.csv.ffdf(file = 'hflights.csv') 85 | .read.big.matrix <- function() read.big.matrix('hflights.csv', header = TRUE) 86 | .fread <- function() fread('hflights.csv') 87 | 88 | res <- microbenchmark(.read.csv.orig(), .read.csv.opt(), .read.csv.sql(), .read.csv.ffdf(), .read.big.matrix(), .fread(), times = 10) 89 | 90 | print(res, digits = 6) 91 | 92 | ## 93 | 94 | df <- read.csv.sql('hflights.csv', "select * from file where Dest = '\"BNA\"'") 95 | 96 | ## 97 | 98 | df <- read.csv.sql('hflights.csv', "select * from file where Dest = 'BNA'", filter = 'tr -d ^\\" ') 99 | 100 | str(df) 101 | 102 | system.time(read.csv.sql('hflights.csv', "select * from file")) 103 | system.time(read.csv.sql('hflights.csv', "select * from file where Dest = '\"BNA\"'")) 104 | 105 | system.time(system('cat hflights.csv | grep BNA', intern = TRUE)) 106 | 107 | ## 108 | 109 | sqldf("attach 'hflights_db' as new") 110 | 111 | read.csv.sql('hflights.csv', sql = 'create table hflights as select * from file', dbname = 'hflights_db') 112 | 113 | system.time(df <- sqldf("select * from hflights where Dest = '\"BNA\"'", dbname = "hflights_db")) 114 | 115 | ## 116 | 117 | ## to be pasted in the MySQL command-line: 118 | create database hflights_db; 119 | grant all privileges on hflights_db.* to 'user'@'localhost' identified by 'password'; 120 | flush privileges; 121 | exit; 122 | 123 | ## 124 | 125 | library(RMySQL) 126 | con <- dbConnect(dbDriver('MySQL'), user = 'user', password = 'password', dbname = 'hflights_db') 127 | 128 | dbWriteTable(con, 'hflights', hflights) 129 | dbListTables(con) 130 | 131 | system.time(dbReadTable(con, 'hflights')) 132 | 133 | system.time(dbGetQuery(con, 'select * from hflights')) 134 | 135 | ## 136 | 137 | options('sqldf.connection' = con) 138 | system.time(sqldf('select * from hflights')) 139 | 140 | system.time(sqldf('SELECT * FROM hflights WHERE Dest = "BNA"')) 141 | 142 | ## 143 | 144 | dbSendQuery(con, 'CREATE INDEX Dest_idx ON hflights (Dest(3));') 145 | 146 | system.time(sqldf('SELECT * FROM hflights WHERE Dest = "BNA"')) 147 | 148 | options(sqldf.driver = 'SQLite') 149 | sqldf("CREATE INDEX Dest_idx ON hflights (Dest);", dbname = "hflights_db") 150 | system.time(sqldf("select * from hflights where Dest = '\"BNA\"'", dbname = "hflights_db")) 151 | 152 | ## 153 | 154 | ## to be pasted in the command-line: 155 | createuser --pwprompt user 156 | createdb hflights_db 157 | psql 158 | 159 | ## to be pasted in the PostgreSQL command-line: 160 | \du 161 | \list 162 | grant all privileges on hflights_db to user 163 | \q 164 | 165 | ## 166 | 167 | library(RPostgreSQL) 168 | 169 | ## 170 | 171 | con <- dbConnect(dbDriver('PostgreSQL'), user = 'user', password = 'password', dbname = 'hflights_db') 172 | 173 | dbListTables(con) 174 | dbExistsTable(con, 'hflights') 175 | 176 | dbWriteTable(con, 'hflights', hflights) 177 | system.time(dbReadTable(con, 'hflights')) 178 | 179 | system.time(dbGetQuery(con, "SELECT * FROM hflights WHERE \"Dest\" = 'BNA';")) 180 | 181 | ## 182 | 183 | ## to be pasted in bash to install ROracle on Arch Linux 184 | R CMD INSTALL --configure-args='--with-oci-lib=/usr/include/ --with-oci-inc=/usr/share/licenses/oracle-instantclient-basic' ROracle_1.1-11.tar.gz 185 | 186 | library(ROracle) 187 | con <- dbConnect(dbDriver('Oracle'), user = 'pmuser', password = 'oracle', dbname = '//192.168.0.16:1521/PDB1') 188 | summary(con) 189 | 190 | dbListTables(con) 191 | 192 | ## 193 | 194 | dbWriteTable(con, 'hflights', hflights) 195 | system.time(dbReadTable(con, 'hflights')) 196 | 197 | system.time(dbGetQuery(con, "SELECT * FROM \"hflights\" WHERE \"Dest\" = 'BNA'")) 198 | 199 | dbSendQuery(con, 'CREATE INDEX Dest_idx ON "hflights" ("Dest")') 200 | 201 | ## 202 | 203 | system.time(dbGetQuery(con, "SELECT * FROM \"hflights\" WHERE \"Dest\" = 'BNA'")) 204 | 205 | ## 206 | 207 | ## the content of /etc/odbcinst.ini on Linux 208 | [MySQL] 209 | Description = ODBC Driver for MySQL 210 | Driver = /usr/lib/libmyodbc.so 211 | Setup = /usr/lib/libodbcmyS.so 212 | FileUsage = 1 213 | 214 | ## the content of /etc/odbc.ini on Linux 215 | [hflights] 216 | Description = MySQL hflights test 217 | Driver = MySQL 218 | Server = localhost 219 | Database = hflights_db 220 | Port = 3306 221 | Socket = /var/run/mysqld/mysqld.sock 222 | 223 | ## 224 | 225 | library(RODBC) 226 | con <- odbcConnect("hflights", uid = "user", pwd = "password") 227 | 228 | system.time(hflights <- sqlQuery(con, "select * from hflights")) 229 | 230 | sqlDrop(con, 'hflights') 231 | sqlSave(con, hflights, 'hflights') 232 | 233 | ## 234 | 235 | close(con) 236 | 237 | library(dbConnect) 238 | DatabaseConnect() 239 | 240 | ## 241 | 242 | library(devtools) 243 | install_github('bigrquery', 'hadley') 244 | -------------------------------------------------------------------------------- /02 - Getting Data from the Web.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #2: Getting Data from the Web. pp. 37-64. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | str(read.csv('http://opengeocode.org/download/CCurls.txt')) 31 | 32 | ## 33 | 34 | library(RCurl) 35 | df <- read.csv(text = getURL('https://data.consumerfinance.gov/api/views/x94z-ydhh/rows.csv?accessType=DOWNLOAD')) 36 | str(df) 37 | sort(table(df$Product)) 38 | 39 | ## 40 | 41 | library(rjson) 42 | u <- 'http://data.consumerfinance.gov/api/views' 43 | fromJSON(file = u) 44 | 45 | ## 46 | 47 | res <- fromJSON(file = paste0(u, '/25ei-6bcr/rows.json?max_rows=5')) 48 | names(res) 49 | 50 | res <- res$data 51 | class(res) 52 | 53 | df <- as.data.frame(t(sapply(res, function(x) unlist(x[-13])))) 54 | str(df) 55 | 56 | ## 57 | 58 | library(plyr) 59 | df <- ldply(res, function(x) unlist(x[-13])) 60 | 61 | names(df) <- sapply(res$meta$view$columns, `[`, 'name')[-13] 62 | 63 | ## 64 | 65 | library(XML) 66 | doc <- xmlParse(paste0(u, '/25ei-6bcr/rows.xml?max_rows=5')) 67 | df <- xmlToDataFrame(nodes = getNodeSet(doc,'//response/row/row')) 68 | str(df) 69 | 70 | is.number <- function(x) 71 | all(!is.na(suppressWarnings(as.numeric(as.character(x))))) 72 | for (n in names(df)) 73 | if (is.number(df[, n])) 74 | df[, n] <- as.numeric(as.character(df[, n])) 75 | 76 | ## 77 | 78 | doc <- getURL(paste0(u, '/25ei-6bcr/rows?max_rows=5'), httpheader = c(Accept = 'text/html')) 79 | 80 | res <- readHTMLTable(doc) 81 | 82 | df <- res[[1]] 83 | df <- readHTMLTable(doc, which = 1) 84 | 85 | res <- readHTMLTable('http://cran.r-project.org/web/packages/available_packages_by_name.html') 86 | str(res) 87 | 88 | ## 89 | 90 | library(wordcloud) 91 | wordcloud(res[[1]][, 2]) 92 | 93 | ## 94 | 95 | page <- htmlParse('http://cran.r-project.org/web/views/WebTechnologies.html') 96 | res <- unlist(xpathApply(page, "//h3[text()='CRAN packages:']/following-sibling::ul[1]/li", xmlValue)) 97 | str(res) 98 | 99 | ## 100 | 101 | res <- xpathSApply(page, "//h3[text()='CRAN packages:']/following-sibling::ul[1]/li", xmlValue) 102 | 103 | xpathSApply(page, "//h3[text()='CRAN packages:']/following-sibling::ul[1]/li/a", xmlAttrs, 'href') 104 | 105 | ## 106 | 107 | detach('package:rjson') 108 | library(RSocrata) 109 | 110 | ## 111 | 112 | df <- read.socrata(paste0(u, '/25ei-6bcr')) 113 | str(df) 114 | 115 | ## 116 | 117 | library(quantmod) 118 | tail(getSymbols('A', env = NULL)) 119 | 120 | getFX('USD/EUR') 121 | 122 | tail(USDEUR) 123 | 124 | ## 125 | 126 | methods(getSymbols) 127 | 128 | str(stockSymbols()) 129 | 130 | ## 131 | 132 | library(Quandl) 133 | Quandl('SEC/DIV_A') 134 | 135 | ## 136 | 137 | attr(Quandl('SEC/DIV_A', meta = TRUE), 'meta')$frequency 138 | 139 | ## 140 | 141 | library(devtools) 142 | install_bitbucket('GTrendsR', 'persican') 143 | 144 | ## 145 | 146 | library(GTrendsR) 147 | 148 | conn <- gconnect('username', 'password') 149 | df <- gtrends(conn, query = 'how to install R') 150 | tail(df) 151 | plot(df) 152 | 153 | ## 154 | 155 | library(weatherData) 156 | getWeatherForDate('London', start_date = Sys.Date()-7, end_date = Sys.Date()) 157 | -------------------------------------------------------------------------------- /03 - Filtering and Summarizing Data.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #3: Filtering and Summarizing Data. pp. 65-84. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library(sqldf) 31 | sqldf('SELECT * FROM mtcars WHERE am=1 AND vs=1') 32 | 33 | ## 34 | 35 | subset(mtcars, am == 1 & vs == 1) 36 | sqldf('SELECT * FROM mtcars WHERE am=1 AND vs=1', row.names = TRUE) 37 | 38 | identical( 39 | sqldf('SELECT * FROM mtcars WHERE am=1 AND vs=1', row.names = TRUE), 40 | subset(mtcars, am == 1 & vs == 1) 41 | ) 42 | 43 | subset(mtcars, am == 1 & vs == 1, select = hp:wt) 44 | 45 | ## 46 | 47 | library(hflights) 48 | system.time(sqldf("SELECT * FROM hflights WHERE Dest == 'BNA'", row.names = TRUE)) 49 | system.time(subset(hflights, Dest == 'BNA')) 50 | 51 | ## 52 | 53 | library(dplyr) 54 | system.time(filter(hflights, Dest == 'BNA')) 55 | 56 | str(select(filter(hflights, Dest == 'BNA'), DepTime:ArrTime)) 57 | 58 | mtcars$rownames <- rownames(mtcars) 59 | select(filter(mtcars, hp > 300), c(rownames, hp)) 60 | 61 | ## 62 | 63 | library(data.table) 64 | hflights_dt <- data.table(hflights) 65 | hflights_dt[, rownames := rownames(hflights)] 66 | system.time(hflights_dt[Dest == 'BNA']) 67 | 68 | ## 69 | 70 | str(hflights_dt[Dest == 'BNA', list(DepTime, ArrTime)]) 71 | 72 | str(hflights_dt[Dest == 'BNA', c('DepTime', 'ArrTime'), with = FALSE]) 73 | 74 | aggregate(hflights$Diverted, by = list(hflights$DayOfWeek), FUN = mean) 75 | 76 | ## 77 | 78 | with(hflights, aggregate(Diverted, by = list(DayOfWeek), FUN = mean)) 79 | 80 | aggregate(Diverted ~ DayOfWeek, data = hflights, FUN = mean) 81 | 82 | ## 83 | 84 | tapply(hflights$Diverted, hflights$DayOfWeek, mean) 85 | 86 | ## 87 | 88 | library(plyr) 89 | ddply(hflights, .(DayOfWeek), function(x) mean(x$Diverted)) 90 | 91 | ddply(hflights, .(DayOfWeek), summarise, Diverted = mean(Diverted)) 92 | 93 | ## 94 | 95 | hflights_DayOfWeek <- group_by(hflights, DayOfWeek) 96 | 97 | str(attributes(hflights_DayOfWeek)) 98 | 99 | ## 100 | 101 | dplyr::summarise(hflights_DayOfWeek, mean(Diverted)) 102 | 103 | hflights_dt[, mean(Diverted), by = DayOfWeek] 104 | 105 | ## 106 | 107 | setkey(hflights_dt, 'DayOfWeek') 108 | hflights_dt[, mean(Diverted), by = DayOfWeek] 109 | 110 | hflights_dt[, list('mean(Diverted)' = mean(Diverted)), by = DayOfWeek] 111 | 112 | ## 113 | 114 | AGGR1 <- function() aggregate(hflights$Diverted, by = list(hflights$DayOfWeek), FUN = mean) 115 | AGGR2 <- function() with(hflights, aggregate(Diverted, by = list(DayOfWeek), FUN = mean)) 116 | AGGR3 <- function() aggregate(Diverted ~ DayOfWeek, data = hflights, FUN = mean) 117 | TAPPLY <- function() tapply(hflights$Diverted, hflights$DayOfWeek, mean) 118 | PLYR1 <- function() ddply(hflights, .(DayOfWeek), function(x) mean(x$Diverted)) 119 | PLYR2 <- function() ddply(hflights, .(DayOfWeek), summarise, Diverted = mean(Diverted)) 120 | DPLYR <- function() dplyr::summarise(hflights_DayOfWeek, mean(Diverted)) 121 | 122 | DPLYR_ALL <- function() { 123 | hflights_DayOfWeek <- group_by(hflights, DayOfWeek) 124 | dplyr::summarise(hflights_DayOfWeek, mean(Diverted)) 125 | } 126 | 127 | hflights_dt_nokey <- data.table(hflights) 128 | 129 | key(hflights_dt_nokey) 130 | 131 | ## 132 | 133 | DT <- function() hflights_dt_nokey[, mean(FlightNum), by = DayOfWeek] 134 | DT_KEY <- function() hflights_dt[, mean(FlightNum), by = key(hflights_dt)] 135 | DT_ALL <- function() { 136 | hflights_dt_nokey <- data.table(hflights) 137 | setkey(hflights_dt_nokey, 'DayOfWeek') 138 | hflights_dt_nokey[, mean(FlightNum), by = DayOfWeek] 139 | } 140 | 141 | library(microbenchmark) 142 | res <- microbenchmark(AGGR1(), AGGR2(), AGGR3(), TAPPLY(), PLYR1(), PLYR2(), DPLYR(), DPLYR_ALL(), DT(), DT_KEY(), DT_ALL(), times = 10) 143 | print(res, digits = 3) 144 | 145 | ## 146 | 147 | autoplot(res) 148 | 149 | dplyr::summarise(group_by(hflights_dt, DayOfWeek), mean(Diverted)) 150 | 151 | ## 152 | 153 | ddply(hflights, .(DayOfWeek), summarise, n = length(Diverted)) 154 | 155 | ## 156 | 157 | ddply(hflights, .(DayOfWeek), nrow) 158 | 159 | table(hflights$DayOfWeek) 160 | 161 | count(hflights, 'DayOfWeek') 162 | 163 | ## 164 | 165 | dplyr::summarise(hflights_DayOfWeek, n()) 166 | 167 | attr(hflights_DayOfWeek, 'group_sizes') 168 | 169 | hflights_dt[, .N, by = list(DayOfWeek)] 170 | -------------------------------------------------------------------------------- /04 - Restructuring Data.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #4: Restructuring Data. pp. 85-106. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | (m <- matrix(1:9, 3)) 31 | 32 | ## 33 | 34 | t(m) 35 | 36 | ## 37 | 38 | library(dplyr) 39 | library(hflights) 40 | str(select(hflights, ends_with('delay'))) 41 | 42 | str(select(hflights, contains('T', ignore.case = FALSE))) 43 | 44 | ## 45 | 46 | str(select(hflights, matches('^[[:alpha:]]{5,6}$'))) 47 | 48 | table(nchar(names(hflights))) 49 | 50 | names(select(hflights, -matches('^[[:alpha:]]{7,8}$'))) 51 | 52 | ## 53 | 54 | str(arrange(hflights, ActualElapsedTime)) 55 | 56 | ## 57 | 58 | hflights %>% arrange(ActualElapsedTime) %>% str 59 | 60 | hflights %>% 61 | arrange(ActualElapsedTime) %>% 62 | select(ActualElapsedTime, Dest) %>% 63 | subset(Dest != 'AUS') %>% 64 | head %>% 65 | str 66 | 67 | ## 68 | 69 | library(data.table) 70 | str(head(data.table(hflights, key = 'ActualElapsedTime')[Dest != 'AUS', c('ActualElapsedTime', 'Dest'), with = FALSE])) 71 | 72 | str(head(na.omit(data.table(hflights, key = 'ActualElapsedTime'))[Dest != 'AUS', .(ActualElapsedTime, Dest)])) 73 | 74 | system.time(str(head(data.table(na.omit(hflights), key = 'ActualElapsedTime')[Dest != 'AUS', c('ActualElapsedTime', 'Dest'), with = FALSE]))) 75 | system.time(str(head(na.omit(data.table(hflights, key = 'ActualElapsedTime'))[Dest != 'AUS', c('ActualElapsedTime', 'Dest'), with = FALSE]))) 76 | 77 | ## 78 | 79 | hflights_dt <- data.table(hflights) 80 | hflights_dt[, DistanceKMs := Distance / 0.62137] 81 | 82 | system.time(hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137) 83 | system.time(hflights_dt[, DistanceKMs := Distance / 0.62137]) 84 | 85 | ## 86 | 87 | library(pryr) 88 | hflights_dt <- data.table(hflights) 89 | address(hflights_dt) 90 | 91 | hflights_dt$DistanceKMs <- hflights_dt$Distance / 0.62137 92 | address(hflights_dt) 93 | 94 | ## 95 | 96 | hflights_dt <- data.table(hflights) 97 | address(hflights_dt) 98 | hflights_dt[, DistanceKMs := Distance / 0.62137] 99 | address(hflights_dt) 100 | 101 | system.time(within(hflights_dt, DistanceKMs <- Distance / 0.62137)) 102 | 103 | hflights_dt[, c('DistanceKMs', 'DiastanceFeets') := list(Distance / 0.62137, Distance * 5280)] 104 | 105 | ## 106 | 107 | carriers <- unique(hflights_dt$UniqueCarrier) 108 | hflights_dt[, paste('carrier', carriers, sep = '_') := lapply(carriers, function(x) as.numeric(UniqueCarrier == x))] 109 | str(hflights_dt[, grep('^carrier', names(hflights_dt)), with = FALSE]) 110 | 111 | ### 112 | 113 | hflights <- hflights %>% 114 | mutate(DistanceKMs = Distance / 0.62137) 115 | 116 | hflights <- mutate(hflights, DistanceKMs = Distance / 0.62137) 117 | 118 | ## 119 | 120 | (wdays <- data.frame( 121 | DayOfWeek = 1:7, 122 | DayOfWeekString = c('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday') 123 | )) 124 | 125 | system.time(merge(hflights, wdays)) 126 | system.time(merge(hflights_dt, wdays, by = 'DayOfWeek')) 127 | 128 | ## 129 | 130 | weekdays(as.Date(with(hflights, paste(Year, Month, DayofMonth, sep = '-')))) 131 | 132 | ## 133 | 134 | library(reshape2) 135 | 136 | ## 137 | 138 | head(melt(hflights)) 139 | 140 | hflights_melted <- melt(hflights, id.vars = 0, measure.vars = c('ActualElapsedTime', 'AirTime')) 141 | str(hflights_melted) 142 | 143 | ## 144 | 145 | library(ggplot2) 146 | ggplot(hflights_melted, aes(x = variable, y = value)) + geom_boxplot() 147 | 148 | ## 149 | 150 | hflights_melted <- melt(hflights, id.vars = 'Month', measure.vars = c('ActualElapsedTime', 'AirTime')) 151 | (df <- dcast(hflights_melted, Month ~ variable, fun.aggregate = mean, na.rm = TRUE)) 152 | 153 | ggplot(melt(df, id.vars = 'Month')) + geom_line(aes(x = Month, y = value, color = variable)) + scale_x_continuous(breaks = 1:12) + theme_bw() + theme(legend.position = 'top') 154 | 155 | ## 156 | 157 | hflights_melted <- melt(add_margins(hflights, 'Month'), id.vars = 'Month', measure.vars = c('ActualElapsedTime', 'AirTime')) 158 | (df <- dcast(hflights_melted, Month ~ variable, fun.aggregate = mean, na.rm = TRUE)) 159 | 160 | ## 161 | 162 | library(tidyr) 163 | str(gather(hflights[, c('Month', 'ActualElapsedTime', 'AirTime')], variable, value, -Month)) 164 | -------------------------------------------------------------------------------- /05 - Building Models.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #5: Building Models. pp. 107-126. 6 | ## -- written by Renata Nemeth and Gergely Toth 7 | ## 8 | ## 9 | ## This file includes the code chunks from the above mentioned 10 | ## chapter except for the leading ">" and "+" characters, which 11 | ## stand for the prompt in the R console. The prompt was 12 | ## intentionally removed here along with arbitrary line-breaks, 13 | ## so that you copy and paste the R expressions to the R console 14 | ## in a more convenient and seamless way. 15 | ## 16 | ## Code chunks are grouped here by the printed pages of the book. 17 | ## Two hash signs at the beginning of a line stands for a page 18 | ## break, while an extra empty line between the code chunks 19 | ## represents one or more paragraphs in the original book between 20 | ## the examples for easier navigation. 21 | ## 22 | ## Sometimes extra instructions starting with a double hash are 23 | ## also provided on how to run the below expressions. 24 | ## 25 | ## 26 | ## Find more information on the book at http://bit.ly/mastering-R 27 | ## and you can contact me on Twitter and GitHub by the @daroczig 28 | ## handle, or mail me at daroczig@rapporter.net 29 | ## 30 | 31 | library(gamlss.data) 32 | data(usair) 33 | 34 | model.0 <- lm(y ~ x3, data = usair) 35 | summary(model.0) 36 | 37 | ## 38 | 39 | plot(y ~ x3, data = usair, cex.lab = 1.5) 40 | abline(model.0, col = 'red', lwd = 2.5) 41 | legend('bottomright', legend = 'y ~ x3', lty = 1, col = 'red', lwd = 2.5, title = 'Regression line') 42 | 43 | ## 44 | 45 | usair$prediction <- predict(model.0) 46 | usair$residual<- resid(model.0) 47 | plot(y ~ x3, data = usair, cex.lab = 1.5) 48 | abline(model.0, col = 'red', lwd = 2.5) 49 | segments(usair$x3, usair$y, usair$x3, usair$prediction, col = 'blue', lty = 2) 50 | legend('bottomright', legend = c('y ~ x3', 'residuals'), lty = c(1, 2), col = c('red', 'blue'), lwd = 2.5, title = 'Regression line') 51 | 52 | ## 53 | 54 | model.1 <- update(model.0, . ~ . + x2) 55 | summary(model.1) 56 | 57 | ## 58 | 59 | as.numeric(predict(model.1, data.frame(x2 = 150, x3 = 400))) 60 | 61 | -0.05661 * 400 + 0.08243 * 150 + 26.32508 62 | 63 | library(scatterplot3d) 64 | plot3d <- scatterplot3d(usair$x3, usair$x2, usair$y, pch = 19, type = 'h', highlight.3d = TRUE, main = '3-D Scatterplot') 65 | 66 | ## 67 | 68 | plot3d$plane3d(model.1, lty = 'solid', col = 'red') 69 | 70 | par(mfrow = c(1, 2)) 71 | plot(y ~ x3, data = usair, cex.lab = 1.5, main = '2D projection for x3') 72 | abline(model.1, col = 'red', lwd = 2.5) 73 | plot(y ~ x2, data = usair, cex.lab = 1.5, main = '2D projection for x2') 74 | abline(lm(y ~ x2 + x3, data = usair), col = 'red', lwd = 2.5) 75 | 76 | ## 77 | 78 | library(Hmisc) 79 | library(ggplot2) 80 | library(gridExtra) 81 | set.seed(7) 82 | x <- sort(rnorm(1000, 10, 100))[26:975] 83 | y <- x * 500 + rnorm(950, 5000, 20000) 84 | df <- data.frame(x = x, y = y, cuts = factor(cut2(x, g = 5)), resid = resid(lm(y ~ x))) 85 | scatterPl <- ggplot(df, aes(x = x, y = y)) + geom_point(aes(colour = cuts, fill = cuts), shape = 1, show_guide = FALSE) + geom_smooth(method = lm, level = 0.99) + theme_bw() + theme(axis.text.x = element_blank(), axis.text.y = element_blank(), text = element_text(size=20)) 86 | 87 | ## 88 | 89 | plot_left <- ggplot(df, aes(x = y, fill = cuts)) + geom_density(alpha = .5) + coord_flip() + scale_y_reverse() + xlab('Y values by groups') + theme_bw() + theme(legend.position = 'none') + theme(axis.text.x = element_blank(), axis.text.y = element_blank(), text = element_text(size=20)) 90 | plot_right <- ggplot(data = df, aes(x = resid, fill = cuts)) + geom_density(alpha = .5) + coord_flip() + theme_bw() + xlab('Residuals by groups') + theme(legend.position = 'none') + theme(axis.text.x = element_blank(), axis.text.y = element_blank(), text = element_text(size=20)) 91 | grid.arrange(plot_left, scatterPl, plot_right, ncol=3, nrow=1, widths=c(1, 3, 1)) 92 | 93 | library(gvlma) 94 | gvlma(model.1) 95 | 96 | ## 97 | 98 | model.2 <- update(model.1, data = usair[-31, ]) 99 | gvlma(model.2) 100 | 101 | ## 102 | 103 | model.0 <- update(model.0, data = usair[-31, ]) 104 | summary(model.0)[c('r.squared', 'adj.r.squared')] 105 | 106 | summary(model.2)[c('r.squared', 'adj.r.squared')] 107 | 108 | ## 109 | 110 | summary(model.3 <- update(model.2, .~. -x2 + x1))$coefficients 111 | summary(model.4 <- update(model.2, .~. -x3 + x1))$coefficients 112 | AIC(model.3, model.4) 113 | 114 | ## 115 | 116 | plot(y ~ x5, data = usair, cex.lab = 1.5) 117 | abline(lm(y ~ x5, data = usair), col = 'red', lwd = 2.5, lty = 1) 118 | abline(lm(y ~ x5, data = usair[usair$x5<=45,]), col = 'red', lwd = 2.5, lty = 3) 119 | abline(lm(y ~ x5, data = usair[usair$x5>=30,]), col = 'red', lwd = 2.5, lty = 2) 120 | abline(v = c(30, 45),col = 'blue', lwd = 2.5) 121 | legend('topleft', legend = c('y ~ x5', 'y ~ x5 | x5<=45','y ~ x5 | x5>=30', 'Critical zone'), lty = c(1, 3, 2, 1) , col = c('red', 'red', 'red', 'blue'), lwd = rep(2.5, 4), title = NULL) 122 | 123 | ## 124 | 125 | library(partykit) 126 | library(rpart) 127 | plot(as.party(rpart(y ~ x5, data = usair))) 128 | 129 | ## 130 | 131 | usair$x5_3 <- cut2(usair$x5, c(30, 45)) 132 | plot(y ~ as.numeric(x5_3), data = usair, cex.lab = 1.5, xlab = 'Categorized annual rainfall(x5)', xaxt = 'n') 133 | axis(1, at = 1:3, labels = levels(usair$x5_3)) 134 | lines(tapply(usair$y, usair$x5_3, mean), col = 'red', lwd = 2.5, lty = 1) 135 | legend('topright', legend = 'Linear prediction', lty = 1, col = 'red', lwd = 2.5, title = NULL) 136 | 137 | summary(glmmodel.1 <- glm(y ~ x2 + x3 + x5_3, data = usair[-31, ])) 138 | -------------------------------------------------------------------------------- /06 - Beyond the Linear Trend Line.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #6: Beyond the Linear Trend Line. pp. 127-152. 6 | ## -- written by Renata Nemeth and Gergely Toth 7 | ## 8 | ## 9 | ## This file includes the code chunks from the above mentioned 10 | ## chapter except for the leading ">" and "+" characters, which 11 | ## stand for the prompt in the R console. The prompt was 12 | ## intentionally removed here along with arbitrary line-breaks, 13 | ## so that you copy and paste the R expressions to the R console 14 | ## in a more convenient and seamless way. 15 | ## 16 | ## Code chunks are grouped here by the printed pages of the book. 17 | ## Two hash signs at the beginning of a line stands for a page 18 | ## break, while an extra empty line between the code chunks 19 | ## represents one or more paragraphs in the original book between 20 | ## the examples for easier navigation. 21 | ## 22 | ## Sometimes extra instructions starting with a double hash are 23 | ## also provided on how to run the below expressions. 24 | ## 25 | ## 26 | ## Find more information on the book at http://bit.ly/mastering-R 27 | ## and you can contact me on Twitter and GitHub by the @daroczig 28 | ## handle, or mail me at daroczig@rapporter.net 29 | ## 30 | 31 | library(catdata) 32 | data(deathpenalty) 33 | library(vcdExtra) 34 | deathpenalty.expand <- expand.dft(deathpenalty) 35 | binom.model.0 <- glm(DeathPenalty ~ DefendantRace, data = deathpenalty.expand, family = binomial) 36 | summary(binom.model.0) 37 | 38 | ## 39 | 40 | exp(cbind(OR = coef(binom.model.0), confint(binom.model.0))) 41 | 42 | ## 43 | 44 | binom.model.1 <- update(binom.model.0, .~.+VictimRace) 45 | summary(binom.model.1) 46 | exp(cbind(OR = coef(binom.model.1), confint(binom.model.1))) 47 | 48 | ## 49 | 50 | prop.table(table(factor(deathpenalty.expand$VictimRace, labels = c('VictimRace=0', 'VictimRace=1')), factor(deathpenalty.expand$DefendantRace, labels = c('DefendantRace=0', 'DefendantRace=1'))), 1) 51 | 52 | ## 53 | 54 | library(lmtest) 55 | lrtest(binom.model.1) 56 | 57 | ## 58 | 59 | library(BaylorEdPsych) 60 | PseudoR2(binom.model.1) 61 | 62 | ## 63 | 64 | lrtest(binom.model.0, binom.model.1) 65 | 66 | ## 67 | 68 | dfa <- readRDS('SMART_2013.RData') 69 | 70 | (ct <- xtabs(~model + failure, data = dfa)) 71 | 72 | ## 73 | 74 | dfa <- dfa[dfa$model %in% names(which(rowSums(ct) - ct[, 1] > 0)), ] 75 | 76 | library(ggplot2) 77 | ggplot(rbind(dfa, data.frame(model = 'All', dfa[, -1])), aes(failure)) + geom_histogram(binwidth = 1, drop = TRUE, origin = -0.5) + scale_y_log10() + scale_x_continuous(breaks=c(0:10)) + ylab('log(count)') + facet_wrap( ~ model, ncol = 3) + ggtitle('Histograms by manufacturer') + theme_bw() 78 | 79 | ## 80 | 81 | poiss.base <- glm(failure ~ model, offset(log(freq)), family = 'poisson', data = dfa) 82 | summary(poiss.base) 83 | 84 | ## 85 | 86 | contrasts(dfa$model, sparse = TRUE) 87 | 88 | ## 89 | 90 | exp(1.7666) 91 | 92 | lrtest(poiss.base) 93 | 94 | ## 95 | 96 | library(MASS) 97 | model.negbin.0 <- glm.nb(failure ~ model, offset(log(freq)), data = dfa) 98 | 99 | lrtest(poiss.base,model.negbin.0) 100 | 101 | ## 102 | 103 | model.negbin.1 <- update(model.negbin.0, .~. + capacity_bytes + age_month + temperature) 104 | model.negbin.2 <- update(model.negbin.1, .~. + PendingSector) 105 | lrtest(model.negbin.0, model.negbin.1, model.negbin.2) 106 | 107 | summary(model.negbin.2) 108 | 109 | ## 110 | 111 | exp(data.frame(exp_coef = coef(model.negbin.2))) 112 | 113 | ## 114 | 115 | dfa$model <- relevel(dfa$model, 'WDC') 116 | 117 | model.negbin.3 <- update(model.negbin.2, data = dfa) 118 | library(broom) 119 | format(tidy(model.negbin.3), digits = 4) 120 | 121 | ## 122 | 123 | library(data.table) 124 | dfa <- data.table(dfa) 125 | dfa[, temp6 := cut2(temperature, g = 6)] 126 | temperature.weighted.mean <- dfa[, .(wfailure = weighted.mean(failure, freq)), by = temp6] 127 | ggplot(temperature.weighted.mean, aes(x = temp6, y = wfailure)) + geom_bar(stat = 'identity') + xlab('Categorized temperature') + ylab('Weighted mean of disk faults') + theme_bw() 128 | 129 | ## 130 | 131 | model.negbin.4 <- update(model.negbin.0, .~. + capacity_bytes + age_month + temp6 + PendingSector, data = dfa) 132 | AIC(model.negbin.3, model.negbin.4) 133 | 134 | weighted.means <- rbind( 135 | dfa[, .(var = 'capacity', wfailure = weighted.mean(failure, freq)), by = .(value = capacity_bytes)], 136 | dfa[, .(var = 'age', wfailure = weighted.mean(failure, freq)), by = .(value = age_month)]) 137 | 138 | ggplot(weighted.means, aes(x = value, y = wfailure)) + geom_step() + facet_grid(. ~ var, scales = 'free_x') + ylab('Weighted mean of disk faults') + xlab('') + theme_bw() 139 | 140 | ## 141 | 142 | dfa[, capacity_bytes := as.factor(capacity_bytes)] 143 | dfa[, age8 := cut2(age_month, g = 8)] 144 | model.negbin.5 <- update(model.negbin.0, .~. + capacity_bytes + age8 + temp6 + PendingSector, data = dfa) 145 | 146 | AIC(model.negbin.5, model.negbin.4) 147 | 148 | format(tidy(model.negbin.5), digits = 3) 149 | 150 | ## 151 | 152 | tmnb5 <- tidy(model.negbin.5) 153 | str(terms <- tmnb5$term[tmnb5$p.value < 0.05][-1]) 154 | 155 | library(plyr) 156 | ci <- ldply(terms, function(t) confint(model.negbin.5, t)) 157 | 158 | ## 159 | 160 | names(ci) <- c('min', 'max') 161 | ci$term <- terms 162 | ci$variable <- sub('[A-Z0-9\\]\\[,() ]*$', '', terms, perl = TRUE) 163 | 164 | ggplot(ci, aes(x = factor(term), color = variable, ymin = min, ymax = max)) + geom_errorbar(size = 1.3) + ylab('Coefficients (95% conf.int)') + xlab('') + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.3), legend.position = 'top') 165 | 166 | ## 167 | 168 | PseudoR2(model.negbin.5) 169 | -------------------------------------------------------------------------------- /07 - Unstructured Data.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #7: Unstructured Data. pp. 153-168. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library(tm) 31 | getSources() 32 | 33 | ## 34 | 35 | getReaders() 36 | 37 | res <- XML::readHTMLTable('http://cran.r-project.org/web/packages/available_packages_by_name.html', which = 1) 38 | 39 | v <- Corpus(VectorSource(res$V2)) 40 | 41 | ## 42 | 43 | v 44 | 45 | inspect(head(v, 3)) 46 | 47 | getTransformations() 48 | 49 | ## 50 | 51 | stopwords('english') 52 | 53 | ## 54 | 55 | removeWords('to be or not to be', stopwords('english')) 56 | 57 | v <- tm_map(v, removeWords, stopwords('english')) 58 | 59 | inspect(head(v, 3)) 60 | 61 | ## 62 | 63 | removeWords('To be or not to be.', stopwords('english')) 64 | 65 | v <- tm_map(v, content_transformer(tolower)) 66 | v <- tm_map(v, removePunctuation) 67 | v <- tm_map(v, stripWhitespace) 68 | inspect(head(v, 3)) 69 | 70 | ## 71 | 72 | library(wordcloud) 73 | wordcloud::wordcloud(v) 74 | 75 | ## 76 | 77 | v <- tm_map(v, removeNumbers) 78 | 79 | tdm <- TermDocumentMatrix(v) 80 | 81 | inspect(tdm[1:5, 1:20]) 82 | 83 | ## 84 | 85 | findFreqTerms(tdm, lowfreq = 100) 86 | 87 | myStopwords <- c('package', 'based', 'using') 88 | v <- tm_map(v, removeWords, myStopwords) 89 | 90 | library(SnowballC) 91 | wordStem(c('cats', 'mastering', 'modelling', 'models', 'model')) 92 | 93 | wordStem(c('are', 'analyst', 'analyze', 'analysis')) 94 | 95 | ## 96 | 97 | d <- v 98 | 99 | v <- tm_map(v, stemDocument, language = 'english') 100 | 101 | v <- tm_map(v, content_transformer(function(x, d) paste(stemCompletion(strsplit(stemDocument(x), ' ')[[1]], d), collapse = ' ')), d) 102 | 103 | tdm <- TermDocumentMatrix(v) 104 | findFreqTerms(tdm, lowfreq = 100) 105 | 106 | ## 107 | 108 | tdm 109 | 110 | ## 111 | 112 | findAssocs(tdm, 'data', 0.1) 113 | 114 | findAssocs(tdm, 'big', 0.1) 115 | 116 | ## 117 | 118 | vnchar <- sapply(v, function(x) nchar(x$content)) 119 | summary(vnchar) 120 | 121 | (vm <- which.min(vnchar)) 122 | 123 | v[[vm]] 124 | res[vm, ] 125 | 126 | ## 127 | 128 | hist(vnchar, main = 'Length of R package descriptions', xlab = 'Number of characters') 129 | 130 | ## 131 | 132 | hadleyverse <- c('ggplot2', 'dplyr', 'reshape2', 'lubridate', 'stringr', 'devtools', 'roxygen2', 'tidyr') 133 | 134 | (w <- which(res$V1 %in% hadleyverse)) 135 | 136 | plot(hclust(dist(DocumentTermMatrix(v[w]))), xlab = 'Hadleyverse packages') 137 | 138 | ## 139 | 140 | sapply(v[w], function(x) structure(content(x), .Names = meta(x, 'id'))) 141 | 142 | 143 | library(topicmodels) 144 | fit <- LDA(DocumentTermMatrix(v[w]), k = 3) 145 | topics(fit) 146 | terms(fit, 10) 147 | 148 | fit <- CTM(DocumentTermMatrix(v[w]), k = 3) 149 | -------------------------------------------------------------------------------- /08 - Polishing Data.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #8: Polishing Data. pp. 169-192. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library(hflights) 31 | table(complete.cases(hflights)) 32 | 33 | ## 34 | 35 | prop.table(table(complete.cases(hflights))) * 100 36 | 37 | sort(sapply(hflights, function(x) sum(is.na(x)))) 38 | 39 | mean(cor(apply(hflights, 2, function(x) as.numeric(is.na(x)))), na.rm = TRUE) 40 | 41 | ## 42 | 43 | Funs <- Filter(is.function, sapply(ls(baseenv()), get, baseenv())) 44 | names(Filter(function(x) any(names(formals(args(x))) %in% 'na.rm'), Funs)) 45 | 46 | ## 47 | 48 | names(Filter(function(x) any(names(formals(args(x))) %in% 'na.rm'), Filter(is.function, sapply(ls('package:stats'), get, 'package:stats')))) 49 | 50 | myMean <- function(...) mean(..., na.rm = TRUE) 51 | mean(c(1:5, NA)) 52 | myMean(c(1:5, NA)) 53 | 54 | ## 55 | 56 | library(rapportools) 57 | mean(c(1:5, NA)) 58 | 59 | detach('package:rapportools') 60 | mean(c(1:5, NA)) 61 | 62 | library(Defaults) 63 | setDefaults(mean.default, na.rm = TRUE) 64 | mean(c(1:5, NA)) 65 | 66 | setDefaults(mean, na.rm = TRUE) 67 | 68 | ## 69 | 70 | mean 71 | formals(mean) 72 | 73 | unDefaults(ls) 74 | 75 | ## 76 | 77 | na.omit(c(1:5, NA)) 78 | na.exclude(c(1:5, NA)) 79 | 80 | x <- rnorm(10); y <- rnorm(10) 81 | x[1] <- NA; y[2] <- NA 82 | exclude <- lm(y ~ x, na.action = 'na.exclude') 83 | omit <- lm(y ~ x, na.action = 'na.omit') 84 | round(residuals(exclude), 2) 85 | round(residuals(omit), 2) 86 | 87 | ## 88 | 89 | m <- matrix(1:9, 3) 90 | m[which(m %% 4 == 0, arr.ind = TRUE)] <- NA 91 | m 92 | na.omit(m) 93 | 94 | mean(hflights$ActualElapsedTime) 95 | 96 | mean(hflights$ActualElapsedTime, na.rm = TRUE) 97 | mean(na.omit(hflights$ActualElapsedTime)) 98 | 99 | ## 100 | 101 | library(microbenchmark) 102 | NA.RM <- function() mean(hflights$ActualElapsedTime, na.rm = TRUE) 103 | NA.OMIT <- function() mean(na.omit(hflights$ActualElapsedTime)) 104 | microbenchmark(NA.RM(), NA.OMIT()) 105 | 106 | ## 107 | 108 | m[which(is.na(m), arr.ind = TRUE)] <- 0 109 | m 110 | 111 | ActualElapsedTime <- hflights$ActualElapsedTime 112 | mean(ActualElapsedTime, na.rm = TRUE) 113 | ActualElapsedTime[which(is.na(ActualElapsedTime))] <- mean(ActualElapsedTime, na.rm = TRUE) 114 | mean(ActualElapsedTime) 115 | 116 | library(Hmisc) 117 | mean(impute(hflights$ActualElapsedTime, mean)) 118 | 119 | ## 120 | 121 | sd(hflights$ActualElapsedTime, na.rm = TRUE) 122 | sd(ActualElapsedTime) 123 | 124 | ## 125 | 126 | summary(iris) 127 | 128 | library(missForest) 129 | set.seed(81) 130 | miris <- prodNA(iris, noNA = 0.2) 131 | summary(miris) 132 | 133 | ## 134 | 135 | iiris <- missForest(miris, xtrue = iris, verbose = TRUE) 136 | 137 | ## 138 | 139 | str(iiris) 140 | 141 | ## 142 | 143 | miris <- miris[, 1:4] 144 | 145 | ## 146 | 147 | iris_mean <- impute(miris, fun = mean) 148 | iris_forest <- missForest(miris) 149 | 150 | diag(cor(iris[, -5], iris_mean)) 151 | diag(cor(iris[, -5], iris_forest$ximp)) 152 | 153 | ## 154 | 155 | detach('package:missForest') 156 | detach('package:randomForest') 157 | 158 | ## 159 | 160 | library(outliers) 161 | outlier(hflights$DepDelay) 162 | 163 | summary(hflights$DepDelay) 164 | 165 | library(lattice) 166 | bwplot(hflights$DepDelay) 167 | 168 | IQR(hflights$DepDelay, na.rm = TRUE) 169 | 170 | ## 171 | 172 | set.seed(83) 173 | dixon.test(c(runif(10), pi)) 174 | 175 | model <- lm(hflights$DepDelay ~ 1) 176 | 177 | model$coefficients 178 | mean(hflights$DepDelay, na.rm = TRUE) 179 | 180 | ## 181 | 182 | a <- 0.1 183 | (n <- length(hflights$DepDelay)) 184 | (F <- qf(1 - (a/n), 1, n-2, lower.tail = TRUE)) 185 | (L <- ((n - 1) * F / (n - 2 + F))^0.5) 186 | 187 | sum(abs(rstandard(model)) > L) 188 | 189 | summary(lm(Sepal.Length ~ Petal.Length, data = miris)) 190 | 191 | ## 192 | 193 | lm(Sepal.Length ~ Petal.Length, data = iris)$coefficients 194 | 195 | library(MASS) 196 | summary(rlm(Sepal.Length ~ Petal.Length, data = miris)) 197 | 198 | ## 199 | 200 | f <- formula(Sepal.Length ~ Petal.Length) 201 | cbind( 202 | orig = lm(f, data = iris)$coefficients, 203 | lm = lm(f, data = miris)$coefficients, 204 | rlm = rlm(f, data = miris)$coefficients) 205 | 206 | miris$Sepal.Length[1] <- 14 207 | cbind( 208 | orig = lm(f, data = iris)$coefficients, 209 | lm = lm(f, data = miris)$coefficients, 210 | rlm = rlm(f, data = miris)$coefficients) 211 | -------------------------------------------------------------------------------- /09 - From Big to Smaller Data.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #9: From Big to Small Data. pp. 192-234. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library(hflights) 31 | 32 | JFK <- hflights[which(hflights$Dest == 'JFK'), c('TaxiIn', 'TaxiOut')] 33 | JFK <- subset(hflights, Dest == 'JFK', select = c(TaxiIn, TaxiOut)) 34 | 35 | ## 36 | 37 | par(mfrow = c(1, 2)) 38 | qqnorm(JFK$TaxiIn, ylab = 'TaxiIn'); qqline(JFK$TaxiIn) 39 | qqnorm(JFK$TaxiOut, ylab = 'TaxiOut'); qqline(JFK$TaxiOut) 40 | 41 | shapiro.test(JFK$TaxiIn) 42 | 43 | ## 44 | 45 | JFK <- na.omit(JFK) 46 | library(MVN) 47 | mardiaTest(JFK) 48 | 49 | ## 50 | 51 | hzTest(JFK) 52 | 53 | roystonTest(JFK) 54 | 55 | ## 56 | 57 | mvt <- roystonTest(JFK, qqplot = TRUE) 58 | 59 | par(mfrow = c(1, 2)) 60 | mvnPlot(mvt, type = 'contour', default = TRUE) 61 | mvnPlot(mvt, type = 'persp', default = TRUE) 62 | 63 | ## 64 | 65 | set.seed(42) 66 | mvt <- roystonTest(MASS::mvrnorm(100, mu = c(0, 0), Sigma = matrix(c(10, 3, 3, 2), 2))) 67 | par(mfrow = c(1, 2), mar = c(0,0,0,0)) 68 | mvnPlot(mvt, type = 'contour', default = TRUE) 69 | mvnPlot(mvt, type = 'persp', default = TRUE) 70 | 71 | ## 72 | 73 | hflights_numeric <- hflights[, which(sapply(hflights, is.numeric))] 74 | cor(hflights_numeric, use = 'pairwise.complete.obs') 75 | 76 | str(cor(hflights_numeric, use = 'pairwise.complete.obs')) 77 | 78 | hflights_numeric <- hflights[,which( 79 | sapply(hflights, function(x) 80 | is.numeric(x) && var(x, na.rm = TRUE) != 0))] 81 | 82 | table(is.na(cor(hflights_numeric, use = 'pairwise.complete.obs'))) 83 | 84 | ## 85 | 86 | library(ellipse) 87 | 88 | plotcorr(cor(hflights_numeric, use = 'pairwise.complete.obs')) 89 | 90 | ## 91 | 92 | plotcorr(cor(data.frame( 93 | 1:10, 94 | 1:10+runif(10), 95 | 1:10+runif(10)*5, 96 | runif(10), 97 | 10:1, 98 | check.names = FALSE))) 99 | 100 | cor(hflights$FlightNum, hflights$Month) 101 | 102 | ## 103 | 104 | ## 105 | 106 | library(psych) 107 | KMO(cor(hflights_numeric, use = 'pairwise.complete.obs')) 108 | 109 | ## 110 | 111 | cor(hflights_numeric[, c('Cancelled', 'AirTime')]) 112 | 113 | table(hflights_numeric$AirTime[which(hflights_numeric$Cancelled == 1)], exclude = NULL) 114 | 115 | table(hflights_numeric$Cancelled) 116 | 117 | hflights_numeric <- subset(hflights_numeric, select = -Cancelled) 118 | 119 | which(is.na(cor(hflights_numeric, use = 'pairwise.complete.obs')), arr.ind = TRUE) 120 | 121 | ## 122 | 123 | hflights_numeric <- subset(hflights_numeric, select = -Diverted) 124 | KMO(cor(hflights_numeric[, -c(14)], use = 'pairwise.complete.obs')) 125 | 126 | ## 127 | 128 | KMO(mtcars) 129 | 130 | bartlett.test(mtcars) 131 | cortest.bartlett(cor(mtcars)) 132 | 133 | ## 134 | 135 | prcomp(mtcars, scale = TRUE) 136 | 137 | ## 138 | 139 | summary(prcomp(mtcars, scale = TRUE)) 140 | 141 | ## 142 | 143 | sum(prcomp(scale(mtcars))$sdev^2) 144 | 145 | prcomp(scale(mtcars))$sdev^2 146 | 147 | (6.6 + 2.65) / 11 148 | 149 | ## 150 | 151 | VSS.scree(cor(mtcars)) 152 | 153 | ## 154 | 155 | scree(cor(mtcars)) 156 | 157 | ## 158 | 159 | fa.parallel(mtcars) 160 | 161 | ### 162 | 163 | pc <- prcomp(mtcars, scale = TRUE) 164 | head(pc$x[, 1:2]) 165 | 166 | head(scale(mtcars) %*% pc$rotation[, 1:2]) 167 | 168 | summary(pc$x[, 1:2]) 169 | 170 | ## 171 | 172 | apply(pc$x[, 1:2], 2, sd) 173 | pc$sdev[1:2] 174 | 175 | round(cor(pc$x)) 176 | 177 | pc$rotation[, 1:2] 178 | 179 | ## 180 | 181 | biplot(pc, cex = c(0.8, 1.2)) 182 | abline(h = 0, v = 0, lty = 'dashed') 183 | 184 | ## 185 | 186 | cor(mtcars, pc$x[, 1:2]) 187 | 188 | ## 189 | 190 | varimax(pc$rotation[, 1:2]) 191 | 192 | ## 193 | 194 | pcv <- varimax(pc$rotation[, 1:2])$loadings 195 | plot(scale(mtcars) %*% pcv, type = 'n', xlab = 'Transmission', ylab = 'Power') 196 | text(scale(mtcars) %*% pcv, labels = rownames(mtcars)) 197 | 198 | ## 199 | 200 | library(GPArotation) 201 | promax(pc$rotation[, 1:2]) 202 | cor(promax(pc$rotation[, 1:2])$loadings) 203 | 204 | ## 205 | 206 | library(jpeg) 207 | t <- tempfile() 208 | download.file('http://bit.ly/nasa-img', t) 209 | img <- readJPEG(t) 210 | str(img) 211 | 212 | h <- dim(img)[1] 213 | w <- dim(img)[2] 214 | m <- matrix(img, h*w) 215 | str(m) 216 | 217 | pca <- prcomp(m) 218 | 219 | ## 220 | 221 | summary(pca) 222 | pca$rotation 223 | 224 | extractColors <- function(x) rgb(x[1], x[2], x[3]) 225 | 226 | (colors <- apply(abs(pca$rotation), 2, extractColors)) 227 | 228 | ## 229 | 230 | pie(pca$sdev, col = colors, labels = colors) 231 | 232 | par(mfrow = c(1, 2), mar = rep(0, 4)) 233 | image(matrix(pca$x[, 1], h), col = gray.colors(100)) 234 | image(matrix(pca$x[, 2], h), col = gray.colors(100), yaxt = 'n') 235 | 236 | ## 237 | 238 | m <- subset(mtcars, select = c(mpg, cyl, hp, carb)) 239 | 240 | (f <- fa(m)) 241 | 242 | ## 243 | 244 | fa.diagram(f) 245 | 246 | cor(f$scores, mtcars$disp) 247 | 248 | ## 249 | 250 | as.matrix(eurodist)[1:5, 1:5] 251 | 252 | (mds <- cmdscale(eurodist)) 253 | 254 | ## 255 | 256 | plot(mds) 257 | 258 | ## 259 | 260 | plot(mds, type = 'n') 261 | text(mds[, 1], mds[, 2], labels(eurodist), cex = 0.7) 262 | 263 | ## 264 | 265 | mds <- cmdscale(dist(mtcars)) 266 | plot(mds, type = 'n') 267 | text(mds[, 1], mds[, 2], rownames(mds), cex = 0.7) 268 | -------------------------------------------------------------------------------- /10 - Classification and Clustering.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #10: Classification and Clustering. pp. 235-268. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | d <- dist(mtcars) 31 | h <- hclust(d) 32 | h 33 | 34 | ## 35 | 36 | plot(h) 37 | 38 | ## 39 | 40 | plot(h) 41 | rect.hclust(h, k = 3, border = 'red') 42 | 43 | (cn <- cutree(h, k = 3)) 44 | 45 | ## 46 | 47 | table(cn) 48 | 49 | round(aggregate(mtcars, FUN = mean, by = list(cn)), 1) 50 | 51 | round(aggregate(mtcars, FUN = sd, by = list(cn)), 1) 52 | 53 | ## 54 | 55 | round(sapply(mtcars, sd), 1) 56 | 57 | round(apply(aggregate(mtcars, FUN = mean, by = list(cn)), 2, sd), 1) 58 | 59 | ## 60 | 61 | library(NbClust) 62 | NbClust(mtcars, method = 'complete', index = 'dindex') 63 | 64 | NbClust(mtcars, method = 'complete', index = 'hartigan')$Best.nc 65 | 66 | ## 67 | 68 | NbClust(mtcars, method = 'complete', index = 'kl')$Best.nc 69 | 70 | NbClust(iris[, -5], method = 'complete', index = 'all')$Best.nc[1, ] 71 | 72 | ## 73 | 74 | (k <- kmeans(mtcars, 3)) 75 | 76 | ## 77 | 78 | all(cn == k$cluster) 79 | 80 | cbind(cn, k$cluster) 81 | 82 | ## 83 | 84 | library(cluster) 85 | clusplot(mtcars, k$cluster, color = TRUE, shade = TRUE, labels = 2, cex = 0.7) 86 | 87 | ## 88 | 89 | factors <- c('cyl', 'vs', 'am', 'carb', 'gear') 90 | mtcars[, factors] <- lapply(mtcars[, factors], factor) 91 | 92 | ## 93 | 94 | library(poLCA) 95 | p <- poLCA(cbind(cyl, vs, am, carb, gear) ~ 1, data = mtcars, graphs = TRUE, nclass = 3) 96 | 97 | p$probs 98 | 99 | ## 100 | 101 | plot(p) 102 | 103 | p$P 104 | 105 | ## 106 | 107 | rm(mtcars) 108 | mtcars$gear <- factor(mtcars$gear) 109 | 110 | library(MASS) 111 | d <- lda(gear ~ ., data = mtcars, CV = TRUE) 112 | 113 | (tab <- table(mtcars$gear, d$class)) 114 | 115 | tab / rowSums(tab) 116 | 117 | sum(diag(tab)) / sum(tab) 118 | 119 | ## 120 | 121 | round(d$posterior, 4) 122 | 123 | ## 124 | 125 | d <- lda(gear ~ ., data = mtcars) 126 | plot(d) 127 | 128 | ## 129 | 130 | plot(d, dimen = 1, type = 'both' ) 131 | 132 | ## 133 | 134 | lr <- glm(am ~ hp + wt, data = mtcars, family = binomial) 135 | summary(lr) 136 | 137 | ## 138 | 139 | table(mtcars$am, round(predict(lr, type = 'response'))) 140 | 141 | library(nnet) 142 | (mlr <- multinom(factor(gear) ~ ., data = mtcars)) 143 | 144 | ## 145 | 146 | table(mtcars$gear, predict(mlr)) 147 | 148 | rm(mtcars) 149 | 150 | ## 151 | 152 | set.seed(42) 153 | n <- nrow(mtcars) 154 | train <- mtcars[sample(n, n/2), ] 155 | 156 | library(dplyr) 157 | train <- sample_n(mtcars, n / 2) 158 | 159 | ## 160 | 161 | test <- mtcars[setdiff(row.names(mtcars), row.names(train)), ] 162 | 163 | library(class) 164 | (cm <- knn( 165 | train = subset(train, select = -gear), 166 | test = subset(test, select = -gear), 167 | cl = train$gear, 168 | k = 5)) 169 | 170 | cor(test$gear, as.numeric(as.character(cm))) 171 | 172 | table(test$gear, as.numeric(as.character(cm))) 173 | 174 | ## 175 | 176 | table(train$gear) 177 | 178 | ## 179 | 180 | library(rpart) 181 | ct <- rpart(factor(gear) ~ ., data = train, minsplit = 3) 182 | summary(ct) 183 | 184 | ## 185 | 186 | plot(ct); text(ct) 187 | 188 | table(test$gear, predict(ct, newdata = test, type = 'class')) 189 | 190 | ## 191 | 192 | library(rpart.plot) 193 | rpart.plot(ct) 194 | 195 | library(partykit) 196 | plot(as.party(ct)) 197 | 198 | library(party) 199 | ct <- ctree(factor(gear) ~ drat, data = train, controls = ctree_control(minsplit = 3)) 200 | plot(ct, main = "Conditional Inference Tree") 201 | 202 | table(test$gear, predict(ct, newdata = test, type = 'node')) 203 | 204 | ## 205 | 206 | library(randomForest) 207 | (rf <- randomForest(factor(gear) ~ ., data = train, ntree = 250)) 208 | 209 | table(test$gear, predict(rf, test)) 210 | 211 | ## 212 | 213 | plot(rf) 214 | legend('topright', colnames(rf$err.rate), col = 1:4, fill = 1:4, bty = 'n') 215 | 216 | library(caret) 217 | 218 | ## 219 | 220 | library(C50) 221 | C50 <- train(factor(gear) ~ ., data = train, method = 'C5.0') 222 | summary(C50) 223 | 224 | ## 225 | 226 | table(test$gear, predict(C50, test)) 227 | -------------------------------------------------------------------------------- /11 - Social Network Analysis of the R ecosystem.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #11: Social Network Analysis 6 | ## of the R Ecosystem. pp. 269-280. 7 | ## 8 | ## 9 | ## This file includes the code chunks from the above mentioned 10 | ## chapter except for the leading ">" and "+" characters, which 11 | ## stand for the prompt in the R console. The prompt was 12 | ## intentionally removed here along with arbitrary line-breaks, 13 | ## so that you copy and paste the R expressions to the R console 14 | ## in a more convenient and seamless way. 15 | ## 16 | ## Code chunks are grouped here by the printed pages of the book. 17 | ## Two hash signs at the beginning of a line stands for a page 18 | ## break, while an extra empty line between the code chunks 19 | ## represents one or more paragraphs in the original book between 20 | ## the examples for easier navigation. 21 | ## 22 | ## Sometimes extra instructions starting with a double hash are 23 | ## also provided on how to run the below expressions. 24 | ## 25 | ## 26 | ## Find more information on the book at http://bit.ly/mastering-R 27 | ## and you can contact me on Twitter and GitHub by the @daroczig 28 | ## handle, or mail me at daroczig@rapporter.net 29 | ## 30 | 31 | library(tools) 32 | pkgs <- available.packages() 33 | str(pkgs) 34 | head(package.dependencies(pkgs), 2) 35 | 36 | ## 37 | 38 | library(plyr) 39 | edges <- ldply(c('Depends', 'Imports', 'Suggests'), function(depLevel) { 40 | deps <- package.dependencies(pkgs, depLevel = depLevel) 41 | ldply(names(deps), function(pkg) 42 | if (!identical(deps[[pkg]], NA)) 43 | data.frame( 44 | src = pkg, 45 | dep = deps[[pkg]][, 1], 46 | label = depLevel, 47 | stringsAsFactors = FALSE)) 48 | }) 49 | str(edges) 50 | 51 | nrow(edges) / (nrow(pkgs) * (nrow(pkgs) - 1)) 52 | 53 | ## 54 | 55 | head(sort(table(edges$dep), decreasing = TRUE)) 56 | 57 | edges <- edges[edges$dep != 'R', ] 58 | 59 | 60 | ## directed 61 | 62 | 63 | ## table(edges$label) 64 | ## edges <- subset(edges, label %in% c('Depends', 'Imports')) 65 | 66 | library(igraph) 67 | g <- graph.data.frame(edges) 68 | summary(g) 69 | 70 | graph.density(g) 71 | head(sort(degree(g), decreasing = TRUE)) 72 | 73 | ## 74 | 75 | head(sort(degree(g), decreasing = TRUE)) 76 | head(sort(closeness(g))) 77 | head(sort(betweenness(g), decreasing = TRUE)) 78 | 79 | plot(degree(g), betweenness(g), type = 'n', main = 'Centrality of R package dependencies') 80 | text(degree(g), betweenness(g), labels = V(g)$name) 81 | 82 | ## 83 | 84 | edges <- edges[edges$label != 'Suggests', ] 85 | deptree <- edges$dep[edges$src == 'igraph'] 86 | while (!all(edges$dep[edges$src %in% deptree] %in% deptree)) 87 | deptree <- union(deptree, edges$dep[edges$src %in% deptree]) 88 | 89 | g <- graph.data.frame(edges[edges$src %in% c('igraph', deptree), ]) 90 | plot(g) 91 | 92 | ## 93 | 94 | V(g)$label.color <- 'orange' 95 | V(g)$label.color[V(g)$name == 'igraph'] <- 'darkred' 96 | V(g)$label.color[V(g)$name %in% edges$dep[edges$src == 'igraph']] <- 'orangered' 97 | E(g)$color <- c('blue', 'green')[factor(df$label)] 98 | plot(g, vertex.shape = 'none', edge.label = NA) 99 | 100 | ## 101 | 102 | tkplot(g, edge.label = NA) 103 | rglplot(g) 104 | 105 | ## 106 | 107 | library(visNetwork) 108 | nodes <- get.data.frame(g, 'vertices') 109 | names(nodes) <- c('id', 'color') 110 | 111 | ## 112 | 113 | edges <- get.data.frame(g) 114 | visNetwork(nodes, edges) 115 | 116 | g <- dominator.tree(g, root = 'igraph')$domtree 117 | plot(g, layout = layout.reingold.tilford(g, root = 'igraph'), vertex.shape = 'none') 118 | 119 | ## 120 | 121 | library(miniCRAN) 122 | pkgs <- pkgAvail() 123 | pkgDep('igraph', availPkgs = pkgs, suggests = FALSE, includeBasePkgs = TRUE) 124 | plot(makeDepGraph('igraph', pkgs, suggests = FALSE, includeBasePkgs = TRUE)) 125 | -------------------------------------------------------------------------------- /12 - Analysing Time-series.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #12: Analyzing Time-series. pp. 281-296. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library(hflights) 31 | library(data.table) 32 | 33 | dt <- data.table(hflights) 34 | dt[, date := ISOdate(Year, Month, DayofMonth)] 35 | 36 | daily <- dt[, list( 37 | N = .N, 38 | Delays = sum(ArrDelay, na.rm = TRUE), 39 | Cancelled = sum(Cancelled), 40 | Distance = mean(Distance) 41 | ), by = date] 42 | 43 | str(daily) 44 | 45 | ## 46 | 47 | plot(ts(daily)) 48 | 49 | ## 50 | 51 | setorder(daily, date) 52 | plot(ts(daily)) 53 | 54 | plot(ts(daily$N, start = 2011, frequency = 365), main = 'Number of flights from Houston in 2011') 55 | 56 | ## 57 | 58 | plot(decompose(ts(daily$N, frequency = 7))) 59 | 60 | ## 61 | 62 | setNames(decompose(ts(daily$N, frequency = 7))$figure, weekdays(daily$date[1:7])) 63 | 64 | decompose(ts(daily$N, frequency = 365)) 65 | 66 | nts <- ts(daily$N, frequency = 7) 67 | fit <- HoltWinters(nts, beta = FALSE, gamma = FALSE) 68 | plot(fit) 69 | 70 | ## 71 | 72 | fit <- HoltWinters(nts) 73 | plot(fit) 74 | 75 | library(forecast) 76 | forecast(fit) 77 | 78 | ## 79 | 80 | plot(forecast(HoltWinters(nts), 31)) 81 | 82 | ## 83 | 84 | auto.arima(nts) 85 | 86 | ## 87 | 88 | auto.arima(nts, approximation = FALSE) 89 | 90 | plot(forecast(auto.arima(nts, D = 1, approximation = FALSE), 31)) 91 | 92 | ## 93 | 94 | cts <- ts(daily$Cancelled) 95 | auto.arima(cts) 96 | 97 | library(tsoutliers) 98 | outliers <- tso(cts, tsmethod = 'arima', args.tsmethod = list(order = c(1, 1, 2))) 99 | 100 | ## 101 | 102 | plot(outliers) 103 | 104 | plot(tso(ts(daily$Cancelled))) 105 | 106 | ## 107 | 108 | dfc <- as.data.frame(daily[, c('date', 'Cancelled'), with = FALSE]) 109 | 110 | library(AnomalyDetection) 111 | AnomalyDetectionTs(dfc, plot = TRUE)$plot 112 | 113 | library(zoo) 114 | zd <- zoo(daily[, -1, with = FALSE], daily[[1]]) 115 | 116 | ## 117 | 118 | plot(zd) 119 | 120 | ## 121 | 122 | plot(cumsum(zd)) 123 | -------------------------------------------------------------------------------- /13 - Data Around Us.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #13: Data Around Us. pp. 297-322. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library(hflights) 31 | library(data.table) 32 | dt <- data.table(hflights)[, list( 33 | N = .N, 34 | Cancelled = sum(Cancelled), 35 | Distance = Distance[1], 36 | TimeVar = sd(ActualElapsedTime, na.rm = TRUE), 37 | ArrDelay = mean(ArrDelay, na.rm = TRUE)) , by = Dest] 38 | 39 | ## 40 | 41 | str(dt) 42 | 43 | library(ggmap) 44 | (h <- geocode('Houston, TX')) 45 | 46 | ## 47 | 48 | h <- data.frame(lat = 29.7630556, lon = -95.36306556) 49 | 50 | dt[, c('lon', 'lat') := geocode(Dest)] 51 | 52 | str(setDF(dt)) 53 | 54 | par(mar=(c(0,0,1,0))) 55 | library(maps) 56 | map('state') 57 | title('Flight destinations from Houston,TX') 58 | 59 | ## 60 | 61 | points(h$lon, h$lat, col = 'blue', pch = 13) 62 | points(dt$lon, dt$lat, col = 'red', pch = 19) 63 | 64 | text(dt$lon, dt$lat + 1, labels = dt$Dest, cex = 0.7) 65 | 66 | ## 67 | 68 | map('state') 69 | title('Frequent flight destinations from Houston,TX') 70 | points(h$lon, h$lat, col = 'blue', pch = 13) 71 | points(dt$lon, dt$lat, col = rgb(1, 0, 0, dt$N/max(dt$N)), pch = 19) 72 | legend('bottomright', legend = round(quantile(dt$N)), col = rgb(1, 0, 0, quantile(dt$N)/max(dt$N)), pch = 19, box.col = NA) 73 | 74 | ## 75 | 76 | str(map_data <- map('state', plot = FALSE, fill = TRUE)) 77 | 78 | grep('^washington', map_data$names, value = TRUE) 79 | 80 | states <- sapply(strsplit(map_data$names, ':'), '[[', 1) 81 | 82 | library(maptools) 83 | us <- map2SpatialPolygons(map_data, IDs = states, proj4string = CRS('+proj=longlat +datum=WGS84')) 84 | 85 | ## ## alternative way of getting the data in the right form: 86 | ## library(raster) 87 | ## us <- getData('GADM', country = 'USA', level = 1) 88 | ## library(rgeos) 89 | ## us <- gSimplify(us, tol = 0.01, topologyPreserve = TRUE) 90 | 91 | plot(us) 92 | 93 | library(sp) 94 | dtp <- SpatialPointsDataFrame(dt[, c('lon', 'lat')], dt, proj4string = CRS('+proj=longlat +datum=WGS84')) 95 | str(sp::over(us, dtp)) 96 | 97 | ## 98 | 99 | ## ## alternative way of getting state info 100 | ## geocode('LAX') 101 | ## geocode('LAX', 'more') 102 | ## geocode('LAX', 'all')$results[[1]]$address_components[[6]]$short_name 103 | 104 | str(sapply(sp::over(us, dtp, returnList = TRUE), function(x) sum(x$Cancelled))) 105 | 106 | dtp <- SpatialPointsDataFrame(dt[, c('lon', 'lat')], dt[, 'Cancelled', drop = FALSE], proj4string = CRS('+proj=longlat +datum=WGS84')) 107 | str(cancels <- sp::over(us, dtp, fn = sum)) 108 | 109 | val <- cancels$Cancelled[match(states, row.names(cancels))] 110 | 111 | ## 112 | 113 | val[is.na(val)] <- 0 114 | 115 | map('state', col = rgb(1, 0, 0, sqrt(val/max(val))), fill = TRUE) 116 | title('Number of cancelled flights from Houston to US states') 117 | points(h$lon, h$lat, col = 'blue', pch = 13) 118 | legend('bottomright', legend = round(quantile(val)), fill = rgb(1, 0, 0, sqrt(quantile(val)/max(val))), box.col = NA) 119 | 120 | ## 121 | 122 | library(fields) 123 | out <- as.image(dt$ArrDelay, x = dt[, c('lon', 'lat')], nrow = 256, ncol = 256) 124 | 125 | table(is.na(out$z)) 126 | 127 | image(out) 128 | 129 | ## 130 | 131 | image(out, xlim = base::range(map_data$x, na.rm = TRUE), ylim = base::range(map_data$y, na.rm = TRUE)) 132 | 133 | look <- image.smooth(out, theta = .5) 134 | table(is.na(look$z)) 135 | 136 | ## 137 | 138 | image(look) 139 | 140 | out <- as.image(dt$ArrDelay, x = dt[, c('lon', 'lat')], nrow = 512, ncol = 512) 141 | look <- image.smooth(out, theta = 1) 142 | 143 | usa_data <- map('usa', plot = FALSE, region = 'main') 144 | p <- expand.grid(look$x, look$y) 145 | library(sp) 146 | n <- which(point.in.polygon(p$Var1, p$Var2, usa_data$x, usa_data$y) == 0) 147 | look$z[n] <- NA 148 | 149 | ## 150 | 151 | map('usa') 152 | image(look, add = TRUE) 153 | map('state', lwd = 3, add = TRUE) 154 | title('Arrival delays of flights from Houston') 155 | points(dt$lon, dt$lat, pch = 19, cex = .5) 156 | points(h$lon, h$lat, pch = 13) 157 | 158 | library(deldir) 159 | map('usa') 160 | plot(deldir(dt$lon, dt$lat), wlines = 'tess', col = c('red', 'darkgray'), lwd = 2, add = TRUE, pch = 19) 161 | 162 | ## 163 | 164 | library(OpenStreetMap) 165 | map <- openmap(c(max(map_data$y, na.rm = TRUE), 166 | min(map_data$x, na.rm = TRUE)), 167 | c(min(map_data$y, na.rm = TRUE), 168 | max(map_data$x, na.rm = TRUE)), 169 | type = 'stamen-terrain') 170 | 171 | ## 172 | 173 | map <- openproj(map, projection = '+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs') 174 | 175 | plot(map) 176 | plot(deldir(dt$lon, dt$lat), wlines = "tess", col = c('red', 'black'), lwd = 2, add = TRUE, pch = 19, cex = 0.5) 177 | 178 | ## 179 | 180 | map <- openmap(c(max(map_data$y), 181 | min(map_data$x)), 182 | c(min(map_data$y), 183 | max(map_data$x)), 184 | type = 'cloudmade-1960') 185 | plot(map) 186 | 187 | ## 188 | 189 | cancels$state <- rownames(cancels) 190 | cancels$Cancelled[is.na(cancels$Cancelled)] <- 0 191 | 192 | library(googleVis) 193 | plot(gvisGeoChart(cancels, 'state', 'Cancelled', 194 | options = list( 195 | region = 'US', 196 | displayMode = 'regions', 197 | resolution = 'provinces'))) 198 | 199 | 200 | ## 201 | 202 | dt$LatLong <- paste(dt$lat, dt$lon, sep = ':') 203 | dt$tip <- apply(dt, 1, function(x) 204 | paste(names(dt), x, collapse = '
')) 205 | plot(gvisMap(dt, 'LatLong', tipvar = 'tip')) 206 | 207 | ## 208 | 209 | library(rCharts) 210 | map <- Leaflet$new() 211 | map$setView(as.numeric(dt[which(dt$Dest == 'MCI'), c('lat', 'lon')]), zoom = 4) 212 | 213 | ## 214 | 215 | 216 | for (i in 1:nrow(dt)) 217 | map$marker(c(dt$lat[i], dt$lon[i]), bindPopup = dt$tip[i]) 218 | map$show() 219 | 220 | library(leaflet) 221 | 222 | pal <- colorNumeric(palette = 'Blues', domain = countries$gdp_md_est) 223 | 224 | leaflet(us) %>% addProviderTiles("Acetate.terrain") %>% addPolygons( color = ~pal(gdp_md_est)) %>% addMarkers(lng = dt$lon, lat = dt$lat, popup = dt$tip) 225 | 226 | ## 227 | 228 | dt <- dt[point.in.polygon(dt$lon, dt$lat, usa_data$x, usa_data$y) == 1, ] 229 | 230 | ## 231 | 232 | library(diagram) 233 | library(scales) 234 | 235 | map('usa') 236 | title('Number of flights, cancellations and delays from Houston') 237 | image(look, add = TRUE) 238 | map('state', lwd = 3, add = TRUE) 239 | for (i in 1:nrow(dt)) { 240 | curvedarrow( 241 | from = rev(as.numeric(h)), 242 | to = as.numeric(dt[i, c('lon', 'lat')]), 243 | arr.pos = 1, 244 | arr.type = 'circle', 245 | curve = 0.1, 246 | arr.col = alpha('black', dt$N[i] / max(dt$N)), 247 | arr.length = dt$N[i] / max(dt$N), 248 | lwd = dt$Cancelled[i] / max(dt$Cancelled) * 25, 249 | lcol = alpha('black', dt$Cancelled[i] / max(dt$Cancelled))) 250 | } 251 | 252 | ## models 253 | 254 | dm <- dist(dt[, c('lon', 'lat')]) 255 | dm <- as.matrix(dm) 256 | idm <- 1/dm 257 | diag(idm) <- 0 258 | str(idm) 259 | 260 | library(ape) 261 | dt$TimeVar[is.na(dt$TimeVar)] <- 0 262 | Moran.I(dt$TimeVar, idm) 263 | 264 | ## 265 | 266 | library(spdep) 267 | idml <- mat2listw(idm) 268 | moran.test(dt$TimeVar, idml) 269 | 270 | idml <- mat2listw(idm, style = 'W') 271 | moran.test(dt$TimeVar, idml) 272 | -------------------------------------------------------------------------------- /14 - Analyzing the R Community.R: -------------------------------------------------------------------------------- 1 | ## Extracted code chunks from 2 | ## 3 | ## Gergely Daróczi (2015): Mastering Data Analysis with R. 4 | ## 5 | ## Chapter #14: Analyzing the R Community. pp. 323-348. 6 | ## 7 | ## 8 | ## This file includes the code chunks from the above mentioned 9 | ## chapter except for the leading ">" and "+" characters, which 10 | ## stand for the prompt in the R console. The prompt was 11 | ## intentionally removed here along with arbitrary line-breaks, 12 | ## so that you copy and paste the R expressions to the R console 13 | ## in a more convenient and seamless way. 14 | ## 15 | ## Code chunks are grouped here by the printed pages of the book. 16 | ## Two hash signs at the beginning of a line stands for a page 17 | ## break, while an extra empty line between the code chunks 18 | ## represents one or more paragraphs in the original book between 19 | ## the examples for easier navigation. 20 | ## 21 | ## Sometimes extra instructions starting with a double hash are 22 | ## also provided on how to run the below expressions. 23 | ## 24 | ## 25 | ## Find more information on the book at http://bit.ly/mastering-R 26 | ## and you can contact me on Twitter and GitHub by the @daroczig 27 | ## handle, or mail me at daroczig@rapporter.net 28 | ## 29 | 30 | library(XML) 31 | page <- htmlParse('http://www.r-project.org/foundation/donors.html') 32 | 33 | ## 34 | 35 | list <- unlist(xpathApply(page, "//h3[@id='supporting-members']/following-sibling::ul[1]/li", xmlValue)) 36 | str(list) 37 | 38 | supporterlist <- sub(' \\([a-zA-Z ]*\\)$', '', list) 39 | countrylist <- substr(list, nchar(supporterlist) + 3, nchar(list) - 1) 40 | 41 | tail(sort(prop.table(table(countrylist)) * 100), 5) 42 | 43 | countries <- as.data.frame(table(countrylist)) 44 | 45 | ## 46 | 47 | library(rworldmap) 48 | joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist', verbose = TRUE) 49 | 50 | library(ggmap) 51 | for (fix in c('Brasil', 'CZ', 'Danmark', 'NL')) { 52 | countrylist[which(countrylist == fix)] <- geocode(fix, output = 'more')$country 53 | } 54 | 55 | countries <- as.data.frame(table(countrylist)) 56 | countries <- joinCountryData2Map(countries, joinCode = 'NAME', nameJoinColumn = 'countrylist') 57 | 58 | ## 59 | 60 | mapCountryData(countries, 'Freq', catMethod = 'logFixedWidth', mapTitle = 'Number of R Foundation supporting members') 61 | 62 | ## 63 | 64 | packages <- readHTMLTable('http://cran.r-project.org/web/checks/check_summary.html', which = 2) 65 | 66 | maintainers <- sub('(.*) <(.*)>', '\\1', packages$' Maintainer') 67 | maintainers <- gsub(' ', ' ', maintainers) 68 | str(maintainers) 69 | 70 | tail(sort(table(maintainers)), 8) 71 | 72 | ## 73 | 74 | N <- as.numeric(table(maintainers)) 75 | library(fitdistrplus) 76 | plotdist(N) 77 | 78 | descdist(N, boot = 1e3) 79 | 80 | ## 81 | 82 | (gparams <- fitdist(N, 'gamma')) 83 | 84 | ## 85 | 86 | gshape <- gparams$estimate[['shape']] 87 | grate <- gparams$estimate[['rate']] 88 | sum(rgamma(1e5, shape = gshape, rate = grate)) 89 | hist(rgamma(1e5, shape = gshape, rate = grate)) 90 | 91 | pgamma(2, shape = gshape, rate = grate) 92 | 93 | prop.table(table(N <= 2)) 94 | 95 | ## 96 | 97 | ploc <- min(N) 98 | pshp <- length(N) / sum(log(N) - log(ploc)) 99 | 100 | library(actuar) 101 | ppareto(2, pshp, ploc) 102 | 103 | fg <- fitdist(N, 'gamma') 104 | fw <- fitdist(N, 'weibull') 105 | fl <- fitdist(N, 'lnorm') 106 | fp <- fitdist(N, 'pareto', start = list(shape = 1, scale = 1)) 107 | par(mfrow = c(1, 2)) 108 | denscomp(list(fg, fw, fl, fp), addlegend = FALSE) 109 | qqcomp(list(fg, fw, fl, fp), legendtext = c('gamma', 'Weibull', 'Lognormal', 'Pareto')) 110 | 111 | ## 112 | 113 | length(unique(maintainers)) 114 | 115 | library(RCurl) 116 | url <- getURL('https://stat.ethz.ch/pipermail/r-help/') 117 | 118 | R.help.toc <- htmlParse(url) 119 | R.help.archives <- unlist(xpathApply(R.help.toc, '//table//td[3]/a', xmlAttrs), use.names = FALSE) 120 | 121 | dir.create('r-help') 122 | for (f in R.help.archives) 123 | download.file(url = paste0(url, f), file.path('help-r', f), method = 'curl') 124 | 125 | ## 126 | 127 | lines <- system("zgrep -E '^From: .* at .*' ./help-r/*.txt.gz", intern = TRUE) 128 | length(lines) 129 | length(unique(lines)) 130 | 131 | lines <- sub('.*From: ', '', lines) 132 | Rhelpers <- sub('.*\\((.*)\\)', '\\1', lines) 133 | 134 | tail(sort(table(Rhelpers)), 6) 135 | 136 | ## 137 | 138 | grep('Brian( D)? Ripley', names(table(Rhelpers)), value = TRUE) 139 | 140 | sum(grepl('Brian( D)? Ripley', Rhelpers)) 141 | 142 | ## 143 | 144 | lines <- system("zgrep -E '^Date: [A-Za-z]{3}, [0-9]{1,2} [A-Za-z]{3} [0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2} [-+]{1}[0-9]{4}' ./help-r/*.txt.gz", intern = TRUE) 145 | length(lines) 146 | 147 | head(sub('.*Date: ', '', lines[1])) 148 | 149 | ## 150 | 151 | times <- strptime(sub('.*Date: ', '', lines), format = '%a, %d %b %Y %H:%M:%S %z') 152 | 153 | plot(table(format(times, '%Y')), type = 'l') 154 | 155 | library(data.table) 156 | Rhelp <- data.table(time = times) 157 | Rhelp[, H := hour(time)] 158 | Rhelp[, D := wday(time)] 159 | 160 | ## 161 | 162 | library(ggplot2) 163 | ggplot(na.omit(Rhelp[, .N, by = .(H, D)]), 164 | aes(x = factor(H), y = factor(D), size = N)) + geom_point() + 165 | ylab('Day of the week') + xlab('Hour of the day') + 166 | ggtitle('Number of mails posted on [R-help]') + 167 | theme_bw() + theme('legend.position' = 'top') 168 | 169 | tail(sort(prop.table(table(sub('.*([+-][0-9]{4}).*', '\\1', lines)))), 22) 170 | 171 | ## 172 | 173 | Rhelp[, date := as.Date(time)] 174 | Rdaily <- Rhelp[, .N, by = date] 175 | 176 | Rdaily <- zoo(Rdaily$N, Rdaily$date) 177 | 178 | plot(Rdaily) 179 | 180 | library(forecast) 181 | fit <- ets(Rdaily) 182 | 183 | ## 184 | 185 | forecast(fit, 1) 186 | 187 | plot(forecast(fit, 30), include = 365) 188 | 189 | ## 190 | 191 | lists <- rbindlist(list( 192 | data.frame(name = unique(supporterlist), list = 'supporter'), 193 | data.frame(name = unique(maintainers), list = 'maintainer'), 194 | data.frame(name = unique(Rhelpers), list = 'R-help'))) 195 | 196 | t <- table(lists$name, lists$list) 197 | table(rowSums(t)) 198 | 199 | library(Rcapture) 200 | descriptive(t) 201 | 202 | ## 203 | 204 | closedp(t) 205 | 206 | ## 207 | 208 | library(fbRads) 209 | 210 | ## register an application at 211 | ## https://developers.facebook.com/apps/ 212 | ## and get a token e.g. via 213 | ## library(httr) 214 | ## app <- oauth_app('facebook', 'your_app_id', 'your_app_secret') 215 | ## tkn <- oauth2.0_token( 216 | ## oauth_endpoints('facebook'), app, scope = 'ads_management', 217 | ## type = 'application/x-www-form-urlencoded') 218 | ## tkn <- tkn$credentials$access_token 219 | 220 | fbad_init(..., ...) 221 | fbad_get_search(fbacc = fbacc, q = 'rstats', type = 'adinterest') 222 | 223 | fbad_get_search(fbacc = fbacc, q = 'SPSS', type = 'adinterest') 224 | 225 | res <- fbad_get_search(fbacc = fbacc, q = 'programming language', type = 'adinterest') 226 | res <- res[order(res$audience_size, decreasing = TRUE), ] 227 | res[1:10, 1:3] 228 | 229 | ## 230 | 231 | library(twitteR) 232 | ## register an application and make a note of your consumer key & secret, 233 | ## also your access token and secret at 234 | ## https://apps.twitter.com 235 | setup_twitter_oauth(...) 236 | 237 | str(searchTwitter('#rstats', n = 1, resultType = 'recent')) 238 | 239 | ## 240 | 241 | tweets <- Rtweets(n = 500) 242 | 243 | ## 244 | 245 | length(strip_retweets(tweets)) 246 | 247 | tweets <- twListToDF(tweets) 248 | 249 | library(tm) 250 | corpus <- Corpus(VectorSource(tweets$text)) 251 | 252 | corpus <- tm_map(corpus, removeWords, stopwords('english')) 253 | corpus <- tm_map(corpus, content_transformer(tolower)) 254 | corpus <- tm_map(corpus, removePunctuation) 255 | corpus <- tm_map(corpus, stripWhitespace) 256 | 257 | corpus <- tm_map(corpus, removeWords, 'rstats') 258 | 259 | library(wordcloud) 260 | wordcloud(corpus) 261 | 262 | 263 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## [Mastering Data Analysis with R](http://bit.ly/mastering-R) 2 | 3 | This repository includes the example R source code and data files for the above referenced book published at Packt Publishing in 2015. 4 | 5 | The above R files are identical to the R code examples found in the book except for the leading `>` and `+` characters, which stand for the prompt in the R console. As the book included both the R commands and output, the prompt was also shown in the examples so that the reader can easily distinguish the calls from the returned values: 6 | 7 | ```r 8 | > set.seed(42) 9 | > data.frame( 10 | + A = runif(2), 11 | + B = sample(letters, 2)) 12 | A B 13 | 1 0.9148060 h 14 | 2 0.9370754 u 15 | ``` 16 | 17 | In this repository, both the output and the prompt were intentionally removed here along with arbitrary line-breaks, so that you copy and paste the R expressions to the R console in a more convenient and seamless way. 18 | 19 | The code chunks are grouped by the printed pages of the book. Two hash signs at the beginning of a line stands for a page break, while an extra empty line between the code chunks represents one or more paragraphs in the original book between the examples for easier navigation. Sometimes extra instructions starting with a double hash are also provided on how to run the below expressions. 20 | 21 | Please find more information on the book at http://bit.ly/mastering-R and you can contact me on [Twitter](https://twitter.com/daroczig) and [GitHub](https://github.com/daroczig) in case of any question or feedback. 22 | 23 | I hope you will enjoy and find useful this book! 24 | -------------------------------------------------------------------------------- /data/SMART_2013.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daroczig/Mastering-Data-Analysis-with-R/668f85080a95169fcecdd8ff92341ff5261036f9/data/SMART_2013.RData --------------------------------------------------------------------------------