├── .gitignore ├── 1-Using-R-with-Hadoop.Rpres ├── 1-Using-R-with-Hadoop.html ├── 2-Taxi-analysis-with-RHadoop.Rpres ├── 2-Taxi-analysis-with-RHadoop.html ├── 4-Computing-on-distributed-matrices.Rpres ├── 4-Computing-on-distributed-matrices.html ├── 5-hive.Rpres ├── 5-hive.html ├── LICENSE ├── README.md ├── css ├── custom.css ├── mslogo.png └── title-background.png ├── data ├── dictionary_trip_data.csv ├── trip_data_10_sample.csv ├── trip_data_11_sample.csv ├── trip_data_12_sample.csv ├── trip_data_1_sample.csv ├── trip_data_2_sample.csv ├── trip_data_3_sample.csv ├── trip_data_4_sample.csv ├── trip_data_5_sample.csv ├── trip_data_6_sample.csv ├── trip_data_7_sample.csv ├── trip_data_8_sample.csv ├── trip_data_9_sample.csv └── ullyses.txt ├── demo ├── 01-intro-lapply.R ├── 02-intro-tapply.R ├── 03-download-ebook-to-hdfs.R ├── 04-wordcount-1-algorithm.R ├── 05-wordcount-2-mapreduce.R ├── 06-logistic-regression-iris.R ├── 07-logistic-regression.R ├── 08-logistic-regression-mapreduce.R ├── 09-linear-regression.R └── 10-kmeans.R ├── exercises ├── ex-1-lapply.R ├── ex-2-taxi-local.R ├── ex-3-put-taxi-data-to-dfs.R └── ex-4-taxi-hadoop.R ├── hdinsight ├── r-installer.ps1 ├── remove.ps1 ├── settings.ps1 └── setup.ps1 ├── hive ├── 1-create-external-table.sql └── 2-hive-queries.R ├── images ├── SSCP-matrix.png ├── cluster-structure.png ├── dilbert-big-data-in-the-cloud.png ├── img-hadoop-logical-flow.png ├── img-rmr2.png ├── indeed-job-trend-stats.png ├── jfk-times-square.png ├── mapreduce-weekdays-0.png ├── mapreduce-weekdays-1.png ├── mapreduce-weekdays-2.png ├── r-for-dummies.jpg ├── r-machine-learning.png ├── taxi-tweet.png ├── xkcd-my-job-is-compiling.png └── xkcd-wordcount.png ├── taxi ├── taxi-1-upload.R ├── taxi-2-rmr-local.R ├── taxi-2-rmr-lon-lat.R └── taxi-rmr-3-hadoop.R ├── test.txt └── utils ├── put-taxi-data-to-dfs.R └── sample-taxi-data.R /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Example code in package build process 6 | *-Ex.R 7 | 8 | # RStudio files 9 | .Rproj.user/ 10 | 11 | # produced vignettes 12 | vignettes/*.html 13 | vignettes/*.pdf 14 | 15 | data/trip_data_1.csv 16 | 17 | # knitr artefacts 18 | *.md 19 | *-cache/* 20 | *-figure/* 21 | *_files/* 22 | 23 | 24 | # gh-pages artefacts 25 | .ssas-cache/* 26 | _site/* 27 | .Rproj.user 28 | 29 | 30 | 9-old-demo* -------------------------------------------------------------------------------- /1-Using-R-with-Hadoop.Rpres: -------------------------------------------------------------------------------- 1 | Introduction to Using R with Hadoop 2 | ======================================================== 3 | author: Andrie de Vries & Michele Usuelli 4 | date: 2015-07-01, UseR!2015 5 | width: 1680 6 | height: 1050 7 | css: css/custom.css 8 | 9 | ```{r include=FALSE} 10 | knitr::opts_chunk$set(cache=TRUE) 11 | ``` 12 | 13 | 14 | About us 15 | ======================================================== 16 | 17 | Andrie de Vries 18 | - Programme Manager, Community projects (Microsoft) 19 | - Set up an independent market research firm in 2009 20 | - Joined Revolution Analytics in 2013 21 | - Author of `ggdendro`, `checkpoint` and `miniCRAN` packages on CRAN 22 | - Co-author of *R for Dummies* 23 | 24 | ![](images/r-for-dummies.jpg) 25 | 26 | *** 27 | 28 | Michele Usuelli 29 | - Data Scientist (Microsoft) 30 | - Joined Revolution Analytics in 2014 31 | - Author of *R Machine Learning Essentials* 32 | 33 | 
![](images/r-machine-learning.png)
34 | 
35 | 
36 | 
37 | Connecting to Azure with your browser
38 | ========================================================
39 | 
40 | http://ra-ldn-cluster-master-02.cloudapp.net:8787
41 | 
42 | You should already have received individual login details
43 | 
44 | Why Hadoop?
45 | ========================================================
46 | 
47 | ![](images/indeed-job-trend-stats.png)
48 | 
49 | Source: http://www.indeed.com/jobtrends?q=HTML5%2C+hadoop%2C+SAS&l=
50 | 
51 | Hype central
52 | ========================================================
53 | ![](images/dilbert-big-data-in-the-cloud.png)
54 | 
55 | 
56 | Is your problem big enough for Hadoop?
57 | ================================================
58 | 
59 | When to use Hadoop?
60 | * Conventional processing tools won’t work on your data
61 | * Your data is really BIG
62 |   - Won’t fit/process in your favourite database or file-system
63 | * Your data is really diverse!
64 |   - Semi-structured – JSON, XML, Logs, Images, Sounds
65 | * You’re a whiz at programming
66 | 
67 | ***
68 | When not to use Hadoop?
69 | * !(When to use Hadoop?)
70 | * You’re in a hurry!
71 | 
72 | 
73 | My job is reducing
74 | ==================
75 | ![](images/xkcd-my-job-is-compiling.png)
76 | 
77 | 
78 | 
79 | Some important components
80 | =====================
81 | 
82 | * HDFS
83 |   - distributed file system
84 |   - `rhdfs`
85 | * mapreduce
86 |   - task manager
87 |   - `rmr2`
88 | * hbase
89 |   - NoSQL database
90 |   - `rhbase`
91 | * hive
92 |   - SQL-like database
93 |   - `RHive`
94 | 
95 | 
96 | The Azure cluster
97 | ==========
98 | 
99 | You are using a HortonWorks Hadoop cluster, provisioned in the Microsoft Azure cloud
100 | 
101 | ![](images/cluster-structure.png)
102 | 
103 | MapReduce
104 | ============
105 | type: section
106 | 
107 | MapReduce
108 | =========
109 | 
110 | A programming abstraction
111 | * Applies to many types of big data calculation
112 | 
113 | Hides messy implementation detail in a library
114 | * Implicit parallelisation
115 | * Load balancing
116 | * Reduced data movement
117 | * Robust job / machine failure management
118 | 
119 | ~~You as the programmer don't need to think about this (too much)~~
120 | 
121 | MapReduce solves a generic problem
122 | ==================================
123 | 
124 | * Read a large amount of data
125 | * MAP
126 |   * Extract a summary from each record / block
127 | * Shuffle and sort
128 | * REDUCE
129 |   * Aggregate, filter, transform
130 | 
131 | 
132 | The problem outline is generic – simply implement map and reduce to solve the problem at hand
133 | 
134 | ![](images/img-hadoop-logical-flow.png)
135 | 
136 | 
137 | 
138 | 
139 | The hadoop mapreduce magic
140 | ==========================
141 | type: alert
142 | 
143 | So how does Hadoop do its magic?
144 | 
145 | **Remember: the key is key**
146 | 
147 | The Hadoop promise:
148 | 
149 | **Hadoop guarantees that records with the same key will be processed by the same reducer**
150 | 
151 | This magic happens during the shuffle and sort phase
152 | 
153 | **During shuffle and sort, all items with the same key get moved to the same node**
154 | 
155 | ![](images/img-hadoop-logical-flow.png)
156 | 
157 | 
158 | rmr2
159 | ============
160 | type: section
161 | 
162 | rmr2
163 | ============
164 | 
165 | The `rmr2` package allows you to write R code in the mapper and reducer, without having to know anything about Java.
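As a first taste, here is a minimal sketch of the pattern, run on the in-memory local backend (it is the same example as `demo/01-intro-lapply.R` in this repository):

```{r, eval=FALSE}
library(rmr2)
rmr.options(backend = "local")        # run in memory while developing

small.ints <- to.dfs(1:1000)          # push a small vector into dfs
a <- mapreduce(
  input = small.ints,
  map   = function(k, v) cbind(v, v^2)  # plain R code in the mapper
)
head(values(from.dfs(a)))             # pull the result back into the R session
```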
166 | 
167 | ![](images/img-rmr2.png)
168 | 
169 | 
170 | MapReduce in R pseudo-code
171 | ==========================
172 | 
173 | In the mapper, `v` is available as data – no need for an explicit read statement
174 | 
175 | ```{r, eval=FALSE}
176 | mapper <- function(k, v){
177 |   ...
178 |   keyval(k', v')
179 | }
180 | ```
181 | 
182 | In the reducer, all `v'` with the same `k'` are processed together
183 | 
184 | ```{r, eval=FALSE}
185 | reducer <- function(k', v'){
186 |   ...
187 |   keyval(k'', v'')
188 | }
189 | ```
190 | 
191 | Pass these functions to `mapreduce()`:
192 | 
193 | ```{r, eval=FALSE}
194 | mapreduce(input,
195 |           map = mapper,
196 |           reduce = reducer,
197 |           ...)
198 | ```
199 | 
200 | Testing using the local backend
201 | ===============================
202 | 
203 | Local backend
204 | 
205 | ```{r, eval=FALSE}
206 | rmr.options(backend = "local")
207 | ```
208 | 
209 | * The `rmr2` package has a "local" backend, implemented completely in R
210 | * Useful for development, testing and debugging
211 | * **Since computation runs entirely in memory, on small data, it's fast!**
212 | * This allows easy (and fast) testing before scaling to the "hadoop" backend
213 | 
214 | ***
215 | 
216 | Hadoop backend
217 | 
218 | ```{r, eval=FALSE}
219 | rmr.options(backend = "hadoop")
220 | ```
221 | 
222 | * Switch to this backend once your code works locally
223 | * Computation distributed to hdfs and mapreduce
224 | * Incurs the Hadoop computation overhead
225 | 
226 | 
227 | Sending data R <---> Hadoop
228 | ============================
229 | 
230 | * In a real Hadoop context, your data will be ingested into dfs by dedicated tools, e.g. Sqoop or Flume
231 | * For easy testing and development, the `rmr2` package has two convenience functions that allow you to move data between the R session and dfs
232 | 
233 | ```{r, eval=FALSE}
234 | to.dfs()
235 | from.dfs()
236 | ```
237 | 
238 | Using the mapreduce() function
239 | ==============================
240 | 
241 | ```{r, eval=FALSE}
242 | m <- mapreduce(input,
243 |                input.format,
244 |                map,
245 |                reduce,
246 |                output = NULL, ...)
247 | ```
248 | 
249 | * Specify the `input`, `map` and `reduce` functions
250 | * Optionally, specify `output`, to persist the result in hdfs
251 | * If `output = NULL`, `mapreduce()` returns a temporary `big.data.object`
252 | * A `big.data.object` is a pointer to a temporary file in dfs
253 | 
254 | If you know that the resulting object is small, use `from.dfs()` to return data from dfs into your R session
255 | 
256 | ```{r, eval=FALSE}
257 | m()          # returns the file location of the big.data.object
258 | from.dfs(m)  # available in R session
259 | ```
260 | 
261 | Using the key-value pair
262 | ========================
263 | 
264 | Everything in hadoop uses a key-value pair. (~~Remember: the key is key!~~)
265 | 
266 | Use `keyval()` to create the key-value pair inside your mapper and reducer.
267 | 
268 | ```{r, eval=FALSE}
269 | mapper <- function(k, v){
270 |   ...
271 |   keyval(k', v')
272 | }
273 | reducer <- function(k, v){
274 |   ...
275 |   keyval(k', v')
276 | }
277 | ```
278 | 
279 | 
280 | Use the helper functions `keys()` and `values()` to separate the components of a `big.data.object`
281 | 
282 | ```{r, eval=FALSE}
283 | m <- mapreduce(input, map, reduce, ...)
284 | x <- from.dfs(m)  # available in R session
285 | 
286 | keys(x)
287 | values(x)
288 | ```
289 | 
290 | Cheat sheet for a mapreduce flow using rmr2
291 | =====================
292 | 
293 | Optional: get a sample of data to dfs:
294 | ```{r, eval=FALSE}
295 | hdp.file <- to.dfs(...)
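# e.g. (illustrative only): push a sample of the taxi data into dfs
# taxi.sample <- to.dfs(read.csv("data/trip_data_1_sample.csv", nrows = 1000))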
296 | ``` 297 | 298 | Mapreduce: 299 | 300 | ```{r, eval=FALSE} 301 | map.fun <- function(k, v){...; keyval(k', v')} 302 | reduce.fun <- function(k, v){...; keyval(k', v')} 303 | 304 | m <- mapreduce(input, 305 | map = map.fun, 306 | reduce = reduce.fun, 307 | ... 308 | ) 309 | ``` 310 | 311 | Inspect results 312 | 313 | ```{r, eval=FALSE} 314 | x <- from.dfs(m) 315 | 316 | keys(x) 317 | values(x) 318 | ``` 319 | 320 | 321 | 322 | 323 | 324 | End 325 | === 326 | type: section 327 | 328 | Thank you. 329 | -------------------------------------------------------------------------------- /2-Taxi-analysis-with-RHadoop.Rpres: -------------------------------------------------------------------------------- 1 | Analysing New York taxis with RHadoop 2 | ======================================================== 3 | author: Andrie de Vries & Michele Usuelli 4 | date: 2015-07-01, UseR!2015 5 | width: 1680 6 | height: 1050 7 | css: css/custom.css 8 | 9 | 10 | Motivation: New York taxi data 11 | ============= 12 | 13 | An inconveniently sized data set (~200GB uncompressed CSV). 14 | 15 | Contains information about every single taxi trip in New York over a 4-year period. 16 | 17 | *** 18 | 19 | ![](images/taxi-tweet.png) 20 | 21 | Source: http://chriswhong.com/open-data/foil_nyc_taxi/ 22 | 23 | Introduction to the taxi data 24 | ==== 25 | type: section 26 | 27 | ```{r include=FALSE} 28 | knitr::opts_chunk$set(cache = FALSE) 29 | ``` 30 | 31 | 32 | Introduction to the taxi data 33 | ==== 34 | 35 | The data is at http://publish.illinois.edu/dbwork/open-data/ 36 | 37 | Previous analysis published at 38 | 39 | > Brian Donovan and Daniel B. Work. “Using coarse GPS data to quantify city-scale transportation system resilience to extreme events.” to appear, Transportation Research Board 94th Annual Meeting, August 2014. [preprint](https://www.dropbox.com/s/deruyszudfqrll0/TRB15DonovanWork.pdf), [source code](https://github.com/UIUC-Transportation-Data/gpsresilience). 40 | 41 | 42 | Data dictionary 43 | =============== 44 | 45 | Field | Description 46 | ----- | ----------- 47 | medallion | a permit to operate a yellow taxi cab in New York City, it is effectively a (randomly assigned) car ID. See also medallions. 48 | hack_license | a license to drive the vehicle, it is effectively a (randomly assigned) driver ID. See also hack license. 49 | vendor_id | e.g., Verifone Transportation Systems (VTS), or Mobile Knowledge Systems Inc (CMT), implemented as part of the Technology Passenger Enhancements Project. 50 | rate_code | taximeter rate, see NYCT&L description. 51 | store_and_fwd_flag | unknown attribute. 52 | pickup_datetime | start time of the trip, mm-dd-yyyy hh24:mm:ss EDT. 53 | dropoff_datetime | end time of the trip, mm-dd-yyyy hh24:mm:ss EDT. 54 | passenger_count | number of passengers on the trip, default value is one. 55 | trip_time_in_secs | trip time measured by the taximeter in seconds. 56 | trip_distance | trip distance measured by the taximeter in miles. 57 | pickup_longitude and pickup_latitude | GPS coordinates at the start of the trip. 58 | dropoff_longitude and dropoff_latitude | GPS coordinates at the end of the trip. 
59 | 
60 | Source: http://publish.illinois.edu/dbwork/open-data/
61 | 
62 | Introduction to mapreduce()
63 | ==========
64 | type: section
65 | 
66 | 
67 | Using a local context in rmr2
68 | ======
69 | 
70 | ```{r taxi-2-rmr-local, cache=FALSE, include=FALSE}
71 | read_chunk("taxi/taxi-2-rmr-local.R")
72 | ```
73 | 
74 | ```{r load-packages}
75 | ```
76 | 
77 | 
78 | Exercise 1
79 | ==========
80 | type: cobalt
81 | 
82 | * Open the folder `exercises`
83 | * Open the file `ex-1-lapply.R`
84 | * Source the script
85 | 
86 | You are looking at a simple `mapreduce()` job that simulates `lapply()` in base R.
87 | 
88 | 
89 | make.input.format()
90 | ======
91 | 
92 | The function `make.input.format()` allows you to specify the attributes of your input data.
93 | 
94 | The argument `format = "csv"` specifies a csv file. This is a wrapper around `read.table()`.
95 | 
96 | ```{r make.input.format}
97 | ```
98 | 
99 | from.dfs()
100 | ======
101 | 
102 | Use `from.dfs()` to get a (small) file from dfs into local memory
103 | 
104 | ```{r from.dfs-1}
105 | ```
106 | 
107 | from.dfs()
108 | ======
109 | 
110 | Use `keys()` and `values()` to extract the components
111 | 
112 | ```{r from.dfs-2}
113 | ```
114 | 
115 | Why do the columns not have labels?
116 | ======
117 | type: alert
118 | 
119 | Remember: hdfs splits the individual files across nodes
120 | 
121 | Implications:
122 | * The chunk that's available to your mapper may not contain the header row
123 | * Thus, in general, csv files should not have a header at all!
124 | 
125 | The solution:
126 | * Make a custom input format, specifying the column names
127 | * (in the same way you specify `col.names` to `read.table()`)
128 | 
129 | Make an input format
130 | ======
131 | 
132 | The file `data/dictionary_trip_data.csv` (in the Linux file system) contains a data dictionary:
133 | 
134 | ```{r make.input.format-with-colnames-1}
135 | ```
136 | 
137 | Use this information to configure the input format
138 | 
139 | Make an input format
140 | ======
141 | 
142 | Use the data dictionary information to configure the input format:
143 | 
144 | 
145 | ```{r make.input.format-with-colnames-2}
146 | ```
147 | 
148 | 
149 | mapreduce() without any transformation
150 | ======
151 | 
152 | ```{r mapreduce-1-a}
153 | ```
154 | 
155 | mapreduce() without any transformation
156 | ======
157 | 
158 | ```{r mapreduce-1-b}
159 | ```
160 | 
161 | mapreduce() with a simpler mapper
162 | ======
163 | 
164 | ```{r mapreduce-2}
165 | ```
166 | 
167 | mapreduce() with a simpler mapper
168 | ======
169 | 
170 | ![](images/mapreduce-weekdays-0.png)
171 | 
172 | 
173 | mapreduce() emitting a key-value pair
174 | ======
175 | 
176 | ```{r mapreduce-3}
177 | ```
178 | 
179 | mapreduce() with a reducer
180 | ======
181 | 
182 | ```{r mapreduce-4}
183 | ```
184 | 
185 | mapreduce() with a reducer
186 | ======
187 | 
188 | ![](images/mapreduce-weekdays-1.png)
189 | 
190 | Can we write a more sensible mapper and reducer?
191 | ======
192 | 
193 | ![](images/mapreduce-weekdays-2.png)
194 | 
195 | mapreduce() with a more sensible mapper
196 | ======
197 | 
198 | ```{r mapreduce-5}
199 | ```
200 | 
201 | mapreduce() the final step
202 | ======
203 | 
204 | ```{r mapreduce-6}
205 | ```
206 | 
207 | Exercise 2
208 | ==========
209 | type: cobalt
210 | 
211 | * Open the folder `exercises`
212 | * Open the file `ex-2-taxi-local.R`
213 | * Source the script
214 | 
215 | This file contains a short `mapreduce()` job on the taxi data.
216 | 
217 | Run the code, section by section.
218 | 
219 | Ensure you understand what happens.
220 | 
221 | Try to compute the mean number of passengers.
222 | 
223 | Hint:
224 | 
225 | * The mapper should compute the sum and count of passengers
226 | * The reducer should compute `mean <- sum / count`
227 | 
228 | (A sketch of one possible solution follows below.)
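A sketch of one possible solution (it assumes the `taxi.format` input format defined in `ex-2-taxi-local.R`; the details are illustrative, not the only way to do it):

```{r, eval=FALSE}
m <- mapreduce(
  input = "data/trip_data_1_sample.csv",
  input.format = taxi.format,
  map = function(k, v){
    # emit this chunk's sum and count of passengers under a single key
    keyval(1, cbind(sum = sum(v$passenger_count), count = nrow(v)))
  },
  reduce = function(k, v){
    # combine the per-chunk totals, then mean <- sum / count
    keyval(k, sum(v[, "sum"]) / sum(v[, "count"]))
  }
)
values(from.dfs(m))
```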
229 | Analysing by hour
230 | =================
231 | type: section
232 | 
233 | Doing the analysis by hour
234 | ==========================
235 | 
236 | ```{r mapreduce-7-a}
237 | ```
238 | 
239 | Doing the analysis by hour
240 | ==========================
241 | 
242 | ```{r mapreduce-7-b}
243 | ```
244 | 
245 | 
246 | Plotting the results
247 | ====================
248 | 
249 | ```{r mapreduce-7-plot-1, out.width="1500px", out.height="1000px"}
250 | ```
251 | 
252 | Plotting the results
253 | ====================
254 | 
255 | ```{r mapreduce-7-plot-2, echo=FALSE, out.width="1500px", out.height="1000px"}
256 | ```
257 | 
258 | 
259 | Putting files in hdfs with rhdfs
260 | ===============
261 | type: section
262 | 
263 | rhdfs function overview
264 | =======================
265 | 
266 | * Initialize
267 |   - `hdfs.init()`
268 |   - `hdfs.defaults()`
269 | * File and directory manipulation
270 |   - `hdfs.ls()`
271 |   - `hdfs.delete()`
272 |   - `hdfs.mkdir()`
273 |   - `hdfs.exists()`
274 | * Copy and move from local <-> HDFS
275 |   - `hdfs.put()`
276 |   - `hdfs.get()`
277 | 
278 | ***
279 | 
280 | * Manipulate files within HDFS
281 |   - `hdfs.copy()`
282 |   - `hdfs.move()`
283 |   - `hdfs.rename()`
284 | * Reading files directly from HDFS
285 |   - `hdfs.file()`
286 |   - `hdfs.read()`
287 |   - `hdfs.write()`
288 |   - `hdfs.flush()`
289 |   - `hdfs.seek()`
290 |   - `hdfs.tell(con)`
291 |   - `hdfs.close()`
292 |   - `hdfs.line.reader()`
293 |   - `hdfs.read.text.file()`
294 | 
295 | 
296 | Putting the data from the local file system to dfs
297 | ============
298 | 
299 | You use the `rhdfs` package to manipulate files in the hadoop dfs.
300 | 
301 | ```{r put-taxi-in-dfs, cache=FALSE, include=FALSE}
302 | read_chunk("utils/put-taxi-data-to-dfs.R")
303 | ```
304 | 
305 | ```{r rhdfs, eval=FALSE}
306 | ```
307 | 
308 | 
309 | Exercise 3
310 | ==========
311 | type: cobalt
312 | 
313 | * Open the folder `exercises`
314 | * Open the file `ex-3-put-taxi-data-to-dfs.R`
315 | * Source the script
316 | 
317 | Use the `rhdfs` functions to satisfy yourself that the files are in hadoop.
318 | 
319 | Hint: Use `hdfs.ls()`
320 | 
321 | Running the script in Hadoop compute context
322 | =================
323 | type: section
324 | 
325 | 
326 | Running the script in Hadoop compute context
327 | =================
328 | 
329 | Once you've tested your script in the local context, it is generally very easy to deploy on all your data.
330 | 
331 | ```{r, eval=FALSE}
332 | rmr.options(backend = "hadoop")
333 | ```
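In outline, scaling up is just a backend switch followed by the same `mapreduce()` call you tested locally. A sketch of the pattern (the dfs input path and the `taxi.format` object are assumptions carried over from the earlier slides):

```{r, eval=FALSE}
rmr.options(backend = "hadoop")                    # same code, now distributed
m <- mapreduce(input = "/user/share/taxi/sample",  # a dfs path, not a local file
               input.format = taxi.format,
               map = map.fun,                      # the functions you tested locally
               reduce = reduce.fun)
from.dfs(m)                                        # small results back into the R session
```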
334 | 
335 | What the hadoop output means
336 | ============
337 | 
338 | ```
339 | packageJobJar: [] [/usr/lib/hadoop-mapreduce/hadoop-streaming-2.4.0.2.1.5.0-695.jar] /tmp/streamjob4577673905010649130.jar tmpDir=null
340 | 15/06/23 14:18:01 INFO impl.TimelineClientImpl: Timeline service address: http://0.0.0.0:8188/ws/v1/timeline/
341 | 15/06/23 14:18:01 INFO client.RMProxy: Connecting to ResourceManager at ra-ldn-cluster-master-02.cloudapp.net/172.16.0.5:8050
342 | 15/06/23 14:18:02 INFO impl.TimelineClientImpl: Timeline service address: http://0.0.0.0:8188/ws/v1/timeline/
343 | ....
344 | 15/06/23 14:18:17 INFO mapreduce.Job: Job job_1434365936669_0041 running in uber mode : false
345 | 15/06/23 14:18:17 INFO mapreduce.Job:  map 0% reduce 0%
346 | 15/06/23 14:18:31 INFO mapreduce.Job:  map 1% reduce 0%
347 | 15/06/23 14:18:34 INFO mapreduce.Job:  map 17% reduce 0%
348 | 15/06/23 14:18:40 INFO mapreduce.Job:  map 19% reduce 0%
349 | 15/06/23 14:18:41 INFO mapreduce.Job:  map 21% reduce 0%
350 | 15/06/23 14:18:47 INFO mapreduce.Job:  map 37% reduce 0%
351 | 15/06/23 14:18:48 INFO mapreduce.Job:  map 92% reduce 0%
352 | 15/06/23 14:18:49 INFO mapreduce.Job:  map 92% reduce 19%
353 | 15/06/23 14:18:50 INFO mapreduce.Job:  map 100% reduce 19%
354 | 15/06/23 14:18:52 INFO mapreduce.Job:  map 100% reduce 100%
355 | 15/06/23 14:19:00 INFO mapreduce.Job: Job job_1434365936669_0041 completed successfully
356 | ...
357 | rmr
358 | reduce calls=24
359 | 15/06/23 14:19:01 INFO streaming.StreamJob: Output directory: /tmp/file55a346591243
360 | ```
361 | 
362 | Exercise 4
363 | ==========
364 | type: cobalt
365 | 
366 | * Open the folder `exercises`
367 | * Open the file `ex-4-taxi-hadoop.R`
368 | * Source the script
369 | 
370 | 
371 | 
372 | Summary
373 | =========
374 | type: section
375 | 
376 | 
377 | Summary
378 | =========
379 | 
380 | In this session you:
381 | 
382 | * Used an Azure cluster running HortonWorks
383 | * Developed scripts in local context with the `rmr2` package:
384 |   - Wrote mappers
385 |   - Wrote reducers
386 |   - Used `mapreduce()`
387 | * Returned data to your local R session, for plotting
388 | * Uploaded files into hdfs using `rhdfs`
389 | * Ran the script directly in hadoop, using the hadoop context
390 | 
391 | Questions
392 | =========
393 | type: section
394 | 
395 | 
396 | Questions
397 | =========
398 | 
399 | 
400 | ?
401 | ?
402 | ?
403 | 
404 | 
405 | 
406 | End
407 | ===
408 | type: section
409 | 
410 | Thank you.
411 | -------------------------------------------------------------------------------- /4-Computing-on-distributed-matrices.Rpres: --------------------------------------------------------------------------------
1 | Computing on distributed matrices
2 | ========================================================
3 | author: Andrie de Vries & Michele Usuelli
4 | date: 2015-07-01, UseR!2015
5 | width: 1680
6 | height: 1050
7 | css: css/custom.css
8 | 
9 | 
10 | Linear regression
11 | =============
12 | 
13 | $$Y = X \beta + e$$
14 | 
15 | To solve for $\beta$, create a loss function, differentiate and solve for a minimum:
16 | 
17 | $$\beta = (X'X)^{-1} X'Y$$
18 | 
19 | Re-arrange:
20 | 
21 | $$(X'X) \beta = X'Y$$
22 | 
23 | The R function `b <- solve(x, y)` gives the solution to $xb = y$
24 | 
25 | ```{r, eval=FALSE}
26 | b <- solve(t(x) %*% x, t(x) %*% y)
27 | ```
28 | 
29 | The math of distributed computing
30 | =================================
31 | 
32 | * Commutative property
33 | 
34 | $$a + b == b + a$$
35 | 
36 | * Associative property
37 | 
38 | $$(a + b) + c == a + (b + c)$$
39 | 
40 | * Distributive property
41 | 
42 | $$a * (b + c) == (a * b) + (a * c)$$
43 | 
44 | 
45 | 
46 | Computing X'X is a distributive operation
47 | =============
48 | 
49 | ![](images/SSCP-matrix.png)
50 | 
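Why this matters: if the rows of $X$ are split across nodes into blocks $X_1, \dots, X_m$ (with matching blocks $Y_1, \dots, Y_m$), the cross-products decompose as

$$X'X = \sum_{i=1}^{m} X_i'X_i \qquad X'Y = \sum_{i=1}^{m} X_i'Y_i$$

so each mapper can compute a small cross-product on its own block, and a single reducer simply sums the pieces. This is exactly the pattern of the `XtX` and `XtY` chunks that follow.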
51 | Regression
52 | ==========
53 | 
54 | ```{r taxi-2-rmr-local, cache=FALSE, include=FALSE}
55 | read_chunk("demo/09-linear-regression.R")
56 | ```
57 | 
58 | ```{r load-packages}
59 | ```
60 | 
61 | Regression
62 | ==========
63 | 
64 | ```{r generate-data}
65 | ```
66 | 
67 | Regression
68 | ==========
69 | 
70 | ```{r sum-function}
71 | ```
72 | 
73 | Regression
74 | ==========
75 | 
76 | ```{r XtX}
77 | ```
78 | 
79 | Regression
80 | ==========
81 | 
82 | ```{r XtY}
83 | ```
84 | 
85 | Regression
86 | ==========
87 | 
88 | ```{r solve}
89 | ```
90 | -------------------------------------------------------------------------------- /5-hive.Rpres: --------------------------------------------------------------------------------
1 | Analysing New York taxis with RHive
2 | ========================================================
3 | author: Andrie de Vries & Michele Usuelli
4 | date: 2015-07-01, UseR!2015
5 | width: 1680
6 | height: 1050
7 | css: css/custom.css
8 | 
9 | 
10 | 
11 | Introduction to Hive
12 | ====
13 | 
14 | - Hadoop data warehouse
15 | - SQL-like interface
16 | - Language: HQL (Hive Query Language)
17 | - HQL translated into MapReduce jobs
18 | - R interface: RHive, RevoScaleR
19 | - Hue web interface
20 | 
21 | 
22 | 
23 | Using Hive with R
24 | ====
25 | 
26 | - Building queries using string manipulation tools like `sprintf` and `paste`
27 | - Running queries using RHive
28 | 
29 | 
30 | Syntax difference between Hive and RHive
31 | ====
32 | type: alert
33 | 
34 | - Hive: semi-colon at the end
35 | 
36 | ```
37 | SELECT COUNT(*) FROM table_data;
38 | ```
39 | 
40 | - RHive: no semi-colon at the end
41 | 
42 | ```
43 | SELECT COUNT(*) FROM table_data
44 | ```
45 | 
46 | 
47 | 
48 | 
49 | 
50 | Computing the number of taxi rides by hour
51 | ====
52 | 
53 | Using Hive, we can replicate the rmr2 example counting the number of taxi rides by hour. The steps are:
54 | 
55 | 1. Importing the data
56 | 2. Querying the data
57 | 
58 | 
59 | 
60 | 
61 | Defining an external table
62 | ====
63 | 
64 | Starting from our data, we can create a Hive table. Since the data is already in CSV format, the easiest option is to create an external table.
65 | 
66 | We need to specify
67 | 
68 | - The data location
69 | - The data format
70 | - The field formats
71 | 
72 | 
73 | 
74 | 
75 | Query to create an external table
76 | ====
77 | 
78 | ```
79 | CREATE EXTERNAL TABLE taxi_sample(medallion STRING, hack_license STRING, vendor_id STRING, rate_code INT, store_and_fwd_flag STRING, pickup_datetime STRING, dropoff_datetime STRING, passenger_count INT, trip_time_in_secs INT, trip_distance FLOAT, pickup_longitude FLOAT, pickup_latitude FLOAT, dropoff_longitude FLOAT, dropoff_latitude FLOAT) ROW FORMAT
80 | DELIMITED FIELDS TERMINATED BY ','
81 | LINES TERMINATED BY '\n'
82 | STORED AS TEXTFILE
83 | LOCATION '/user/share/taxi/sample';
84 | ```
85 | 
86 | 
87 | 
88 | Query to create an external table: defining the field format
89 | ====
90 | 
91 | The first part of the query defines a table called *taxi_sample* and the format of its fields.
92 | 
93 | ```
94 | CREATE EXTERNAL TABLE taxi_sample(medallion STRING, hack_license STRING, vendor_id STRING, rate_code INT, store_and_fwd_flag STRING, pickup_datetime STRING, dropoff_datetime STRING, passenger_count INT, trip_time_in_secs INT, trip_distance FLOAT, pickup_longitude FLOAT, pickup_latitude FLOAT, dropoff_longitude FLOAT, dropoff_latitude FLOAT) ROW FORMAT
95 | ```
96 | 
97 | 
98 | 
99 | Query to create an external table: defining the data format
100 | ====
101 | 
102 | This part of the query defines the data format.
103 | 
104 | ```
105 | DELIMITED FIELDS TERMINATED BY ','
106 | LINES TERMINATED BY '\n'
107 | STORED AS TEXTFILE
108 | ```
109 | 
110 | 
111 | Query to create an external table: defining the data location
112 | ====
113 | 
114 | This part of the query defines the HDFS path to the data folder.
115 | 
116 | ```
117 | LOCATION '/user/share/taxi/sample';
118 | ```
119 | 
120 | 
121 | Setting up RHive
122 | ====
123 | 
124 | ```{r 2-hive-queries.R, cache=FALSE, include=FALSE}
125 | read_chunk("hive/2-hive-queries.R")
126 | ```
127 | 
128 | ```{r configure-rhive}
129 | ```
130 | 
131 | 
132 | 
133 | 
134 | Simple query: count the number of records
135 | ====
136 | 
137 | 
138 | ```{r row-count}
139 | ```
140 | 
141 | ```{r run-row-count, cache=TRUE}
142 | ```
143 | 
144 | 
145 | 
146 | Simple query: count the number of records
147 | ====
148 | 
149 | 
150 | ```{r build-row-count}
151 | ```
152 | 
153 | ```{r run-row-count, cache=TRUE}
154 | ```
155 | 
156 | 
157 | 
158 | 
159 | Define the hour
160 | ====
161 | 
162 | ```{r define-hour}
163 | ```
164 | 
165 | 
166 | 
167 | 
168 | 
169 | Define the hour
170 | ====
171 | 
172 | ```{r build-define-hour}
173 | ```
174 | 
175 | 
176 | 
177 | Define the hour
178 | ====
179 | 
180 | ```{r run-define-hour, cache=TRUE}
181 | ```
182 | 
183 | 
184 | 
185 | Count by hour
186 | ====
187 | 
188 | ```{r count-by-hour}
189 | ```
190 | 
191 | 
192 | 
193 | 
194 | Count by hour
195 | ====
196 | 
197 | ```{r build-count-by-hour}
198 | ```
199 | 
200 | 
201 | 
202 | Count by hour
203 | ====
204 | 
205 | ```{r run-count-by-hour, cache=TRUE}
206 | ```
207 | 
208 | 
209 | 
210 | Conclusions
211 | ====
212 | 
213 | - Hive allows you to define group-by queries by writing SQL-like code
214 | - The underlying MapReduce code is hidden from the user
215 | - R's string manipulation tools allow you to easily build complex queries
216 | - RHive allows you to run Hive queries directly from R
217 | - For more complex queries, we still need to use other tools like rmr2
218 | 
219 | 
220 | 
221 | 
222 | 
223 | -------------------------------------------------------------------------------- /LICENSE:
-------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. 
The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RHadoop-tutorial 2 | A tutorial on R and Hadoop, using the RHadoop project 3 | 4 | ##Slides 5 | 6 | 1. [Using R with Hadoop](http://htmlpreview.github.io/?https://github.com/andrie/RHadoop-tutorial/blob/master/1-Using-R-with-Hadoop.html#/) 7 | 2. [Taxi analysis with RHadoop](http://htmlpreview.github.io/?https://github.com/andrie/RHadoop-tutorial/blob/master/2-Taxi-analysis-with-RHadoop.html) 8 | 3. [Computing on distributed matrices](http://htmlpreview.github.io/?https://github.com/andrie/RHadoop-tutorial/blob/master/4-Computing-on-distributed-matrices.html) 9 | 4. 
[Using hive](http://htmlpreview.github.io/?https://github.com/andrie/RHadoop-tutorial/blob/master/5-hive.html) 10 | 11 | -------------------------------------------------------------------------------- /css/custom.css: -------------------------------------------------------------------------------- 1 | .reveal section del { 2 | color: red; 3 | } 4 | 5 | .section .reveal .state-background { 6 | background: #00F url("css/mslogo.png") no-repeat 5% 95%; 7 | } 8 | 9 | 10 | 11 | /* 12 | .title-blue .reveal .state-background { 13 | background: #00F url("mslogo.png") no-repeat 5% 95%; 14 | } 15 | */ 16 | -------------------------------------------------------------------------------- /css/mslogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/css/mslogo.png -------------------------------------------------------------------------------- /css/title-background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/css/title-background.png -------------------------------------------------------------------------------- /data/dictionary_trip_data.csv: -------------------------------------------------------------------------------- 1 | "medallion","hack_license","vendor_id","rate_code","store_and_fwd_flag","pickup_datetime","dropoff_datetime","passenger_count","trip_time_in_secs","trip_distance","pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude" 2 | "integer","integer","character","integer","character","character","character","integer","integer","numeric","numeric","numeric","numeric","numeric" 3 | -------------------------------------------------------------------------------- /demo/01-intro-lapply.R: -------------------------------------------------------------------------------- 1 | ## @knitr load-packages 2 | library(rmr2) 3 | library(rhdfs) 4 | hdfs.init() 5 | 6 | rmr.options(backend = "local") 7 | 8 | ## @knitr R --------------------------------------------------------------- 9 | 10 | x <- 1:1000 11 | lapply(x, function(x)cbind(x, x^2)) 12 | 13 | 14 | ## @knitr rmr ------------------------------------------------------------- 15 | 16 | small.ints = to.dfs(1:1000) 17 | 18 | a <- mapreduce( 19 | input = small.ints, 20 | map = function(k, v) cbind(v, v^2) 21 | ) 22 | 23 | a() 24 | from.dfs(a) 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /demo/02-intro-tapply.R: -------------------------------------------------------------------------------- 1 | ## @knitr load-packages 2 | library(rmr2) 3 | library(rhdfs) 4 | hdfs.init() 5 | 6 | rmr.options(backend = "local") 7 | 8 | ## @knitr R --------------------------------------------------------------- 9 | 10 | groups <- rbinom(32, n = 50, prob = 0.4) 11 | tapply(groups, groups, length) 12 | 13 | 14 | ## @knitr rmr ------------------------------------------------------------- 15 | 16 | dfs.groups <- to.dfs(groups) 17 | 18 | x <- mapreduce(input = dfs.groups, 19 | map = function(., v) keyval(v, 1), 20 | reduce = function(k, vv) keyval(k, length(vv)) 21 | ) 22 | 23 | y <- from.dfs(x) 24 | 25 | as.data.frame(y)[order(y[["key"]]), ] 26 | -------------------------------------------------------------------------------- /demo/03-download-ebook-to-hdfs.R: 
-------------------------------------------------------------------------------- 1 | library(rmr2) 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | rmr.options(backend = "local") 6 | 7 | # ------------------------------------------------------------------------- 8 | 9 | dir.create("data") 10 | ebookLocal <- "data/ullyses.txt" 11 | 12 | if(!file.exists(ebookLocal)) { 13 | download.file(url = "http://www.gutenberg.org/ebooks/4300.txt.utf-8", 14 | destfile = ebookLocal) 15 | } 16 | 17 | 18 | 19 | file.exists(ebookLocal) 20 | readLines(ebookLocal, n = 50) 21 | 22 | 23 | # Copy file to HDFS ------------------------------------------------------- 24 | 25 | ebookHadoop <- dirname(ebookLocal) 26 | hdfs.dircreate(ebookHadoop) 27 | hdfs.ls(ebookHadoop) 28 | 29 | hdfs.put(src = ebookLocal, dest = ebookHadoop) 30 | 31 | hdfs.ls(ebookHadoop) 32 | -------------------------------------------------------------------------------- /demo/04-wordcount-1-algorithm.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | rmr.options(backend = "local") 6 | 7 | # ------------------------------------------------------------------------- 8 | 9 | # Script to perform word count -------------------------------------------- 10 | 11 | ebookLocation <- "data/ullyses.txt" 12 | dat <- readLines(ebookLocation, n = 100) 13 | words <- unlist(strsplit(dat, split = "[[:space:][:punct:]]")) 14 | words <- tolower(words) 15 | words <- gsub("[0-9]", "", words) 16 | words <- words[words != ""] 17 | wordcount <- table(words) 18 | keyval( 19 | key = names(wordcount), 20 | val = as.numeric(wordcount) 21 | ) 22 | 23 | 24 | # Function to do word count ----------------------------------------------- 25 | 26 | wordcount <- function(location, n = -1L){ 27 | dat <- readLines(location, n = n) 28 | words <- unlist(strsplit(dat, split = "[[:space:][:punct:]]")) 29 | words <- tolower(words) 30 | words <- gsub("[0-9]", "", words) 31 | words <- words[words != ""] 32 | words <- words[!is.na(words)] 33 | x <- table(words) 34 | keyval( 35 | key = names(x), 36 | val = as.numeric(x) 37 | ) 38 | } 39 | 40 | x <- wordcount("data/ullyses.txt", n = -1) 41 | lapply(x, head, 10) 42 | lapply(x, tail, 10) 43 | -------------------------------------------------------------------------------- /demo/05-wordcount-2-mapreduce.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | rmr.options(backend = "local") 6 | 7 | # Word count -------------------------------------------------------------- 8 | 9 | ebookLocation <- "data/ullyses.txt" 10 | 11 | m <- mapreduce(input = ebookLocation, 12 | input.format = "text", 13 | 14 | map = function(k, v){ 15 | words <- unlist(strsplit(v, split = "[[:space:][:punct:]]")) 16 | words <- tolower(words) 17 | words <- gsub("[0-9]", "", words) 18 | words <- words[words != ""] 19 | wordcount <- table(words) 20 | keyval( 21 | key = names(wordcount), 22 | val = as.numeric(wordcount) 23 | ) 24 | }, 25 | 26 | reduce = function(k, counts){ 27 | keyval(key = k, 28 | val = sum(counts)) 29 | } 30 | ) 31 | 32 | 33 | # Retrieve results and prepare to plot ------------------------------------ 34 | 35 | 36 | x <- from.dfs(m) 37 | dat <- data.frame( 38 | word = keys(x), 39 | count = values(x) 40 | ) 41 | dat <- dat[order(dat$count, decreasing=TRUE), ] 42 | head(dat, 50) 43 | with(head(dat, 25), plot(count, names = word)) 44 | 
-------------------------------------------------------------------------------- /demo/06-logistic-regression-iris.R: -------------------------------------------------------------------------------- 1 | iris2 <- transform(iris, 2 | Setosa = Species == "virginica", 3 | Species = NULL 4 | ) 5 | 6 | model <- glm(Setosa ~ ., data = iris2, family = binomial) 7 | table(iris2$Setosa, 8 | as.logical(round( 9 | predict(model, iris2, type = "response") 10 | , 2)) 11 | ) 12 | -------------------------------------------------------------------------------- /demo/07-logistic-regression.R: -------------------------------------------------------------------------------- 1 | gdescent <- function(input, iterations, dims, alpha){ 2 | 3 | plane = t(rep(0, dims)) 4 | M <- input 5 | for (i in 1:iterations) { 6 | # map 7 | Y <- M[, 1] 8 | X <- M[, -1] 9 | map <- Y * X * plogis(-Y * as.numeric(X %*% t(plane))) 10 | # reduce 11 | gradient <- colSums(map) 12 | 13 | plane <- plane + alpha * gradient 14 | } 15 | plane 16 | } 17 | 18 | 19 | # ------------------------------------------------------------------------ 20 | 21 | library(ggplot2) 22 | mean(diamonds$price) 23 | quantile(diamonds$price) 24 | glm(price > 5324 ~ ., data = diamonds, family = binomial) 25 | iris2 <- transform(iris, 26 | Virginica = Species == "versicolor", 27 | Species = NULL 28 | ) 29 | str(iris2) 30 | 31 | dat <- cbind(Virginica = iris2$Virginica * 2 - 1, 32 | model.matrix(Virginica ~ ., iris2) 33 | ) 34 | gdescent(dat, dims = 5, iterations = 1000, alpha = 0.01) 35 | 36 | coef(glm(Virginica ~ ., data = iris2, family = binomial)) 37 | -------------------------------------------------------------------------------- /demo/08-logistic-regression-mapreduce.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | rmr.options(backend = "local") 3 | 4 | logistic.regression <- function(input, iterations, dims, alpha){ 5 | 6 | plane <- t(rep(0, dims)) 7 | g <- function(z) 1 / (1 + exp(-z)) 8 | 9 | lr.map <- function(., M) { 10 | Y <- M[,1] 11 | X <- M[,-1] 12 | keyval( 13 | 1, 14 | Y * X * g(-Y * as.numeric(X %*% t(plane))) 15 | ) 16 | } 17 | 18 | lr.reduce <- function(k, Z){ 19 | keyval(k, t(as.matrix(apply(Z, 2, sum)))) 20 | } 21 | 22 | for (i in 1:iterations) { 23 | x <- mapreduce( 24 | input, 25 | map = lr.map, 26 | reduce = lr.reduce, 27 | combine = TRUE 28 | ) 29 | gradient <- values(from.dfs(x)) 30 | plane <- plane + alpha * gradient 31 | } 32 | plane 33 | } 34 | 35 | 36 | 37 | # Create design matrix ---------------------------------------------------- 38 | 39 | iris2 <- transform(iris, 40 | Virginica = Species == "virginica", 41 | Species = NULL 42 | ) 43 | 44 | 45 | dat <- cbind(Virginica = iris2$Virginica * 2 - 1, 46 | model.matrix(Virginica ~ ., iris2) 47 | ) 48 | str(dat) 49 | head(dat) 50 | 51 | # Send design matrix to dfs ----------------------------------------------- 52 | 53 | hdp.iris2 <- to.dfs(dat) 54 | hdp.iris2() 55 | from.dfs(hdp.iris2) 56 | model <- logistic.regression(hdp.iris2, dims = 5, iterations = 5, alpha = 0.1) 57 | 58 | model 59 | 60 | 61 | # Inspect confusion matrix ------------------------------------------------ 62 | 63 | # table(iris2$Virginica, 64 | # as.logical(round( 65 | # predict(model, iris2, type = "response") 66 | # , 2)) 67 | # ) 68 | -------------------------------------------------------------------------------- /demo/09-linear-regression.R: -------------------------------------------------------------------------------- 1 | ## @knitr load-packages 
--------------------------------------------------- 2 | library(rmr2) 3 | library(rhdfs) 4 | hdfs.init() 5 | 6 | rmr.options(backend = "local") 7 | 8 | ## @knitr generate-data --------------------------------------------------- 9 | 10 | X <- matrix(rnorm(2000), ncol = 10) 11 | y <- as.matrix(rnorm(200)) 12 | design.mat <- cbind(y, X) 13 | keyed.design <- to.dfs(design.mat) 14 | 15 | ## @knitr sum-function --------------------------------------------------- 16 | # A reusable reduce function that sums a list of matrices, ignoring the key. 17 | 18 | Sum <- function(., YY) keyval(1, list(Reduce('+', YY))) 19 | 20 | ## @knitr XtX ------------------------------------------------------------- 21 | 22 | # The big matrix is passed to the mapper in chunks of complete rows. Smaller cross-products are computed for these submatrices and passed on to a single reducer, which sums them together. Since there is a single key, a combiner is essential to avoid overloading the single reducer, and since matrix addition is associative and commutative, the reducer can safely double as the combiner here. 23 | 24 | XtX <- values(from.dfs( 25 | mapreduce(input = keyed.design, 26 | map = function(., Xi) { 27 | yi = Xi[, 1] 28 | Xi = Xi[, -1] 29 | keyval(1, list(t(Xi) %*% Xi)) 30 | }, 31 | reduce = Sum, 32 | combine = TRUE) 33 | ))[[1]] 34 | 35 | ## @knitr XtY ------------------------------------------------------------- 36 | 37 | # The same approach applies to the vector y. 38 | 39 | Xty <- values(from.dfs( 40 | mapreduce(input = keyed.design, 41 | map = function(., Xi) { 42 | yi = Xi[, 1] 43 | Xi = Xi[, -1] 44 | keyval(1, list(t(Xi) %*% yi)) 45 | }, 46 | reduce = Sum, 47 | combine = TRUE) 48 | ))[[1]] 49 | 50 | ## @knitr solve ----------------------------------------------------------- 51 | 52 | # Finally, solve the normal equations XtX %*% beta = Xty for the coefficients. 53 | 54 | solve(XtX, Xty) 55 | -------------------------------------------------------------------------------- /demo/10-kmeans.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | library(rmr2) 16 | rmr.options(backend = "local") 17 | 18 | ## kmeans-signature 19 | kmeans.mr <- function(P, num.clusters, num.iter, combine, in.memory.combine) { 20 | ## kmeans-dist.fun 21 | dist.fun <- function(C, P) apply(C, 1, function(x) colSums((t(P) - x)^2)) 22 | 23 | ## kmeans.map 24 | kmeans.map <- function(., P) { 25 | nearest <- if(is.null(C)) 26 | sample(1:num.clusters, nrow(P), replace = TRUE) 27 | else { 28 | D <- dist.fun(C, P) 29 | nearest <- max.col(-D) 30 | } 31 | 32 | if(!(combine || in.memory.combine)) 33 | keyval(nearest, P) 34 | else 35 | keyval(nearest, cbind(1, P))} 36 | 37 | ## kmeans.reduce 38 | kmeans.reduce <- if (!(combine || in.memory.combine) ) 39 | function(., P) t(as.matrix(apply(P, 2, mean))) 40 | else 41 | function(k, P) keyval(k, t(as.matrix(apply(P, 2, sum)))) 42 | 43 | ## kmeans-main-1 44 | C <- NULL 45 | for(i in 1:num.iter ) { 46 | C <- values(from.dfs( 47 | mapreduce(P, 48 | map = kmeans.map, 49 | reduce = kmeans.reduce 50 | ) 51 | )) 52 | if(combine || in.memory.combine) 53 | C <- C[, -1] / C[, 1] 54 | ## end 55 | # points(C, col = i + 1, pch = 19) 56 | ## kmeans-main-2 57 | if(nrow(C) < num.clusters) { 58 | C <- rbind(C, matrix( 59 | rnorm((num.clusters - nrow(C)) * nrow(C)), 60 | ncol = nrow(C)) %*% C 61 | ) 62 | } 63 | } 64 | C 65 | } 66 | ## end 67 | 68 | ## sample runs 69 | ## 70 | 71 | out <- list() 72 | 73 | for(be in c("local")) { 74 | rmr.options(backend = be) 75 | set.seed(0) 76 | ## kmeans-data 77 | P <- do.call(rbind, 78 | rep(list(matrix( 79 | rnorm(10, sd = 10), 80 | ncol=2)), 81 | 20)) + 82 | matrix(rnorm(200), ncol = 2) 83 | ## end 84 | out[[be]] = 85 | ## kmeans-run 86 | kmeans.mr(to.dfs(P), 87 | num.clusters = 12, 88 | num.iter = 5, 89 | combine = FALSE, 90 | in.memory.combine = FALSE 91 | ) 92 | ## end 93 | } 94 | 95 | # we would love to verify this against the hadoop backend, but kmeans is randomized in a way that makes it hard to be completely reproducible 96 | # stopifnot(rmr2:::cmp(out[['hadoop']], out[['local']])) 97 | out[["local"]] 98 | -------------------------------------------------------------------------------- /exercises/ex-1-lapply.R: -------------------------------------------------------------------------------- 1 | ## load-packages 2 | library(rmr2) 3 | library(rhdfs) 4 | hdfs.init() 5 | 6 | rmr.options(backend = "local") 7 | 8 | ## R --------------------------------------------------------------- 9 | 10 | x <- 1:1000 11 | lapply(x, function(x) cbind(x, x^2)) 12 | 13 | 14 | ## rmr ------------------------------------------------------------- 15 | 16 | small.ints <- to.dfs(1:1000) 17 | 18 | a <- mapreduce( 19 | input = small.ints, 20 | map = function(k, v) cbind(v, v^2) 21 | ) 22 | 23 | a() 24 | from.dfs(a) 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /exercises/ex-2-taxi-local.R: -------------------------------------------------------------------------------- 1 | ## load-packages --------------------------------------------------- 2 | library(rmr2) 3 | rmr.options(backend = "local") 4 | 5 | 6 | taxi.hdp <- "data/trip_data_1_sample.csv" 7 | 8 | ## make.input.format-with-colnames-1 ------------------------------- 9 | 10 | headerInfo <- read.csv("data/dictionary_trip_data.csv", stringsAsFactors = FALSE) 11 | headerInfo 12 | colClasses <- as.character(as.vector(headerInfo[1, ])) 13 | names(headerInfo) 14 | colClasses 15 | 16 | taxi.format <- make.input.format(format = "csv", sep = ",", 17 | col.names = names(headerInfo), 18 | colClasses = colClasses, 19 | stringsAsFactors = FALSE
) 21 | 22 | x <- from.dfs(taxi.hdp, format = taxi.format) 23 | str(values(x)) 24 | 25 | 26 | taxi.map <- function(k, v){ 27 | original <- v[[6]] 28 | date <- as.Date(original, origin = "1970-01-01") 29 | wkday <- weekdays(date) 30 | dat <- data.frame(date, wkday) 31 | z <- aggregate(date ~ wkday, dat, FUN = length) 32 | keyval(z[[1]], z[[2]]) 33 | } 34 | 35 | taxi.reduce <- function(k, v){ 36 | data.frame(weekday = k, trips = sum(v), row.names = k) 37 | } 38 | 39 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 40 | map = taxi.map, 41 | reduce = taxi.reduce 42 | ) 43 | keys(from.dfs(m)) 44 | values(from.dfs(m)) 45 | 46 | -------------------------------------------------------------------------------- /exercises/ex-3-put-taxi-data-to-dfs.R: -------------------------------------------------------------------------------- 1 | library(rhdfs) 2 | hdfs.init() 3 | 4 | # List taxi data files in local file system 5 | localFiles <- dir("data", pattern = "_sample.csv", full.names = TRUE) 6 | localFiles 7 | 8 | # Put files into dfs 9 | hdfs.mkdir("taxi/sample") 10 | hdfs.put(localFiles, "taxi/sample") 11 | hdfs.ls("taxi/sample") 12 | 13 | hdfs.ls("taxi/sample")$file 14 | -------------------------------------------------------------------------------- /exercises/ex-4-taxi-hadoop.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | rmr.options(backend = "hadoop") 6 | 7 | hdfs.ls("taxi")$file 8 | homeFolder <- file.path("/user", Sys.getenv("USER")) 9 | taxi.hdp <- file.path(homeFolder, "taxi/sample") 10 | 11 | headerInfo <- read.csv("data/dictionary_trip_data.csv", stringsAsFactors = FALSE) 12 | colClasses <- as.character(as.vector(headerInfo[1, ])) 13 | 14 | taxi.format <- make.input.format(format = "csv", sep = ",", 15 | col.names = names(headerInfo), 16 | colClasses = colClasses, 17 | stringsAsFactors = FALSE 18 | ) 19 | 20 | taxi.map <- function(k, v){ 21 | original <- v[[6]] 22 | date <- as.Date(original, origin = "1970-01-01") 23 | wkday <- weekdays(date) 24 | hour <- format(as.POSIXct(original), "%H") 25 | dat <- data.frame(date, hour) 26 | z <- aggregate(date ~ hour, dat, FUN = length) 27 | keyval(z[[1]], z[[2]]) 28 | } 29 | 30 | taxi.reduce <- function(k, v){ 31 | data.frame(hour = k, trips = sum(v), row.names = k) 32 | } 33 | 34 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 35 | map = taxi.map, 36 | reduce = taxi.reduce 37 | ) 38 | 39 | dat <- values(from.dfs(m)) 40 | 41 | library("ggplot2") 42 | p <- ggplot(dat, aes(x = hour, y = trips, group = 1)) + 43 | geom_smooth(method = loess, span = 0.5, 44 | col = "grey50", fill = "yellow") + 45 | geom_line(col = "blue") + 46 | expand_limits(y = 0) + 47 | ggtitle("Sample of taxi trips in New York") 48 | 49 | 50 | p 51 | -------------------------------------------------------------------------------- /hdinsight/r-installer.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | .SYNOPSIS 3 | Install R to HDInsight cluster. 4 | 5 | .DESCRIPTION 6 | This installs R on HDInsight cluster and it runs on YARN. 7 | 8 | .EXAMPLE 9 | .\r-installer-v02.ps1 -RSrc https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/R-3.1.1-win.exe -RmrSrc https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/rmr2_3.1.2.zip -RhdfsSrc https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/rhdfs_1.0.8.zip 10 | #> 11 | 12 | param ( 13 | # The binary executable installer location for R. 
14 | [Parameter()] 15 | [String]$RSrc, 16 | 17 | # The zip file for R MapReduce. 18 | [Parameter()] 19 | [String]$RmrSrc, 20 | 21 | # The zip file for R HDFS. 22 | [Parameter()] 23 | [String]$RhdfsSrc) 24 | 25 | # Use default parameters in case they are not specified. 26 | if (!$RSrc) 27 | { 28 | $RSrc = "https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/R-3.1.1-win.exe"; 29 | } 30 | if (!$RmrSrc) 31 | { 32 | $RmrSrc = "https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/rmr2_3.1.2.zip"; 33 | } 34 | if (!$RhdfsSrc) 35 | { 36 | $RhdfsSrc = "https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/rhdfs_1.0.8.zip"; 37 | } 38 | 39 | # Download config action module from a well-known directory. 40 | $CONFIGACTIONURI = "https://hdiconfigactions.blob.core.windows.net/configactionmodulev02/HDInsightUtilities-v02.psm1"; 41 | $CONFIGACTIONMODULE = "$env:TEMP\HDInsightUtilities.psm1"; 42 | $webclient = New-Object System.Net.WebClient; 43 | $webclient.DownloadFile($CONFIGACTIONURI, $CONFIGACTIONMODULE); 44 | 45 | # (TIP) Import config action helper method module to make writing config action easy. 46 | if (Test-Path ($CONFIGACTIONMODULE)) 47 | { 48 | Import-Module $CONFIGACTIONMODULE; 49 | } 50 | else 51 | { 52 | Write-Output "Failed to load HDInsightUtilities module, exiting ..."; 53 | exit; 54 | } 55 | 56 | # (TIP) Write-HDILog is the way to write to STDOUT and STDERR in HDInsight config action script. 57 | Write-HDILog "Starting R installation at: $(Get-Date)"; 58 | 59 | $rInstallationRoot = (Get-Item "$env:HADOOP_HOME").parent.FullName+'\R\R-3.1.1'; 60 | $rExecutableDir = $rInstallationRoot + '\bin\x64'; 61 | 62 | # (TIP) Test whether the destination file already exists and this makes the script idempotent so it functions properly upon reboot and reimage. 63 | if (Test-Path $rInstallationRoot) 64 | { 65 | Write-HDILog "Destination: $rInstallationRoot already exists, exiting ..."; 66 | exit; 67 | } 68 | 69 | # Install R. 70 | # (TIP) It is always good to download to user temporary location. 71 | $rDest = $env:temp + '\' + [guid]::NewGuid() + '.exe'; 72 | Save-HDIFile -SrcUri $RSrc -DestFile $rDest; 73 | Start-Process -wait $rDest "/COMPONENTS=x64,main,translation /DIR=$rInstallationRoot /SILENT"; 74 | Remove-Item $rDest; 75 | 76 | # Download rmr and rhdfs libraries. 77 | $rmrDest = $env:temp + '\rmr2_3.1.2.zip'; 78 | Save-HDIFile -SrcUri $RmrSrc -DestFile $rmrDest; 79 | $rhdfsDest = $env:temp + '\rhdfs_1.0.8.zip'; 80 | Save-HDIFile -SrcUri $RhdfsSrc -DestFile $rhdfsDest; 81 | 82 | # Install RMR and RHDFS. 83 | [Environment]::SetEnvironmentVariable('PATH', $env:PATH + ';' + $rExecutableDir, 'Process'); 84 | $output = Invoke-HDICmdScript -CmdToExecute "RScript.exe -e ""install.packages(c('XML', 'getopt', 'dplyr', 'RCurl', 'rJava', 'Rcpp', 'RJSONIO', 'bitops', 'digest', 'functional', 'reshape2', 'stringr', 'plyr', 'caTools', 'stringdist', 'R.utils'), repos='http://ftp.heanet.ie/mirrors/cran.r-project.org/')"""; 85 | 86 | Write-HDILog $output; 87 | $output = Invoke-HDICmdScript -CmdToExecute "R.exe CMD INSTALL $rmrDest"; 88 | Write-HDILog $output; 89 | $output = Invoke-HDICmdScript -CmdToExecute "R.exe CMD INSTALL $rhdfsDest"; 90 | Write-HDILog $output; 91 | 92 | # (TIP) Please clean up temporary files when no longer needed. 93 | Remove-Item $rmrDest; 94 | Remove-Item $rhdfsDest; 95 | 96 | # Config environment variables. 
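These machine-level variables are what the R packages look up at load time: rmr2 reads HADOOP_CMD and HADOOP_STREAMING to find the Hadoop launcher and the streaming jar it submits jobs through, rhdfs uses HADOOP_CMD as well, and HDFS_CMD similarly points at the hdfs launcher; PATH is extended so that RScript.exe and the JVM server DLL resolve on every node.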
97 | [Environment]::SetEnvironmentVariable('PATH', $env:PATH + ';' + $rExecutableDir + ';' + $env:JAVA_HOME + '\jre\bin\server', 'Machine'); 98 | [Environment]::SetEnvironmentVariable('HADOOP_CMD', $env:HADOOP_HOME + '\bin\hadoop', 'Machine'); 99 | [Environment]::SetEnvironmentVariable('HDFS_CMD', $env:HADOOP_HOME + '\bin\hdfs', 'Machine'); 100 | [Environment]::SetEnvironmentVariable('HADOOP_STREAMING', (gci ($env:HADOOP_HOME + '\share\hadoop\tools\lib') -filter *streaming* | Select-Object -First 1 | % { $_.FullName }), 'Machine'); 101 | 102 | # Restart nodemanager to pick up environment variable changes. 103 | if (Get-HDIService -ServiceName nodemanager) 104 | { 105 | Restart-Service nodemanager; 106 | } 107 | 108 | Write-HDILog "Done with R installation at: $(Get-Date)"; -------------------------------------------------------------------------------- /hdinsight/remove.ps1: -------------------------------------------------------------------------------- 1 | $ErrorActionPreference = "Stop" 2 | 3 | # Load settings 4 | . .\settings.ps1 5 | 6 | # Delete cluster 7 | Remove-AzureHDInsightCluster -Name $settings.HDInsightClusterName -------------------------------------------------------------------------------- /hdinsight/settings.ps1: -------------------------------------------------------------------------------- 1 | # Use a unique prefix for cluster resources 2 | $prefix = "andrier" 3 | 4 | $settings = New-Object PSObject -Property @{ 5 | # Subscription 6 | SubscriptionName = "Visual Studio Ultimate with MSDN" 7 | 8 | # Storage account 9 | StorageAccountName = "$($prefix)hadooptutorial" 10 | StorageAccountLabel = "RHadoop-tutorial" 11 | StorageAccountLocation = "West Europe" 12 | 13 | # Cluster 14 | HDInsightClusterName = "$($prefix)-r-hadoop-tutorial" 15 | HDInsightContainerName = "$($prefix)-r-hadoop-tutorial-hdfs" 16 | HDInsightUsername = "admin" 17 | HDInsightPassword = "RHadoopTutorial2015!" 18 | HDInsightClusterSizeInNodes = 2 19 | HDInsightHeadNodeVMSize = "Large" 20 | 21 | # Custom version of the script referenced at: 22 | # Install and use R on HDInsight Hadoop clusters 23 | # https://azure.microsoft.com/en-us/documentation/articles/hdinsight-hadoop-r-scripts 24 | RInstallerScriptUri = "https://raw.githubusercontent.com/StanislawSwierc/RHadoop-tutorial/master/hdinsight/r-installer.ps1" 25 | #RInstallerScriptUri = "https://raw.githubusercontent.com/$($user)/RHadoop-tutorial/master/hdinsight/r-installer.ps1" 26 | } 27 | -------------------------------------------------------------------------------- /hdinsight/setup.ps1: -------------------------------------------------------------------------------- 1 | $ErrorActionPreference = "Stop" 2 | 3 | # Load settings 4 | . .\settings.ps1 5 | 6 | # Check if the Azure PowerShell module is available 7 | if (Get-Module -ListAvailable Azure) { 8 | Write-Host "Importing Azure Powershell module" 9 | Import-Module -Name Azure 10 | } else { 11 | throw "Azure module not available.
Please refer to: How to install and configure Azure PowerShell " + 12 | "(https://azure.microsoft.com/en-us/documentation/articles/powershell-install-configure)" 13 | } 14 | 15 | # Make sure the Azure account and its subscriptions are available in the current session 16 | Add-AzureAccount 17 | 18 | # Select subscription in case there are several subscriptions connected to the account 19 | Select-AzureSubscription $settings.SubscriptionName 20 | 21 | # Check if the storage account exists and create it if it does not 22 | if (!(Get-AzureStorageAccount -StorageAccountName $settings.StorageAccountName -ErrorAction SilentlyContinue)) { 23 | Write-Host "Creating new storage account: $($settings.StorageAccountName)" 24 | New-AzureStorageAccount ` 25 | -StorageAccountName $settings.StorageAccountName ` 26 | -Label $settings.StorageAccountLabel ` 27 | -Location $settings.StorageAccountLocation 28 | } 29 | 30 | # Select storage account 31 | Set-AzureSubscription ` 32 | -SubscriptionName $settings.SubscriptionName ` 33 | -CurrentStorageAccountName $settings.StorageAccountName 34 | 35 | # Get the storage account key 36 | $storageAccountKey = (Get-AzureStorageKey $settings.StorageAccountName).Primary 37 | 38 | # Create cluster configuration 39 | $hdinsightConfig = New-AzureHDInsightClusterConfig ` 40 | -HeadNodeVMSize $settings.HDInsightHeadNodeVMSize ` 41 | -ClusterSizeInNodes $settings.HDInsightClusterSizeInNodes | 42 | Set-AzureHDInsightDefaultStorage ` 43 | -StorageAccountName $settings.StorageAccountName ` 44 | -StorageAccountKey $storageAccountKey ` 45 | -StorageContainerName $settings.HDInsightContainerName | 46 | Add-AzureHDInsightScriptAction ` 47 | -Name "Install R" ` 48 | -ClusterRoleCollection HeadNode,DataNode ` 49 | -Uri $settings.RInstallerScriptUri 50 | 51 | # Convert plain text user name and password to PSCredential object 52 | $hdinsightPasswordSecureString = ConvertTo-SecureString -String $settings.HDInsightPassword -AsPlainText -Force 53 | $hdinsightCredential = New-Object -TypeName System.Management.Automation.PSCredential ` 54 | -ArgumentList $settings.HDInsightUsername, $hdinsightPasswordSecureString 55 | 56 | # Create cluster 57 | $hdinsightCluster = New-AzureHDInsightCluster ` 58 | -Name $settings.HDInsightClusterName ` 59 | -Config $hdinsightConfig ` 60 | -Credential $hdinsightCredential ` 61 | -Location $settings.StorageAccountLocation 62 | 63 | # Check cluster state 64 | Get-AzureHDInsightCluster -Name $settings.HDInsightClusterName 65 | 66 | 67 | -------------------------------------------------------------------------------- /hive/1-create-external-table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS taxi_sample; 2 | CREATE EXTERNAL TABLE taxi_sample(medallion STRING, hack_license STRING, vendor_id STRING, rate_code INT, store_and_fwd_flag STRING, pickup_datetime STRING, dropoff_datetime STRING, passenger_count INT, trip_time_in_secs INT, trip_distance FLOAT, pickup_longitude FLOAT, pickup_latitude FLOAT, dropoff_longitude FLOAT, dropoff_latitude FLOAT) ROW FORMAT 3 | DELIMITED FIELDS TERMINATED BY ',' 4 | LINES TERMINATED BY '\n' 5 | STORED AS TEXTFILE 6 | LOCATION '/user/andrie.devries/taxi/sample'; 7 | -------------------------------------------------------------------------------- /hive/2-hive-queries.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # @knitr configure-rhive -------------------------------------------------- 4 | # install.packages("RHive") 5 | library(RHive) 6 |
dirHive <- "/user/hive" 7 | Sys.setenv(HIVE_HOME = "/usr/lib/hive") 8 | rhive.init() 9 | rhive.connect(host = "127.0.0.1", hiveServer2 = TRUE) 10 | 11 | 12 | 13 | 14 | # @knitr row-count -------------------------------------------------------- 15 | query_count <- "SELECT COUNT(*) FROM taxi_sample" 16 | 17 | 18 | # @knitr build-row-count -------------------------------------------------- 19 | name_table <- "taxi_sample" 20 | query_count <- sprintf("SELECT COUNT(*) FROM %s", 21 | name_table) 22 | cat(query_count) 23 | 24 | 25 | 26 | # @knitr run-row-count ---------------------------------------------------- 27 | table_count <- rhive.query(query_count) 28 | head(table_count) 29 | 30 | 31 | 32 | # @knitr define-hour ------------------------------------------------------ 33 | query_hour <- " 34 | SELECT pickup_datetime, substring(pickup_datetime, 12, 2) AS hour 35 | FROM taxi_sample LIMIT 100" 36 | 37 | 38 | 39 | # @knitr build-define-hour ------------------------------------------------ 40 | field_time <- "pickup_datetime" 41 | field_hour <- sprintf("substring(%s, 12, 2)", 42 | field_time) 43 | query_hour <- sprintf( 44 | "SELECT %s, %s AS hour 45 | FROM %s LIMIT 100", 46 | field_time, field_hour, name_table) 47 | cat(query_hour) 48 | 49 | 50 | 51 | # @knitr run-define-hour -------------------------------------------------- 52 | head(rhive.query(query_hour)) 53 | 54 | 55 | 56 | # @knitr count-by-hour ---------------------------------------------------- 57 | query_count <- " 58 | SELECT substring(pickup_datetime, 12, 2) AS hour, COUNT(*) AS count 59 | FROM taxi_sample 60 | GROUP BY substring(pickup_datetime, 12, 2)" 61 | 62 | 63 | # @knitr build-count-by-hour ---------------------------------------------- 64 | query_count <- sprintf( 65 | "SELECT %s AS hour, COUNT(*) AS count 66 | FROM %s 67 | GROUP BY %s", 68 | field_hour, name_table, field_hour) 69 | cat(query_count) 70 | 71 | 72 | # @knitr run-count-by-hour ------------------------------------------------ 73 | head(rhive.query(query_count)) 74 | 75 | 76 | -------------------------------------------------------------------------------- /images/SSCP-matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/SSCP-matrix.png -------------------------------------------------------------------------------- /images/cluster-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/cluster-structure.png -------------------------------------------------------------------------------- /images/dilbert-big-data-in-the-cloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/dilbert-big-data-in-the-cloud.png -------------------------------------------------------------------------------- /images/img-hadoop-logical-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/img-hadoop-logical-flow.png -------------------------------------------------------------------------------- /images/img-rmr2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/img-rmr2.png -------------------------------------------------------------------------------- /images/indeed-job-trend-stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/indeed-job-trend-stats.png -------------------------------------------------------------------------------- /images/jfk-times-square.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/jfk-times-square.png -------------------------------------------------------------------------------- /images/mapreduce-weekdays-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/mapreduce-weekdays-0.png -------------------------------------------------------------------------------- /images/mapreduce-weekdays-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/mapreduce-weekdays-1.png -------------------------------------------------------------------------------- /images/mapreduce-weekdays-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/mapreduce-weekdays-2.png -------------------------------------------------------------------------------- /images/r-for-dummies.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/r-for-dummies.jpg -------------------------------------------------------------------------------- /images/r-machine-learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/r-machine-learning.png -------------------------------------------------------------------------------- /images/taxi-tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/taxi-tweet.png -------------------------------------------------------------------------------- /images/xkcd-my-job-is-compiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/xkcd-my-job-is-compiling.png -------------------------------------------------------------------------------- /images/xkcd-wordcount.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/xkcd-wordcount.png -------------------------------------------------------------------------------- /taxi/taxi-1-upload.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | 
library(rhdfs) 3 | hdfs.init() 4 | 5 | taxifile <- "data/trip_data_1_sample.csv" 6 | file.exists(taxifile) 7 | 8 | list.files("data") 9 | hdfs.ls("data") 10 | 11 | hdfs.ls("taxi") 12 | hdfs.put("data/trip_data_1_sample.csv", "taxi/trip_data_1_sample.csv") 13 | hdfs.ls("taxi") 14 | 15 | # Put taxi data in dfs 16 | hdfs.ls(".") 17 | hdfs.mkdir("taxi") 18 | 19 | hdfs.put(taxifile, file.path("taxi", basename(taxifile))) 20 | hdfs.ls(".") 21 | -------------------------------------------------------------------------------- /taxi/taxi-2-rmr-local.R: -------------------------------------------------------------------------------- 1 | ## @knitr load-packages --------------------------------------------------- 2 | library(rmr2) 3 | rmr.options(backend = "local") 4 | 5 | ## @knitr make.input.format ----------------------------------------------- 6 | taxi.format <- make.input.format("csv", sep = ",", 7 | colClasses = "character", 8 | stringsAsFactors = FALSE 9 | ) 10 | 11 | 12 | 13 | 14 | ## @knitr from.dfs-1 ------------------------------------------------------ 15 | 16 | taxi.hdp <- "data/trip_data_1_sample.csv" 17 | x <- from.dfs(taxi.hdp, format = taxi.format) 18 | str(x) 19 | 20 | 21 | ## @knitr from.dfs-2 ------------------------------------------------------ 22 | 23 | x <- from.dfs(taxi.hdp, format = taxi.format) 24 | head( 25 | values(x) 26 | ) 27 | 28 | ## @knitr make.input.format-with-colnames-1 ------------------------------- 29 | 30 | headerInfo <- read.csv("data/dictionary_trip_data.csv", stringsAsFactors = FALSE) 31 | headerInfo 32 | colClasses <- as.character(as.vector(headerInfo[1, ])) 33 | names(headerInfo) 34 | colClasses 35 | 36 | ## @knitr make.input.format-with-colnames-2 ------------------------------- 37 | 38 | taxi.format <- make.input.format(format = "csv", sep = ",", 39 | col.names = names(headerInfo), 40 | colClasses = colClasses, 41 | stringsAsFactors = FALSE 42 | ) 43 | 44 | x <- from.dfs(taxi.hdp, format = taxi.format) 45 | str(values(x)) 46 | 47 | 48 | ## @knitr mapreduce-1-a --------------------------------------------------- 49 | 50 | m <- mapreduce(taxi.hdp, input.format = taxi.format) 51 | m 52 | m() 53 | 54 | ## @knitr mapreduce-1-b --------------------------------------------------- 55 | m <- mapreduce(taxi.hdp, input.format = taxi.format) 56 | head( 57 | values(from.dfs(m)) 58 | ) 59 | 60 | 61 | 62 | ## @knitr mapreduce-2 ----------------------------------------------------- 63 | taxi.map <- function(k, v){ 64 | original <- v[[6]] 65 | original 66 | } 67 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 68 | map = taxi.map 69 | ) 70 | head( 71 | values(from.dfs(m)) 72 | ) 73 | 74 | ## @knitr mapreduce-3 ----------------------------------------------------- 75 | taxi.map <- function(k, v){ 76 | original <- v[[6]] 77 | date <- as.Date(original, origin = "1970-01-01") 78 | wkday <- weekdays(date) 79 | keyval(wkday, 1) 80 | } 81 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 82 | map = taxi.map 83 | ) 84 | head( 85 | keys(from.dfs(m)), 86 | 20 87 | ) 88 | head( 89 | values(from.dfs(m)), 90 | 20 91 | ) 92 | 93 | ## @knitr mapreduce-4 ----------------------------------------------------- 94 | taxi.map <- function(k, v){ 95 | original <- v[[6]] 96 | date <- as.Date(original, origin = "1970-01-01") 97 | wkday <- weekdays(date) 98 | keyval(wkday, 1) 99 | } 100 | taxi.reduce <- function(k, v){ 101 | keyval(k, sum(v)) 102 | } 103 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 104 | map = taxi.map, 105 | reduce = taxi.reduce 106 | ) 107 | head( 108 | 
keys(from.dfs(m)) 109 | ) 110 | head( 111 | values(from.dfs(m)) 112 | ) 113 | 114 | ## @knitr mapreduce-5 ----------------------------------------------------- 115 | taxi.map <- function(k, v){ 116 | original <- v[[6]] 117 | date <- as.Date(original, origin = "1970-01-01") 118 | wkday <- weekdays(date) 119 | dat <- data.frame(date, wkday) 120 | z <- aggregate(date ~ wkday, dat, FUN = length) 121 | keyval(z[[1]], z[[2]]) 122 | } 123 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 124 | map = taxi.map 125 | ) 126 | keys(from.dfs(m)) 127 | values(from.dfs(m)) 128 | 129 | ## @knitr mapreduce-6 ----------------------------------------------------- 130 | taxi.map <- function(k, v){ 131 | original <- v[[6]] 132 | date <- as.Date(original, origin = "1970-01-01") 133 | wkday <- weekdays(date) 134 | dat <- data.frame(date, wkday) 135 | z <- aggregate(date ~ wkday, dat, FUN = length) 136 | keyval(z[[1]], z[[2]]) 137 | } 138 | taxi.reduce <- function(k, v){ 139 | data.frame(weekday = k, trips = sum(v), row.names = k) 140 | } 141 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 142 | map = taxi.map, 143 | reduce = taxi.reduce 144 | ) 145 | keys(from.dfs(m)) 146 | values(from.dfs(m)) 147 | 148 | ## @knitr mapreduce-7-a --------------------------------------------------- 149 | taxi.map <- function(k, v){ 150 | original <- v[[6]] 151 | date <- as.Date(original, origin = "1970-01-01") 152 | wkday <- weekdays(date) 153 | hour <- format(as.POSIXct(original), "%H") 154 | dat <- data.frame(date, hour) 155 | z <- aggregate(date ~ hour, dat, FUN = length) 156 | keyval(z[[1]], z[[2]]) 157 | } 158 | 159 | taxi.reduce <- function(k, v){ 160 | data.frame(hour = k, trips = sum(v), row.names = k) 161 | } 162 | 163 | ## @knitr mapreduce-7-b --------------------------------------------------- 164 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 165 | map = taxi.map, 166 | reduce = taxi.reduce 167 | ) 168 | keys(from.dfs(m)) 169 | dat <- values(from.dfs(m)) 170 | dat 171 | 172 | ## @knitr mapreduce-7-plot-1 ---------------------------------------------- 173 | 174 | library("ggplot2") 175 | p <- ggplot(dat, aes(x = hour, y = trips, group = 1)) + 176 | geom_smooth(method = loess, span = 0.5, 177 | col = "grey50", fill = "yellow") + 178 | geom_line(col = "blue") + 179 | expand_limits(y = 0) + 180 | ggtitle("Sample of taxi trips in New York") 181 | 182 | 183 | ## @knitr mapreduce-7-plot-2 ---------------------------------------------- 184 | p 185 | -------------------------------------------------------------------------------- /taxi/taxi-2-rmr-lon-lat.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | 3 | 4 | # Define compute context -------------------------------------------------- 5 | 6 | ### local 7 | rmr.options(backend = "local") 8 | taxi.hdp <- "data/trip_data_1_sample.csv" 9 | 10 | ### hadoop 11 | rmr.options(backend = "hadoop") 12 | homeFolder <- file.path("/user", Sys.getenv("USER")) 13 | taxi.hdp <- file.path(homeFolder, "taxi", "sample") 14 | rmr.options(backend.parameters = list( 15 | "mapreduce.map.java.opts=-Xmx800M", "mapreduce.reduce.java.opts=-Xmx800M")) 16 | 17 | 18 | 19 | # Define input format ----------------------------------------------------- 20 | 21 | headerInfo <- read.csv("data/dictionary_trip_data.csv", stringsAsFactors = FALSE) 22 | colClasses <- as.character(as.vector(headerInfo[1, ])) 23 | 24 | taxi.format <- make.input.format(format = "csv", sep = ",", 25 | col.names = names(headerInfo), 26 | colClasses = colClasses, 27 
| stringsAsFactors = FALSE 28 | ) 29 | 30 | 31 | # Helper functions to compute great circle distance ----------------------- 32 | 33 | 34 | # Calculates the geodesic distance between two points specified by 35 | # radian latitude/longitude using the Spherical Law of Cosines (slc) 36 | # Source: http://www.r-bloggers.com/great-circle-distance-calculations-in-r/ 37 | gcd.slc <- function(long1, lat1, long2, lat2) { 38 | R <- 6371 # Earth mean radius [km] 39 | d <- acos(sin(lat1)*sin(lat2) + cos(lat1)*cos(lat2) * cos(long2-long1)) * R 40 | return(d) # Distance in km 41 | } 42 | 43 | # Convert degrees to radians 44 | deg2rad <- function(deg) return(deg*pi/180) 45 | 46 | 47 | # Mapper: compute trip time for trips from JFK to Times Square ------------ 48 | 49 | taxi.map <- function(k, v){ 50 | # browser() 51 | lon <- deg2rad(v$pickup_longitude) 52 | lat <- deg2rad(v$pickup_latitude) 53 | jfk_lon <- deg2rad(-73.779564) 54 | jfk_lat <- deg2rad(40.646908) 55 | distToJfk <- gcd.slc(lon, lat, jfk_lon, jfk_lat) 56 | 57 | lon <- deg2rad(v$dropoff_longitude) 58 | lat <- deg2rad(v$dropoff_latitude) 59 | ts_lon <- deg2rad(-73.985131) 60 | ts_lat <- deg2rad(40.758895) 61 | distToTimesSquare <- gcd.slc(lon, lat, ts_lon, ts_lat) 62 | 63 | original <- v[distToJfk < 1.6 & distToTimesSquare < 1.6, ] 64 | time <- difftime(as.POSIXct(original$dropoff_datetime), as.POSIXct(original$pickup_datetime), units = "secs") 65 | time <- as.numeric(time) 66 | 67 | keep <- time > 600 # 10 minutes artificial threshold - noisy data 68 | original <- original[keep, ] 69 | time <- time[keep] 70 | 71 | if(nrow(original) == 0){ 72 | z <- data.frame(wkday="None", hour="00", 73 | time = matrix(c(time.1=0, time.2=0), nrow=1), 74 | stringsAsFactors = FALSE 75 | ) 76 | } else { 77 | date <- as.Date(original[[6]], origin = "1970-01-01") 78 | wkday <- weekdays(date) 79 | hour <- format(as.POSIXct(original[[6]]), "%H") 80 | # browser() 81 | dat <- data.frame(wkday, hour, time) 82 | z <- aggregate(time ~ wkday + hour, dat, 83 | FUN = function(x)cbind(sum(x), length(x))) 84 | } 85 | keyval(z[, 1:2], z[, 3]) 86 | } 87 | 88 | # Reducer ----------------------------------------------------------------- 89 | 90 | taxi.reduce <- function(k, v){ 91 | # browser() 92 | time <- sum(v[, 1]) 93 | count <- sum(v[, 2]) 94 | cbind(k, duration = time/count / 60) # convert seconds to minutes 95 | } 96 | 97 | 98 | 99 | # Mapreduce --------------------------------------------------------------- 100 | 101 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 102 | map = taxi.map, 103 | reduce = taxi.reduce 104 | ) 105 | keys(from.dfs(m)) 106 | dat <- values(from.dfs(m)) 107 | dat 108 | 109 | 110 | # Plot results ------------------------------------------------------------ 111 | 112 | library("ggplot2") 113 | ggplot(dat, aes(x = hour, y = duration, group = wkday)) + 114 | geom_point(col = "blue") + 115 | geom_line() + 116 | expand_limits(y = 0) + 117 | facet_grid(wkday ~ .)
+ 118 | ggtitle("Sample of taxi trips in New York") 119 | 120 | 121 | -------------------------------------------------------------------------------- /taxi/taxi-rmr-3-hadoop.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | rmr.options(backend = "hadoop") 6 | 7 | hdfs.ls("taxi")$file 8 | # taxi.hdp <- "/user/andrie.devries/taxi/sample" 9 | homeFolder <- file.path("/user", Sys.getenv("USER")) 10 | taxi.hdp <- file.path(homeFolder, "taxi/sample") 11 | 12 | 13 | headerInfo <- read.csv("data/dictionary_trip_data.csv", stringsAsFactors = FALSE) 14 | colClasses <- as.character(as.vector(headerInfo[1, ])) 15 | 16 | taxi.format <- make.input.format(format = "csv", sep = ",", 17 | col.names = names(headerInfo), 18 | colClasses = colClasses, 19 | stringsAsFactors = FALSE 20 | ) 21 | 22 | taxi.map <- function(k, v){ 23 | original <- v[[6]] 24 | date <- as.Date(original, origin = "1970-01-01") 25 | wkday <- weekdays(date) 26 | hour <- format(as.POSIXct(original), "%H") 27 | dat <- data.frame(date, hour) 28 | z <- aggregate(date ~ hour, dat, FUN = length) 29 | keyval(z[[1]], z[[2]]) 30 | } 31 | 32 | taxi.reduce <- function(k, v){ 33 | data.frame(hour = k, trips = sum(v), row.names = k) 34 | } 35 | 36 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 37 | map = taxi.map, 38 | reduce = taxi.reduce 39 | ) 40 | 41 | dat <- values(from.dfs(m)) 42 | 43 | library("ggplot2") 44 | p <- ggplot(dat, aes(x = hour, y = trips, group = 1)) + 45 | geom_smooth(method = loess, span = 0.5, 46 | col = "grey50", fill = "yellow") + 47 | geom_line(col = "blue") + 48 | expand_limits(y = 0) + 49 | ggtitle("Sample of taxi trips in New York") 50 | 51 | 52 | p 53 | -------------------------------------------------------------------------------- /test.txt: -------------------------------------------------------------------------------- 1 | test 2 | -------------------------------------------------------------------------------- /utils/put-taxi-data-to-dfs.R: -------------------------------------------------------------------------------- 1 | ## @knitr rhdfs ----------------------------------------------------------- 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | localFiles <- dir("data", pattern = "_sample.csv", full.names = TRUE) 6 | localFiles 7 | hdfs.mkdir("taxi") 8 | hdfs.put(localFiles, "taxi") 9 | hdfs.ls("taxi") 10 | 11 | hdfs.ls("taxi")$file 12 | -------------------------------------------------------------------------------- /utils/sample-taxi-data.R: -------------------------------------------------------------------------------- 1 | infolder <- "C:/Users/adevries/downloads/Taxi/Foil2013" 2 | outfolder <- "C:/Users/adevries/documents/github/RHadoop-tutorial/RHadoop-tutorial/data" 3 | 4 | zips <- list.files(infolder, pattern = "trip_data.*", full.names = TRUE) 5 | 6 | downSample <- function(infile, outfile, n = 1000, keepHeader = FALSE){ 7 | message(basename(outfile)) 8 | con <- file(infile, open = "r") 9 | conout <- file(outfile, open = "a") 10 | 11 | on.exit({ 12 | close(con) 13 | close(conout) 14 | }) 15 | 16 | if(keepHeader){ 17 | header <- readLines(con, n = 1) 18 | writeLines(header, con = conout) 19 | } 20 | 21 | eof <- FALSE 22 | i <- 0 23 | while(!eof){ 24 | dat <- readLines(con, n = n) 25 | if(length(dat) != n) eof <- TRUE 26 | if(length(dat) == 0) break; keep <- sample(dat, 1) # guard: sample() fails on an empty chunk 27 | writeLines(keep, con = conout) 28 | i <- i + 1 29 | if (i %% n == 0) message(i/n) 30 | } 31 | 32 | } 33 | 34 | #
------------------------------------------------------------------------ 35 | 36 | 37 | for(infile in zips){ 38 | outfile <- file.path(outfolder, gsub("\\.csv$", "_sample.csv", basename(infile))) 39 | downSample(infile, outfile) 40 | } 41 | 42 | # file.remove(outfile) 43 | 44 | 45 | dat <- readLines(outfile) 46 | length(dat) 47 | head(dat) 48 | tail(dat) 49 | 50 | --------------------------------------------------------------------------------