├── .gitignore ├── 1-Using-R-with-Hadoop.Rpres ├── 1-Using-R-with-Hadoop.html ├── 2-Taxi-analysis-with-RHadoop.Rpres ├── 2-Taxi-analysis-with-RHadoop.html ├── 4-Computing-on-distributed-matrices.Rpres ├── 4-Computing-on-distributed-matrices.html ├── 5-hive.Rpres ├── 5-hive.html ├── LICENSE ├── README.md ├── css ├── custom.css ├── mslogo.png └── title-background.png ├── data ├── dictionary_trip_data.csv ├── trip_data_10_sample.csv ├── trip_data_11_sample.csv ├── trip_data_12_sample.csv ├── trip_data_1_sample.csv ├── trip_data_2_sample.csv ├── trip_data_3_sample.csv ├── trip_data_4_sample.csv ├── trip_data_5_sample.csv ├── trip_data_6_sample.csv ├── trip_data_7_sample.csv ├── trip_data_8_sample.csv ├── trip_data_9_sample.csv └── ullyses.txt ├── demo ├── 01-intro-lapply.R ├── 02-intro-tapply.R ├── 03-download-ebook-to-hdfs.R ├── 04-wordcount-1-algorithm.R ├── 05-wordcount-2-mapreduce.R ├── 06-logistic-regression-iris.R ├── 07-logistic-regression.R ├── 08-logistic-regression-mapreduce.R ├── 09-linear-regression.R └── 10-kmeans.R ├── exercises ├── ex-1-lapply.R ├── ex-2-taxi-local.R ├── ex-3-put-taxi-data-to-dfs.R └── ex-4-taxi-hadoop.R ├── hdinsight ├── r-installer.ps1 ├── remove.ps1 ├── settings.ps1 └── setup.ps1 ├── hive ├── 1-create-external-table.sql └── 2-hive-queries.R ├── images ├── SSCP-matrix.png ├── cluster-structure.png ├── dilbert-big-data-in-the-cloud.png ├── img-hadoop-logical-flow.png ├── img-rmr2.png ├── indeed-job-trend-stats.png ├── jfk-times-square.png ├── mapreduce-weekdays-0.png ├── mapreduce-weekdays-1.png ├── mapreduce-weekdays-2.png ├── r-for-dummies.jpg ├── r-machine-learning.png ├── taxi-tweet.png ├── xkcd-my-job-is-compiling.png └── xkcd-wordcount.png ├── taxi ├── taxi-1-upload.R ├── taxi-2-rmr-local.R ├── taxi-2-rmr-lon-lat.R └── taxi-rmr-3-hadoop.R ├── test.txt └── utils ├── put-taxi-data-to-dfs.R └── sample-taxi-data.R /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Example code in package build process 6 | *-Ex.R 7 | 8 | # RStudio files 9 | .Rproj.user/ 10 | 11 | # produced vignettes 12 | vignettes/*.html 13 | vignettes/*.pdf 14 | 15 | data/trip_data_1.csv 16 | 17 | # knitr artefacts 18 | *.md 19 | *-cache/* 20 | *-figure/* 21 | *_files/* 22 | 23 | 24 | # gh-pages artefacts 25 | .ssas-cache/* 26 | _site/* 27 | .Rproj.user 28 | 29 | 30 | 9-old-demo* -------------------------------------------------------------------------------- /1-Using-R-with-Hadoop.Rpres: -------------------------------------------------------------------------------- 1 | Introduction to Using R with Hadoop 2 | ======================================================== 3 | author: Andrie de Vries & Michele Usuelli 4 | date: 2015-07-01, UseR!2015 5 | width: 1680 6 | height: 1050 7 | css: css/custom.css 8 | 9 | ```{r include=FALSE} 10 | knitr::opts_chunk$set(cache=TRUE) 11 | ``` 12 | 13 | 14 | About us 15 | ======================================================== 16 | 17 | Andrie de Vries 18 | - Programme Manager, Community projects (Microsoft) 19 | - Set up an independent market research firm in 2009 20 | - Joined Revolution Analytics in 2013 21 | - Author of `ggdendro`, `checkpoint` and `miniCRAN` packages on CRAN 22 | - Co-author of *R for Dummies* 23 | 24 | ![](images/r-for-dummies.jpg) 25 | 26 | *** 27 | 28 | Michele Usuelli 29 | - Data Scientist (Microsoft) 30 | - Joined Revolution Analytics in 2014 31 | - Author of *R Machine Learning Essentials* 32 | 33 | 
![](images/r-machine-learning.png)
34 | 
35 | 
36 | 
37 | Connecting to Azure with your browser
38 | ========================================================
39 | 
40 | http://ra-ldn-cluster-master-02.cloudapp.net:8787
41 | 
42 | You should already have received individual login details
43 | 
44 | Why Hadoop?
45 | ========================================================
46 | 
47 | ![](images/indeed-job-trend-stats.png)
48 | 
49 | Source: http://www.indeed.com/jobtrends?q=HTML5%2C+hadoop%2C+SAS&l=
50 | 
51 | Hype central
52 | ========================================================
53 | ![](images/dilbert-big-data-in-the-cloud.png)
54 | 
55 | 
56 | Is your problem big enough for Hadoop?
57 | ================================================
58 | 
59 | When to use Hadoop?
60 | * Conventional processing tools won’t work on your data
61 | * Your data is really BIG
62 |   - Won’t fit/process in your favourite database or file-system
63 | * Your data is really diverse!
64 |   - Semi-structured – JSON, XML, Logs, Images, Sounds
65 | * You’re a whiz at programming
66 | 
67 | ***
68 | When not to use Hadoop?
69 | * !(When to use Hadoop?)
70 | * You’re in a hurry!
71 | 
72 | 
73 | My job is reducing
74 | ==================
75 | ![](images/xkcd-my-job-is-compiling.png)
76 | 
77 | 
78 | 
79 | Some important components
80 | =====================
81 | 
82 | * HDFS
83 |   - distributed file system
84 |   - `rhdfs`
85 | * mapreduce
86 |   - task manager
87 |   - `rmr2`
88 | * hbase
89 |   - NoSQL database
90 |   - `rhbase`
91 | * hive
92 |   - SQL-like database
93 |   - `RHive`
94 | 
95 | 
96 | The Azure cluster
97 | ==========
98 | 
99 | You are using a HortonWorks Hadoop cluster, provisioned in the Microsoft Azure cloud
100 | 
101 | ![](images/cluster-structure.png)
102 | 
103 | MapReduce
104 | ============
105 | type: section
106 | 
107 | MapReduce
108 | =========
109 | 
110 | A programming abstraction
111 | * Applies to many types of big data calculation
112 | 
113 | Hides messy implementation detail in a library
114 | * Implicit parallelisation
115 | * Load balancing
116 | * Reduced data movement
117 | * Robust job / machine failure management
118 | 
119 | ~~You as the programmer don't need to think about this (too much)~~
120 | 
121 | MapReduce solves a generic problem
122 | ==================================
123 | 
124 | * Read a large amount of data
125 | * MAP
126 |   * Extract a summary from each record / block
127 | * Shuffle and sort
128 | * REDUCE
129 |   * Aggregate, filter, transform
130 | 
131 | 
132 | The problem outline is generic – simply implement map and reduce to solve the problem at hand
133 | 
134 | ![](images/img-hadoop-logical-flow.png)
135 | 
136 | 
137 | 
138 | 
139 | The hadoop mapreduce magic
140 | ==========================
141 | type: alert
142 | 
143 | So how does Hadoop do its magic?
144 | 
145 | **Remember: the key is key**
146 | 
147 | The Hadoop promise:
148 | 
149 | **Hadoop guarantees that records with the same key will be processed by the same reducer**
150 | 
151 | This magic happens during the shuffle and sort phase
152 | 
153 | **During shuffle and sort, all items with the same key get moved to the same node**
154 | 
155 | ![](images/img-hadoop-logical-flow.png)
156 | 
157 | 
158 | rmr2
159 | ============
160 | type: section
161 | 
162 | rmr2
163 | ============
164 | 
165 | The `rmr2` package allows you to write R code in the mapper and reducer, without having to know anything about Java.
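As a first taste, here is a minimal sketch of the pattern, run on the in-memory local backend (it is the same example as `demo/01-intro-lapply.R` in this repository):

```{r, eval=FALSE}
library(rmr2)
rmr.options(backend = "local")        # run in memory while developing

small.ints <- to.dfs(1:1000)          # push a small vector into dfs
a <- mapreduce(
  input = small.ints,
  map   = function(k, v) cbind(v, v^2)  # plain R code in the mapper
)
head(values(from.dfs(a)))             # pull the result back into the R session
```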
166 | 
167 | ![](images/img-rmr2.png)
168 | 
169 | 
170 | MapReduce in R pseudo-code
171 | ==========================
172 | 
173 | In the mapper, `v` is available as data – no need for an explicit read statement
174 | 
175 | ```{r, eval=FALSE}
176 | mapper <- function(k, v){
177 |   ...
178 |   keyval(k', v')
179 | }
180 | ```
181 | 
182 | In the reducer, all `v'` with the same `k'` are processed together
183 | 
184 | ```{r, eval=FALSE}
185 | reducer <- function(k', v'){
186 |   ...
187 |   keyval(k'', v'')
188 | }
189 | ```
190 | 
191 | Pass these functions to `mapreduce()`:
192 | 
193 | ```{r, eval=FALSE}
194 | mapreduce(input,
195 |           map = mapper,
196 |           reduce = reducer,
197 |           ...)
198 | ```
199 | 
200 | Testing using the local backend
201 | ===============================
202 | 
203 | Local backend
204 | 
205 | ```{r, eval=FALSE}
206 | rmr.options(backend = "local")
207 | ```
208 | 
209 | * The `rmr2` package has a "local" backend, implemented completely in R
210 | * Useful for development, testing and debugging
211 | * **Since computation runs entirely in memory, on small data, it's fast!**
212 | * This allows easy (and fast) testing before scaling to the "hadoop" backend
213 | 
214 | ***
215 | 
216 | Hadoop backend
217 | 
218 | ```{r, eval=FALSE}
219 | rmr.options(backend = "hadoop")
220 | ```
221 | 
222 | * Switch to this backend once your code works locally
223 | * Computation distributed to hdfs and mapreduce
224 | * Incurs the Hadoop computation overhead
225 | 
226 | 
227 | Sending data R <---> Hadoop
228 | ============================
229 | 
230 | * In a real Hadoop context, your data will be ingested into dfs by dedicated tools, e.g. Sqoop or Flume
231 | * For easy testing and development, the `rmr2` package has two convenience functions that allow you to move data between the R session and dfs
232 | 
233 | ```{r, eval=FALSE}
234 | to.dfs()
235 | from.dfs()
236 | ```
237 | 
238 | Using the mapreduce() function
239 | ==============================
240 | 
241 | ```{r, eval=FALSE}
242 | m <- mapreduce(input,
243 |                input.format,
244 |                map,
245 |                reduce,
246 |                output = NULL, ...)
247 | ```
248 | 
249 | * Specify the `input`, `map` and `reduce` functions
250 | * Optionally, specify `output`, to persist the result in hdfs
251 | * If `output = NULL`, `mapreduce()` returns a temporary `big.data.object`
252 | * A `big.data.object` is a pointer to a temporary file in dfs
253 | 
254 | If you know that the resulting object is small, use `from.dfs()` to return data from dfs into your R session
255 | 
256 | ```{r, eval=FALSE}
257 | m()          # returns the file location of the big.data.object
258 | from.dfs(m)  # available in R session
259 | ```
260 | 
261 | Using the key-value pair
262 | ========================
263 | 
264 | Everything in hadoop uses a key-value pair. (~~Remember: the key is key!~~)
265 | 
266 | Use `keyval()` to create the key-value pair inside your mapper and reducer.
267 | 
268 | ```{r, eval=FALSE}
269 | mapper <- function(k, v){
270 |   ...
271 |   keyval(k', v')
272 | }
273 | reducer <- function(k, v){
274 |   ...
275 |   keyval(k', v')
276 | }
277 | ```
278 | 
279 | 
280 | Use the helper functions `keys()` and `values()` to separate the components of a `big.data.object`
281 | 
282 | ```{r, eval=FALSE}
283 | m <- mapreduce(input, map, reduce, ...)
284 | x <- from.dfs(m)  # available in R session
285 | 
286 | keys(x)
287 | values(x)
288 | ```
289 | 
290 | Cheat sheet for a mapreduce flow using rmr2
291 | =====================
292 | 
293 | Optional: get a sample of data to dfs:
294 | ```{r, eval=FALSE}
295 | hdp.file <- to.dfs(...)
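# e.g. (illustrative only): push a sample of the taxi data into dfs
# taxi.sample <- to.dfs(read.csv("data/trip_data_1_sample.csv", nrows = 1000))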
296 | ``` 297 | 298 | Mapreduce: 299 | 300 | ```{r, eval=FALSE} 301 | map.fun <- function(k, v){...; keyval(k', v')} 302 | reduce.fun <- function(k, v){...; keyval(k', v')} 303 | 304 | m <- mapreduce(input, 305 | map = map.fun, 306 | reduce = reduce.fun, 307 | ... 308 | ) 309 | ``` 310 | 311 | Inspect results 312 | 313 | ```{r, eval=FALSE} 314 | x <- from.dfs(m) 315 | 316 | keys(x) 317 | values(x) 318 | ``` 319 | 320 | 321 | 322 | 323 | 324 | End 325 | === 326 | type: section 327 | 328 | Thank you. 329 | -------------------------------------------------------------------------------- /2-Taxi-analysis-with-RHadoop.Rpres: -------------------------------------------------------------------------------- 1 | Analysing New York taxis with RHadoop 2 | ======================================================== 3 | author: Andrie de Vries & Michele Usuelli 4 | date: 2015-07-01, UseR!2015 5 | width: 1680 6 | height: 1050 7 | css: css/custom.css 8 | 9 | 10 | Motivation: New York taxi data 11 | ============= 12 | 13 | An inconveniently sized data set (~200GB uncompressed CSV). 14 | 15 | Contains information about every single taxi trip in New York over a 4-year period. 16 | 17 | *** 18 | 19 | ![](images/taxi-tweet.png) 20 | 21 | Source: http://chriswhong.com/open-data/foil_nyc_taxi/ 22 | 23 | Introduction to the taxi data 24 | ==== 25 | type: section 26 | 27 | ```{r include=FALSE} 28 | knitr::opts_chunk$set(cache = FALSE) 29 | ``` 30 | 31 | 32 | Introduction to the taxi data 33 | ==== 34 | 35 | The data is at http://publish.illinois.edu/dbwork/open-data/ 36 | 37 | Previous analysis published at 38 | 39 | > Brian Donovan and Daniel B. Work. “Using coarse GPS data to quantify city-scale transportation system resilience to extreme events.” to appear, Transportation Research Board 94th Annual Meeting, August 2014. [preprint](https://www.dropbox.com/s/deruyszudfqrll0/TRB15DonovanWork.pdf), [source code](https://github.com/UIUC-Transportation-Data/gpsresilience). 40 | 41 | 42 | Data dictionary 43 | =============== 44 | 45 | Field | Description 46 | ----- | ----------- 47 | medallion | a permit to operate a yellow taxi cab in New York City, it is effectively a (randomly assigned) car ID. See also medallions. 48 | hack_license | a license to drive the vehicle, it is effectively a (randomly assigned) driver ID. See also hack license. 49 | vendor_id | e.g., Verifone Transportation Systems (VTS), or Mobile Knowledge Systems Inc (CMT), implemented as part of the Technology Passenger Enhancements Project. 50 | rate_code | taximeter rate, see NYCT&L description. 51 | store_and_fwd_flag | unknown attribute. 52 | pickup_datetime | start time of the trip, mm-dd-yyyy hh24:mm:ss EDT. 53 | dropoff_datetime | end time of the trip, mm-dd-yyyy hh24:mm:ss EDT. 54 | passenger_count | number of passengers on the trip, default value is one. 55 | trip_time_in_secs | trip time measured by the taximeter in seconds. 56 | trip_distance | trip distance measured by the taximeter in miles. 57 | pickup_longitude and pickup_latitude | GPS coordinates at the start of the trip. 58 | dropoff_longitude and dropoff_latitude | GPS coordinates at the end of the trip. 
59 | 
60 | Source: http://publish.illinois.edu/dbwork/open-data/
61 | 
62 | Introduction to mapreduce()
63 | ==========
64 | type: section
65 | 
66 | 
67 | Using a local context in rmr2
68 | ======
69 | 
70 | ```{r taxi-2-rmr-local, cache=FALSE, include=FALSE}
71 | read_chunk("taxi/taxi-2-rmr-local.R")
72 | ```
73 | 
74 | ```{r load-packages}
75 | ```
76 | 
77 | 
78 | Exercise 1
79 | ==========
80 | type: cobalt
81 | 
82 | * Open the folder `exercises`
83 | * Open the file `ex-1-lapply.R`
84 | * Source the script
85 | 
86 | You are looking at a simple `mapreduce()` job that simulates `lapply()` in base R.
87 | 
88 | 
89 | make.input.format()
90 | ======
91 | 
92 | The function `make.input.format()` allows you to specify the attributes of your input data.
93 | 
94 | The argument `format = "csv"` specifies a csv file. This is a wrapper around `read.table()`.
95 | 
96 | ```{r make.input.format}
97 | ```
98 | 
99 | from.dfs()
100 | ======
101 | 
102 | Use `from.dfs()` to get a (small) file from dfs into local memory
103 | 
104 | ```{r from.dfs-1}
105 | ```
106 | 
107 | from.dfs()
108 | ======
109 | 
110 | Use `keys()` and `values()` to extract the components
111 | 
112 | ```{r from.dfs-2}
113 | ```
114 | 
115 | Why do the columns not have labels?
116 | ======
117 | type: alert
118 | 
119 | Remember: hdfs splits the individual files across nodes
120 | 
121 | Implications:
122 | * The chunk that's available to your mapper may not contain the header row
123 | * Thus, in general, csv files should not have a header at all!
124 | 
125 | The solution:
126 | * Make a custom input format, specifying the column names
127 | * (in the same way you specify `col.names` to `read.table()`)
128 | 
129 | Make an input format
130 | ======
131 | 
132 | The file `data/dictionary_trip_data.csv` (in the Linux file system) contains a data dictionary:
133 | 
134 | ```{r make.input.format-with-colnames-1}
135 | ```
136 | 
137 | Use this information to configure the input format
138 | 
139 | Make an input format
140 | ======
141 | 
142 | Use the data dictionary information to configure the input format:
143 | 
144 | 
145 | ```{r make.input.format-with-colnames-2}
146 | ```
147 | 
148 | 
149 | mapreduce() without any transformation
150 | ======
151 | 
152 | ```{r mapreduce-1-a}
153 | ```
154 | 
155 | mapreduce() without any transformation
156 | ======
157 | 
158 | ```{r mapreduce-1-b}
159 | ```
160 | 
161 | mapreduce() with a simpler mapper
162 | ======
163 | 
164 | ```{r mapreduce-2}
165 | ```
166 | 
167 | mapreduce() with a simpler mapper
168 | ======
169 | 
170 | ![](images/mapreduce-weekdays-0.png)
171 | 
172 | 
173 | mapreduce() emitting a key-value pair
174 | ======
175 | 
176 | ```{r mapreduce-3}
177 | ```
178 | 
179 | mapreduce() with a reducer
180 | ======
181 | 
182 | ```{r mapreduce-4}
183 | ```
184 | 
185 | mapreduce() with a reducer
186 | ======
187 | 
188 | ![](images/mapreduce-weekdays-1.png)
189 | 
190 | Can we write a more sensible mapper and reducer?
191 | ======
192 | 
193 | ![](images/mapreduce-weekdays-2.png)
194 | 
195 | mapreduce() with a more sensible mapper
196 | ======
197 | 
198 | ```{r mapreduce-5}
199 | ```
200 | 
201 | mapreduce() the final step
202 | ======
203 | 
204 | ```{r mapreduce-6}
205 | ```
206 | 
207 | Exercise 2
208 | ==========
209 | type: cobalt
210 | 
211 | * Open the folder `exercises`
212 | * Open the file `ex-2-taxi-local.R`
213 | * Source the script
214 | 
215 | This file contains a short `mapreduce()` job on the taxi data.
216 | 
217 | Run the code, section by section.
218 | 
219 | Ensure you understand what happens.
220 | 
221 | Try to compute the mean number of passengers.
222 | 
223 | Hint:
224 | 
225 | * The mapper should compute the sum and count of passengers
226 | * The reducer should compute `mean <- sum / count`
227 | 
228 | (A sketch of one possible solution follows below.)
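A sketch of one possible solution (it assumes the `taxi.format` input format defined in `ex-2-taxi-local.R`; the details are illustrative, not the only way to do it):

```{r, eval=FALSE}
m <- mapreduce(
  input = "data/trip_data_1_sample.csv",
  input.format = taxi.format,
  map = function(k, v){
    # emit this chunk's sum and count of passengers under a single key
    keyval(1, cbind(sum = sum(v$passenger_count), count = nrow(v)))
  },
  reduce = function(k, v){
    # combine the per-chunk totals, then mean <- sum / count
    keyval(k, sum(v[, "sum"]) / sum(v[, "count"]))
  }
)
values(from.dfs(m))
```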
229 | Analysing by hour
230 | =================
231 | type: section
232 | 
233 | Doing the analysis by hour
234 | ==========================
235 | 
236 | ```{r mapreduce-7-a}
237 | ```
238 | 
239 | Doing the analysis by hour
240 | ==========================
241 | 
242 | ```{r mapreduce-7-b}
243 | ```
244 | 
245 | 
246 | Plotting the results
247 | ====================
248 | 
249 | ```{r mapreduce-7-plot-1, out.width="1500px", out.height="1000px"}
250 | ```
251 | 
252 | Plotting the results
253 | ====================
254 | 
255 | ```{r mapreduce-7-plot-2, echo=FALSE, out.width="1500px", out.height="1000px"}
256 | ```
257 | 
258 | 
259 | Putting files in hdfs with rhdfs
260 | ===============
261 | type: section
262 | 
263 | rhdfs function overview
264 | =======================
265 | 
266 | * Initialize
267 |   - `hdfs.init()`
268 |   - `hdfs.defaults()`
269 | * File and directory manipulation
270 |   - `hdfs.ls()`
271 |   - `hdfs.delete()`
272 |   - `hdfs.mkdir()`
273 |   - `hdfs.exists()`
274 | * Copy and move from local <-> HDFS
275 |   - `hdfs.put()`
276 |   - `hdfs.get()`
277 | 
278 | ***
279 | 
280 | * Manipulate files within HDFS
281 |   - `hdfs.copy()`
282 |   - `hdfs.move()`
283 |   - `hdfs.rename()`
284 | * Reading files directly from HDFS
285 |   - `hdfs.file()`
286 |   - `hdfs.read()`
287 |   - `hdfs.write()`
288 |   - `hdfs.flush()`
289 |   - `hdfs.seek()`
290 |   - `hdfs.tell(con)`
291 |   - `hdfs.close()`
292 |   - `hdfs.line.reader()`
293 |   - `hdfs.read.text.file()`
294 | 
295 | 
296 | Putting the data from the local file system to dfs
297 | ============
298 | 
299 | You use the `rhdfs` package to manipulate files in the hadoop dfs.
300 | 
301 | ```{r put-taxi-in-dfs, cache=FALSE, include=FALSE}
302 | read_chunk("utils/put-taxi-data-to-dfs.R")
303 | ```
304 | 
305 | ```{r rhdfs, eval=FALSE}
306 | ```
307 | 
308 | 
309 | Exercise 3
310 | ==========
311 | type: cobalt
312 | 
313 | * Open the folder `exercises`
314 | * Open the file `ex-3-put-taxi-data-to-dfs.R`
315 | * Source the script
316 | 
317 | Use the `rhdfs` functions to satisfy yourself that the files are in hadoop.
318 | 
319 | Hint: Use `hdfs.ls()`
320 | 
321 | Running the script in Hadoop compute context
322 | =================
323 | type: section
324 | 
325 | 
326 | Running the script in Hadoop compute context
327 | =================
328 | 
329 | Once you've tested your script in the local context, it is generally very easy to deploy on all your data.
330 | 
331 | ```{r, eval=FALSE}
332 | rmr.options(backend = "hadoop")
333 | ```
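In outline, scaling up is just a backend switch followed by the same `mapreduce()` call you tested locally. A sketch of the pattern (the dfs input path and the `taxi.format` object are assumptions carried over from the earlier slides):

```{r, eval=FALSE}
rmr.options(backend = "hadoop")                    # same code, now distributed
m <- mapreduce(input = "/user/share/taxi/sample",  # a dfs path, not a local file
               input.format = taxi.format,
               map = map.fun,                      # the functions you tested locally
               reduce = reduce.fun)
from.dfs(m)                                        # small results back into the R session
```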
334 | 
335 | What the hadoop output means
336 | ============
337 | 
338 | ```
339 | packageJobJar: [] [/usr/lib/hadoop-mapreduce/hadoop-streaming-2.4.0.2.1.5.0-695.jar] /tmp/streamjob4577673905010649130.jar tmpDir=null
340 | 15/06/23 14:18:01 INFO impl.TimelineClientImpl: Timeline service address: http://0.0.0.0:8188/ws/v1/timeline/
341 | 15/06/23 14:18:01 INFO client.RMProxy: Connecting to ResourceManager at ra-ldn-cluster-master-02.cloudapp.net/172.16.0.5:8050
342 | 15/06/23 14:18:02 INFO impl.TimelineClientImpl: Timeline service address: http://0.0.0.0:8188/ws/v1/timeline/
343 | ....
344 | 15/06/23 14:18:17 INFO mapreduce.Job: Job job_1434365936669_0041 running in uber mode : false
345 | 15/06/23 14:18:17 INFO mapreduce.Job:  map 0% reduce 0%
346 | 15/06/23 14:18:31 INFO mapreduce.Job:  map 1% reduce 0%
347 | 15/06/23 14:18:34 INFO mapreduce.Job:  map 17% reduce 0%
348 | 15/06/23 14:18:40 INFO mapreduce.Job:  map 19% reduce 0%
349 | 15/06/23 14:18:41 INFO mapreduce.Job:  map 21% reduce 0%
350 | 15/06/23 14:18:47 INFO mapreduce.Job:  map 37% reduce 0%
351 | 15/06/23 14:18:48 INFO mapreduce.Job:  map 92% reduce 0%
352 | 15/06/23 14:18:49 INFO mapreduce.Job:  map 92% reduce 19%
353 | 15/06/23 14:18:50 INFO mapreduce.Job:  map 100% reduce 19%
354 | 15/06/23 14:18:52 INFO mapreduce.Job:  map 100% reduce 100%
355 | 15/06/23 14:19:00 INFO mapreduce.Job: Job job_1434365936669_0041 completed successfully
356 | ...
357 | rmr
358 | reduce calls=24
359 | 15/06/23 14:19:01 INFO streaming.StreamJob: Output directory: /tmp/file55a346591243
360 | ```
361 | 
362 | Exercise 4
363 | ==========
364 | type: cobalt
365 | 
366 | * Open the folder `exercises`
367 | * Open the file `ex-4-taxi-hadoop.R`
368 | * Source the script
369 | 
370 | 
371 | 
372 | Summary
373 | =========
374 | type: section
375 | 
376 | 
377 | Summary
378 | =========
379 | 
380 | In this session you:
381 | 
382 | * Used an Azure cluster running HortonWorks
383 | * Developed scripts in local context with the `rmr2` package:
384 |   - Wrote mappers
385 |   - Wrote reducers
386 |   - Used `mapreduce()`
387 | * Returned data to your local R session, for plotting
388 | * Uploaded files into hdfs using `rhdfs`
389 | * Ran the script directly in hadoop, using the hadoop context
390 | 
391 | Questions
392 | =========
393 | type: section
394 | 
395 | 
396 | Questions
397 | =========
398 | 
399 | 
400 | ?
401 | ?
402 | ?
403 | 
404 | 
405 | 
406 | End
407 | ===
408 | type: section
409 | 
410 | Thank you.
411 | -------------------------------------------------------------------------------- /4-Computing-on-distributed-matrices.Rpres: --------------------------------------------------------------------------------
1 | Computing on distributed matrices
2 | ========================================================
3 | author: Andrie de Vries & Michele Usuelli
4 | date: 2015-07-01, UseR!2015
5 | width: 1680
6 | height: 1050
7 | css: css/custom.css
8 | 
9 | 
10 | Linear regression
11 | =============
12 | 
13 | $$Y = X \beta + e$$
14 | 
15 | To solve for $\beta$, create a loss function, differentiate and solve for a minimum:
16 | 
17 | $$\beta = (X'X)^{-1} X'Y$$
18 | 
19 | Re-arrange:
20 | 
21 | $$(X'X) \beta = X'Y$$
22 | 
23 | The R function `b <- solve(x, y)` gives the solution to $xb = y$
24 | 
25 | ```{r, eval=FALSE}
26 | b <- solve(t(x) %*% x, t(x) %*% y)
27 | ```
28 | 
29 | The math of distributed computing
30 | =================================
31 | 
32 | * Commutative property
33 | 
34 | $$a + b == b + a$$
35 | 
36 | * Associative property
37 | 
38 | $$(a + b) + c == a + (b + c)$$
39 | 
40 | * Distributive property
41 | 
42 | $$a * (b + c) == (a * b) + (a * c)$$
43 | 
44 | 
45 | 
46 | Computing X'X is a distributive operation
47 | =============
48 | 
49 | ![](images/SSCP-matrix.png)
50 | 
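Why this matters: if the rows of $X$ are split across nodes into blocks $X_1, \dots, X_m$ (with matching blocks $Y_1, \dots, Y_m$), the cross-products decompose as

$$X'X = \sum_{i=1}^{m} X_i'X_i \qquad X'Y = \sum_{i=1}^{m} X_i'Y_i$$

so each mapper can compute a small cross-product on its own block, and a single reducer simply sums the pieces. This is exactly the pattern of the `XtX` and `XtY` chunks that follow.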
51 | Regression
52 | ==========
53 | 
54 | ```{r taxi-2-rmr-local, cache=FALSE, include=FALSE}
55 | read_chunk("demo/09-linear-regression.R")
56 | ```
57 | 
58 | ```{r load-packages}
59 | ```
60 | 
61 | Regression
62 | ==========
63 | 
64 | ```{r generate-data}
65 | ```
66 | 
67 | Regression
68 | ==========
69 | 
70 | ```{r sum-function}
71 | ```
72 | 
73 | Regression
74 | ==========
75 | 
76 | ```{r XtX}
77 | ```
78 | 
79 | Regression
80 | ==========
81 | 
82 | ```{r XtY}
83 | ```
84 | 
85 | Regression
86 | ==========
87 | 
88 | ```{r solve}
89 | ```
90 | -------------------------------------------------------------------------------- /5-hive.Rpres: --------------------------------------------------------------------------------
1 | Analysing New York taxis with RHive
2 | ========================================================
3 | author: Andrie de Vries & Michele Usuelli
4 | date: 2015-07-01, UseR!2015
5 | width: 1680
6 | height: 1050
7 | css: css/custom.css
8 | 
9 | 
10 | 
11 | Introduction to Hive
12 | ====
13 | 
14 | - Hadoop data warehouse
15 | - SQL-like interface
16 | - Language: HQL (Hive Query Language)
17 | - HQL translated into MapReduce jobs
18 | - R interface: RHive, RevoScaleR
19 | - Hue web interface
20 | 
21 | 
22 | 
23 | Using Hive with R
24 | ====
25 | 
26 | - Building queries using string manipulation tools like `sprintf` and `paste`
27 | - Running queries using RHive
28 | 
29 | 
30 | Syntax difference between Hive and RHive
31 | ====
32 | type: alert
33 | 
34 | - Hive: semi-colon at the end
35 | 
36 | ```
37 | SELECT COUNT(*) FROM table_data;
38 | ```
39 | 
40 | - RHive: no semi-colon at the end
41 | 
42 | ```
43 | SELECT COUNT(*) FROM table_data
44 | ```
45 | 
46 | 
47 | 
48 | 
49 | 
50 | Computing the number of taxi rides by hour
51 | ====
52 | 
53 | Using Hive, we can replicate the rmr2 example counting the number of taxi rides by hour. The steps are:
54 | 
55 | 1. Importing the data
56 | 2. Querying the data
57 | 
58 | 
59 | 
60 | 
61 | Defining an external table
62 | ====
63 | 
64 | Starting from our data, we can create a Hive table. Since the data is already in CSV format, the easiest option is to create an external table.
65 | 
66 | We need to specify
67 | 
68 | - The data location
69 | - The data format
70 | - The field formats
71 | 
72 | 
73 | 
74 | 
75 | Query to create an external table
76 | ====
77 | 
78 | ```
79 | CREATE EXTERNAL TABLE taxi_sample(medallion STRING, hack_license STRING, vendor_id STRING, rate_code INT, store_and_fwd_flag STRING, pickup_datetime STRING, dropoff_datetime STRING, passenger_count INT, trip_time_in_secs INT, trip_distance FLOAT, pickup_longitude FLOAT, pickup_latitude FLOAT, dropoff_longitude FLOAT, dropoff_latitude FLOAT) ROW FORMAT
80 | DELIMITED FIELDS TERMINATED BY ','
81 | LINES TERMINATED BY '\n'
82 | STORED AS TEXTFILE
83 | LOCATION '/user/share/taxi/sample';
84 | ```
85 | 
86 | 
87 | 
88 | Query to create an external table: defining the field format
89 | ====
90 | 
91 | The first part of the query defines a table called *taxi_sample* and the format of its fields.
92 | 
93 | ```
94 | CREATE EXTERNAL TABLE taxi_sample(medallion STRING, hack_license STRING, vendor_id STRING, rate_code INT, store_and_fwd_flag STRING, pickup_datetime STRING, dropoff_datetime STRING, passenger_count INT, trip_time_in_secs INT, trip_distance FLOAT, pickup_longitude FLOAT, pickup_latitude FLOAT, dropoff_longitude FLOAT, dropoff_latitude FLOAT) ROW FORMAT
95 | ```
96 | 
97 | 
98 | 
99 | Query to create an external table: defining the data format
100 | ====
101 | 
102 | This part of the query defines the data format.
103 | 
104 | ```
105 | DELIMITED FIELDS TERMINATED BY ','
106 | LINES TERMINATED BY '\n'
107 | STORED AS TEXTFILE
108 | ```
109 | 
110 | 
111 | Query to create an external table: defining the data location
112 | ====
113 | 
114 | This part of the query defines the HDFS path to the data folder.
115 | 
116 | ```
117 | LOCATION '/user/share/taxi/sample';
118 | ```
119 | 
120 | 
121 | Setting up RHive
122 | ====
123 | 
124 | ```{r 2-hive-queries.R, cache=FALSE, include=FALSE}
125 | read_chunk("hive/2-hive-queries.R")
126 | ```
127 | 
128 | ```{r configure-rhive}
129 | ```
130 | 
131 | 
132 | 
133 | 
134 | Simple query: count the number of records
135 | ====
136 | 
137 | 
138 | ```{r row-count}
139 | ```
140 | 
141 | ```{r run-row-count, cache=TRUE}
142 | ```
143 | 
144 | 
145 | 
146 | Simple query: count the number of records
147 | ====
148 | 
149 | 
150 | ```{r build-row-count}
151 | ```
152 | 
153 | ```{r run-row-count, cache=TRUE}
154 | ```
155 | 
156 | 
157 | 
158 | 
159 | Define the hour
160 | ====
161 | 
162 | ```{r define-hour}
163 | ```
164 | 
165 | 
166 | 
167 | 
168 | 
169 | Define the hour
170 | ====
171 | 
172 | ```{r build-define-hour}
173 | ```
174 | 
175 | 
176 | 
177 | Define the hour
178 | ====
179 | 
180 | ```{r run-define-hour, cache=TRUE}
181 | ```
182 | 
183 | 
184 | 
185 | Count by hour
186 | ====
187 | 
188 | ```{r count-by-hour}
189 | ```
190 | 
191 | 
192 | 
193 | 
194 | Count by hour
195 | ====
196 | 
197 | ```{r build-count-by-hour}
198 | ```
199 | 
200 | 
201 | 
202 | Count by hour
203 | ====
204 | 
205 | ```{r run-count-by-hour, cache=TRUE}
206 | ```
207 | 
208 | 
209 | 
210 | Conclusions
211 | ====
212 | 
213 | - Hive allows you to define group-by queries by writing SQL-like code
214 | - The underlying MapReduce code is hidden from the user
215 | - R's string manipulation tools allow you to easily build complex queries
216 | - RHive allows you to run Hive queries directly from R
217 | - For more complex queries, we still need to use other tools like rmr2
218 | 
219 | 
220 | 
221 | 
222 | 
223 | -------------------------------------------------------------------------------- /LICENSE:
-------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. 
The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 
176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 
287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RHadoop-tutorial 2 | A tutorial on R and Hadoop, using the RHadoop project 3 | 4 | ##Slides 5 | 6 | 1. [Using R with Hadoop](http://htmlpreview.github.io/?https://github.com/andrie/RHadoop-tutorial/blob/master/1-Using-R-with-Hadoop.html#/) 7 | 2. [Taxi analysis with RHadoop](http://htmlpreview.github.io/?https://github.com/andrie/RHadoop-tutorial/blob/master/2-Taxi-analysis-with-RHadoop.html) 8 | 3. [Computing on distributed matrices](http://htmlpreview.github.io/?https://github.com/andrie/RHadoop-tutorial/blob/master/4-Computing-on-distributed-matrices.html) 9 | 4. 
[Using hive](http://htmlpreview.github.io/?https://github.com/andrie/RHadoop-tutorial/blob/master/5-hive.html) 10 | 11 | -------------------------------------------------------------------------------- /css/custom.css: -------------------------------------------------------------------------------- 1 | .reveal section del { 2 | color: red; 3 | } 4 | 5 | .section .reveal .state-background { 6 | background: #00F url("css/mslogo.png") no-repeat 5% 95%; 7 | } 8 | 9 | 10 | 11 | /* 12 | .title-blue .reveal .state-background { 13 | background: #00F url("mslogo.png") no-repeat 5% 95%; 14 | } 15 | */ 16 | -------------------------------------------------------------------------------- /css/mslogo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/css/mslogo.png -------------------------------------------------------------------------------- /css/title-background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/css/title-background.png -------------------------------------------------------------------------------- /data/dictionary_trip_data.csv: -------------------------------------------------------------------------------- 1 | "medallion","hack_license","vendor_id","rate_code","store_and_fwd_flag","pickup_datetime","dropoff_datetime","passenger_count","trip_time_in_secs","trip_distance","pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude" 2 | "integer","integer","character","integer","character","character","character","integer","integer","numeric","numeric","numeric","numeric","numeric" 3 | -------------------------------------------------------------------------------- /demo/01-intro-lapply.R: -------------------------------------------------------------------------------- 1 | ## @knitr load-packages 2 | library(rmr2) 3 | library(rhdfs) 4 | hdfs.init() 5 | 6 | rmr.options(backend = "local") 7 | 8 | ## @knitr R --------------------------------------------------------------- 9 | 10 | x <- 1:1000 11 | lapply(x, function(x)cbind(x, x^2)) 12 | 13 | 14 | ## @knitr rmr ------------------------------------------------------------- 15 | 16 | small.ints = to.dfs(1:1000) 17 | 18 | a <- mapreduce( 19 | input = small.ints, 20 | map = function(k, v) cbind(v, v^2) 21 | ) 22 | 23 | a() 24 | from.dfs(a) 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /demo/02-intro-tapply.R: -------------------------------------------------------------------------------- 1 | ## @knitr load-packages 2 | library(rmr2) 3 | library(rhdfs) 4 | hdfs.init() 5 | 6 | rmr.options(backend = "local") 7 | 8 | ## @knitr R --------------------------------------------------------------- 9 | 10 | groups <- rbinom(32, n = 50, prob = 0.4) 11 | tapply(groups, groups, length) 12 | 13 | 14 | ## @knitr rmr ------------------------------------------------------------- 15 | 16 | dfs.groups <- to.dfs(groups) 17 | 18 | x <- mapreduce(input = dfs.groups, 19 | map = function(., v) keyval(v, 1), 20 | reduce = function(k, vv) keyval(k, length(vv)) 21 | ) 22 | 23 | y <- from.dfs(x) 24 | 25 | as.data.frame(y)[order(y[["key"]]), ] 26 | -------------------------------------------------------------------------------- /demo/03-download-ebook-to-hdfs.R: 
-------------------------------------------------------------------------------- 1 | library(rmr2) 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | rmr.options(backend = "local") 6 | 7 | # ------------------------------------------------------------------------- 8 | 9 | dir.create("data") 10 | ebookLocal <- "data/ullyses.txt" 11 | 12 | if(!file.exists(ebookLocal)) { 13 | download.file(url = "http://www.gutenberg.org/ebooks/4300.txt.utf-8", 14 | destfile = ebookLocal) 15 | } 16 | 17 | 18 | 19 | file.exists(ebookLocal) 20 | readLines(ebookLocal, n = 50) 21 | 22 | 23 | # Copy file to HDFS ------------------------------------------------------- 24 | 25 | ebookHadoop <- dirname(ebookLocal) 26 | hdfs.dircreate(ebookHadoop) 27 | hdfs.ls(ebookHadoop) 28 | 29 | hdfs.put(src = ebookLocal, dest = ebookHadoop) 30 | 31 | hdfs.ls(ebookHadoop) 32 | -------------------------------------------------------------------------------- /demo/04-wordcount-1-algorithm.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | rmr.options(backend = "local") 6 | 7 | # ------------------------------------------------------------------------- 8 | 9 | # Script to perform word count -------------------------------------------- 10 | 11 | ebookLocation <- "data/ullyses.txt" 12 | dat <- readLines(ebookLocation, n = 100) 13 | words <- unlist(strsplit(dat, split = "[[:space:][:punct:]]")) 14 | words <- tolower(words) 15 | words <- gsub("[0-9]", "", words) 16 | words <- words[words != ""] 17 | wordcount <- table(words) 18 | keyval( 19 | key = names(wordcount), 20 | val = as.numeric(wordcount) 21 | ) 22 | 23 | 24 | # Function to do word count ----------------------------------------------- 25 | 26 | wordcount <- function(location, n = -1L){ 27 | dat <- readLines(location, n = n) 28 | words <- unlist(strsplit(dat, split = "[[:space:][:punct:]]")) 29 | words <- tolower(words) 30 | words <- gsub("[0-9]", "", words) 31 | words <- words[words != ""] 32 | words <- words[!is.na(words)] 33 | x <- table(words) 34 | keyval( 35 | key = names(x), 36 | val = as.numeric(x) 37 | ) 38 | } 39 | 40 | x <- wordcount("data/ullyses.txt", n = -1) 41 | lapply(x, head, 10) 42 | lapply(x, tail, 10) 43 | -------------------------------------------------------------------------------- /demo/05-wordcount-2-mapreduce.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | rmr.options(backend = "local") 6 | 7 | # Word count -------------------------------------------------------------- 8 | 9 | ebookLocation <- "data/ullyses.txt" 10 | 11 | m <- mapreduce(input = ebookLocation, 12 | input.format = "text", 13 | 14 | map = function(k, v){ 15 | words <- unlist(strsplit(v, split = "[[:space:][:punct:]]")) 16 | words <- tolower(words) 17 | words <- gsub("[0-9]", "", words) 18 | words <- words[words != ""] 19 | wordcount <- table(words) 20 | keyval( 21 | key = names(wordcount), 22 | val = as.numeric(wordcount) 23 | ) 24 | }, 25 | 26 | reduce = function(k, counts){ 27 | keyval(key = k, 28 | val = sum(counts)) 29 | } 30 | ) 31 | 32 | 33 | # Retrieve results and prepare to plot ------------------------------------ 34 | 35 | 36 | x <- from.dfs(m) 37 | dat <- data.frame( 38 | word = keys(x), 39 | count = values(x) 40 | ) 41 | dat <- dat[order(dat$count, decreasing=TRUE), ] 42 | head(dat, 50) 43 | with(head(dat, 25), plot(count, names = word)) 44 | 
-------------------------------------------------------------------------------- /demo/06-logistic-regression-iris.R: -------------------------------------------------------------------------------- 1 | iris2 <- transform(iris, 2 | Setosa = Species == "virginica", 3 | Species = NULL 4 | ) 5 | 6 | model <- glm(Setosa ~ ., data = iris2, family = binomial) 7 | table(iris2$Setosa, 8 | as.logical(round( 9 | predict(model, iris2, type = "response") 10 | , 2)) 11 | ) 12 | -------------------------------------------------------------------------------- /demo/07-logistic-regression.R: -------------------------------------------------------------------------------- 1 | gdescent <- function(input, iterations, dims, alpha){ 2 | 3 | plane = t(rep(0, dims)) 4 | M <- input 5 | for (i in 1:iterations) { 6 | # map 7 | Y <- M[, 1] 8 | X <- M[, -1] 9 | map <- Y * X * plogis(-Y * as.numeric(X %*% t(plane))) 10 | # reduce 11 | gradient <- colSums(map) 12 | 13 | plane <- plane + alpha * gradient 14 | } 15 | plane 16 | } 17 | 18 | 19 | # ------------------------------------------------------------------------ 20 | 21 | library(ggplot2) 22 | mean(diamonds$price) 23 | quantile(diamonds$price) 24 | glm(price > 5324 ~ ., data = diamonds, family = binomial) 25 | iris2 <- transform(iris, 26 | Virginica = Species == "versicolor", 27 | Species = NULL 28 | ) 29 | str(iris2) 30 | 31 | dat <- cbind(Virginica = iris2$Virginica * 2 - 1, 32 | model.matrix(Virginica ~ ., iris2) 33 | ) 34 | gdescent(dat, dims = 5, iterations = 1000, alpha = 0.01) 35 | 36 | coef(glm(Virginica ~ ., data = iris2, family = binomial)) 37 | -------------------------------------------------------------------------------- /demo/08-logistic-regression-mapreduce.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | rmr.options(backend = "local") 3 | 4 | logistic.regression <- function(input, iterations, dims, alpha){ 5 | 6 | plane <- t(rep(0, dims)) 7 | g <- function(z) 1 / (1 + exp(-z)) 8 | 9 | lr.map <- function(., M) { 10 | Y <- M[,1] 11 | X <- M[,-1] 12 | keyval( 13 | 1, 14 | Y * X * g(-Y * as.numeric(X %*% t(plane))) 15 | ) 16 | } 17 | 18 | lr.reduce <- function(k, Z){ 19 | keyval(k, t(as.matrix(apply(Z, 2, sum)))) 20 | } 21 | 22 | for (i in 1:iterations) { 23 | x <- mapreduce( 24 | input, 25 | map = lr.map, 26 | reduce = lr.reduce, 27 | combine = TRUE 28 | ) 29 | gradient <- values(from.dfs(x)) 30 | plane <- plane + alpha * gradient 31 | } 32 | plane 33 | } 34 | 35 | 36 | 37 | # Create design matrix ---------------------------------------------------- 38 | 39 | iris2 <- transform(iris, 40 | Virginica = Species == "virginica", 41 | Species = NULL 42 | ) 43 | 44 | 45 | dat <- cbind(Virginica = iris2$Virginica * 2 - 1, 46 | model.matrix(Virginica ~ ., iris2) 47 | ) 48 | str(dat) 49 | head(dat) 50 | 51 | # Send design matrix to dfs ----------------------------------------------- 52 | 53 | hdp.iris2 <- to.dfs(dat) 54 | hdp.iris2() 55 | from.dfs(hdp.iris2) 56 | model <- logistic.regression(hdp.iris2, dims = 5, iterations = 5, alpha = 0.1) 57 | 58 | model 59 | 60 | 61 | # Inspect confusion matrix ------------------------------------------------ 62 | 63 | # table(iris2$Virginica, 64 | # as.logical(round( 65 | # predict(model, iris2, type = "response") 66 | # , 2)) 67 | # ) 68 | -------------------------------------------------------------------------------- /demo/09-linear-regression.R: -------------------------------------------------------------------------------- 1 | ## @knitr load-packages 
--------------------------------------------------- 2 | library(rmr2) 3 | library(rhdfs) 4 | hdfs.init() 5 | 6 | rmr.options(backend = "local") 7 | 8 | ## @knitr generate-data --------------------------------------------------- 9 | 10 | X <- matrix(rnorm(2000), ncol = 10) 11 | y <- as.matrix(rnorm(200)) 12 | design.mat <- cbind(y, X) 13 | keyed.design <- to.dfs(design.mat) 14 | 15 | ## @knitr sum-function --------------------------------------------------- 16 | # A reusable reduce function that sums a list of matrices, ignoring the key. 17 | 18 | Sum <- function(., YY) keyval(1, list(Reduce('+', YY))) 19 | 20 | ## @knitr XtX ------------------------------------------------------------- 21 | 22 | # The big matrix is passed to the mapper in chunks of complete rows. Smaller cross-products are computed for these submatrices and passed on to a single reducer, which sums them together. Since there is a single key, a combiner is essential to avoid overloading the single reducer, and since matrix addition is associative and commutative, the reducer can safely double as the combiner here. 23 | 24 | XtX <- values(from.dfs( 25 | mapreduce(input = keyed.design, 26 | map = function(., Xi) { 27 | yi = Xi[, 1] 28 | Xi = Xi[, -1] 29 | keyval(1, list(t(Xi) %*% Xi)) 30 | }, 31 | reduce = Sum, 32 | combine = TRUE) 33 | ))[[1]] 34 | 35 | ## @knitr XtY ------------------------------------------------------------- 36 | 37 | # The same approach applies to the vector y. 38 | 39 | Xty <- values(from.dfs( 40 | mapreduce(input = keyed.design, 41 | map = function(., Xi) { 42 | yi = Xi[, 1] 43 | Xi = Xi[, -1] 44 | keyval(1, list(t(Xi) %*% yi)) 45 | }, 46 | reduce = Sum, 47 | combine = TRUE) 48 | ))[[1]] 49 | 50 | ## @knitr solve ----------------------------------------------------------- 51 | 52 | # Finally, solve the normal equations XtX %*% beta = Xty for the coefficients. 53 | 54 | solve(XtX, Xty) 55 | -------------------------------------------------------------------------------- /demo/10-kmeans.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | 15 | library(rmr2) 16 | rmr.options(backend = "local") 17 | 18 | ## kmeans-signature 19 | kmeans.mr <- function(P, num.clusters, num.iter, combine, in.memory.combine) { 20 | ## kmeans-dist.fun 21 | dist.fun <- function(C, P) apply(C, 1, function(x) colSums((t(P) - x)^2)) 22 | 23 | ## kmeans.map 24 | kmeans.map <- function(., P) { 25 | nearest <- if(is.null(C)) 26 | sample(1:num.clusters, nrow(P), replace = TRUE) 27 | else { 28 | D <- dist.fun(C, P) 29 | nearest <- max.col(-D) 30 | } 31 | 32 | if(!(combine || in.memory.combine)) 33 | keyval(nearest, P) 34 | else 35 | keyval(nearest, cbind(1, P))} 36 | 37 | ## kmeans.reduce 38 | kmeans.reduce <- if (!(combine || in.memory.combine) ) 39 | function(., P) t(as.matrix(apply(P, 2, mean))) 40 | else 41 | function(k, P) keyval(k, t(as.matrix(apply(P, 2, sum)))) 42 | 43 | ## kmeans-main-1 44 | C <- NULL 45 | for(i in 1:num.iter ) { 46 | C <- values(from.dfs( 47 | mapreduce(P, 48 | map = kmeans.map, 49 | reduce = kmeans.reduce 50 | ) 51 | )) 52 | if(combine || in.memory.combine) 53 | C <- C[, -1] / C[, 1] 54 | ## end 55 | # points(C, col = i + 1, pch = 19) 56 | ## kmeans-main-2 57 | if(nrow(C) < num.clusters) { 58 | C <- rbind(C, matrix( 59 | rnorm((num.clusters - nrow(C)) * nrow(C)), 60 | ncol = nrow(C)) %*% C 61 | ) 62 | } 63 | } 64 | C 65 | } 66 | ## end 67 | 68 | ## sample runs 69 | ## 70 | 71 | out <- list() 72 | 73 | for(be in c("local")) { 74 | rmr.options(backend = be) 75 | set.seed(0) 76 | ## kmeans-data 77 | P <- do.call(rbind, 78 | rep(list(matrix( 79 | rnorm(10, sd = 10), 80 | ncol=2)), 81 | 20)) + 82 | matrix(rnorm(200), ncol = 2) 83 | ## end 84 | out[[be]] = 85 | ## kmeans-run 86 | kmeans.mr(to.dfs(P), 87 | num.clusters = 12, 88 | num.iter = 5, 89 | combine = FALSE, 90 | in.memory.combine = FALSE 91 | ) 92 | ## end 93 | } 94 | 95 | # we would love to verify this against the hadoop backend, but kmeans is randomized in a way that makes it hard to be completely reproducible 96 | # stopifnot(rmr2:::cmp(out[['hadoop']], out[['local']])) 97 | out[["local"]] 98 | -------------------------------------------------------------------------------- /exercises/ex-1-lapply.R: -------------------------------------------------------------------------------- 1 | ## load-packages 2 | library(rmr2) 3 | library(rhdfs) 4 | hdfs.init() 5 | 6 | rmr.options(backend = "local") 7 | 8 | ## R --------------------------------------------------------------- 9 | 10 | x <- 1:1000 11 | lapply(x, function(x) cbind(x, x^2)) 12 | 13 | 14 | ## rmr ------------------------------------------------------------- 15 | 16 | small.ints <- to.dfs(1:1000) 17 | 18 | a <- mapreduce( 19 | input = small.ints, 20 | map = function(k, v) cbind(v, v^2) 21 | ) 22 | 23 | a() 24 | from.dfs(a) 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /exercises/ex-2-taxi-local.R: -------------------------------------------------------------------------------- 1 | ## load-packages --------------------------------------------------- 2 | library(rmr2) 3 | rmr.options(backend = "local") 4 | 5 | 6 | taxi.hdp <- "data/trip_data_1_sample.csv" 7 | 8 | ## make.input.format-with-colnames-1 ------------------------------- 9 | 10 | headerInfo <- read.csv("data/dictionary_trip_data.csv", stringsAsFactors = FALSE) 11 | headerInfo 12 | colClasses <- as.character(as.vector(headerInfo[1, ])) 13 | names(headerInfo) 14 | colClasses 15 | 16 | taxi.format <- make.input.format(format = "csv", sep = ",", 17 | col.names = names(headerInfo), 18 | colClasses = colClasses, 19 | stringsAsFactors = FALSE
) 21 | 22 | x <- from.dfs(taxi.hdp, format = taxi.format) 23 | str(values(x)) 24 | 25 | 26 | taxi.map <- function(k, v){ 27 | original <- v[[6]] 28 | date <- as.Date(original, origin = "1970-01-01") 29 | wkday <- weekdays(date) 30 | dat <- data.frame(date, wkday) 31 | z <- aggregate(date ~ wkday, dat, FUN = length) 32 | keyval(z[[1]], z[[2]]) 33 | } 34 | 35 | taxi.reduce <- function(k, v){ 36 | data.frame(weekday = k, trips = sum(v), row.names = k) 37 | } 38 | 39 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 40 | map = taxi.map, 41 | reduce = taxi.reduce 42 | ) 43 | keys(from.dfs(m)) 44 | values(from.dfs(m)) 45 | 46 | -------------------------------------------------------------------------------- /exercises/ex-3-put-taxi-data-to-dfs.R: -------------------------------------------------------------------------------- 1 | library(rhdfs) 2 | hdfs.init() 3 | 4 | # List taxi data files in local file system 5 | localFiles <- dir("data", pattern = "_sample.csv", full.names = TRUE) 6 | localFiles 7 | 8 | # Put files into dfs 9 | hdfs.mkdir("taxi/sample") 10 | hdfs.put(localFiles, "taxi/sample") 11 | hdfs.ls("taxi/sample") 12 | 13 | hdfs.ls("taxi/sample")$file 14 | -------------------------------------------------------------------------------- /exercises/ex-4-taxi-hadoop.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | rmr.options(backend = "hadoop") 6 | 7 | hdfs.ls("taxi")$file 8 | homeFolder <- file.path("/user", Sys.getenv("USER")) 9 | taxi.hdp <- file.path(homeFolder, "taxi/sample") 10 | 11 | headerInfo <- read.csv("data/dictionary_trip_data.csv", stringsAsFactors = FALSE) 12 | colClasses <- as.character(as.vector(headerInfo[1, ])) 13 | 14 | taxi.format <- make.input.format(format = "csv", sep = ",", 15 | col.names = names(headerInfo), 16 | colClasses = colClasses, 17 | stringsAsFactors = FALSE 18 | ) 19 | 20 | taxi.map <- function(k, v){ 21 | original <- v[[6]] 22 | date <- as.Date(original, origin = "1970-01-01") 23 | wkday <- weekdays(date) 24 | hour <- format(as.POSIXct(original), "%H") 25 | dat <- data.frame(date, hour) 26 | z <- aggregate(date ~ hour, dat, FUN = length) 27 | keyval(z[[1]], z[[2]]) 28 | } 29 | 30 | taxi.reduce <- function(k, v){ 31 | data.frame(hour = k, trips = sum(v), row.names = k) 32 | } 33 | 34 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 35 | map = taxi.map, 36 | reduce = taxi.reduce 37 | ) 38 | 39 | dat <- values(from.dfs(m)) 40 | 41 | library("ggplot2") 42 | p <- ggplot(dat, aes(x = hour, y = trips, group = 1)) + 43 | geom_smooth(method = loess, span = 0.5, 44 | col = "grey50", fill = "yellow") + 45 | geom_line(col = "blue") + 46 | expand_limits(y = 0) + 47 | ggtitle("Sample of taxi trips in New York") 48 | 49 | 50 | p 51 | -------------------------------------------------------------------------------- /hdinsight/r-installer.ps1: -------------------------------------------------------------------------------- 1 | <# 2 | .SYNOPSIS 3 | Install R to HDInsight cluster. 4 | 5 | .DESCRIPTION 6 | This installs R on HDInsight cluster and it runs on YARN. 7 | 8 | .EXAMPLE 9 | .\r-installer-v02.ps1 -RSrc https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/R-3.1.1-win.exe -RmrSrc https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/rmr2_3.1.2.zip -RhdfsSrc https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/rhdfs_1.0.8.zip 10 | #> 11 | 12 | param ( 13 | # The binary executable installer location for R. 
14 | [Parameter()] 15 | [String]$RSrc, 16 | 17 | # The zip file for R MapReduce. 18 | [Parameter()] 19 | [String]$RmrSrc, 20 | 21 | # The zip file for R HDFS. 22 | [Parameter()] 23 | [String]$RhdfsSrc) 24 | 25 | # Use default parameters in case they are not specified. 26 | if (!$RSrc) 27 | { 28 | $RSrc = "https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/R-3.1.1-win.exe"; 29 | } 30 | if (!$RmrSrc) 31 | { 32 | $RmrSrc = "https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/rmr2_3.1.2.zip"; 33 | } 34 | if (!$RhdfsSrc) 35 | { 36 | $RhdfsSrc = "https://hdiconfigactions.blob.core.windows.net/rconfigactionv01/rhdfs_1.0.8.zip"; 37 | } 38 | 39 | # Download config action module from a well-known directory. 40 | $CONFIGACTIONURI = "https://hdiconfigactions.blob.core.windows.net/configactionmodulev02/HDInsightUtilities-v02.psm1"; 41 | $CONFIGACTIONMODULE = "$env:TEMP\HDInsightUtilities.psm1"; 42 | $webclient = New-Object System.Net.WebClient; 43 | $webclient.DownloadFile($CONFIGACTIONURI, $CONFIGACTIONMODULE); 44 | 45 | # (TIP) Import config action helper method module to make writing config action easy. 46 | if (Test-Path ($CONFIGACTIONMODULE)) 47 | { 48 | Import-Module $CONFIGACTIONMODULE; 49 | } 50 | else 51 | { 52 | Write-Output "Failed to load HDInsightUtilities module, exiting ..."; 53 | exit; 54 | } 55 | 56 | # (TIP) Write-HDILog is the way to write to STDOUT and STDERR in HDInsight config action script. 57 | Write-HDILog "Starting R installation at: $(Get-Date)"; 58 | 59 | $rInstallationRoot = (Get-Item "$env:HADOOP_HOME").parent.FullName+'\R\R-3.1.1'; 60 | $rExecutableDir = $rInstallationRoot + '\bin\x64'; 61 | 62 | # (TIP) Test whether the destination file already exists and this makes the script idempotent so it functions properly upon reboot and reimage. 63 | if (Test-Path $rInstallationRoot) 64 | { 65 | Write-HDILog "Destination: $rInstallationRoot already exists, exiting ..."; 66 | exit; 67 | } 68 | 69 | # Install R. 70 | # (TIP) It is always good to download to user temporary location. 71 | $rDest = $env:temp + '\' + [guid]::NewGuid() + '.exe'; 72 | Save-HDIFile -SrcUri $RSrc -DestFile $rDest; 73 | Start-Process -wait $rDest "/COMPONENTS=x64,main,translation /DIR=$rInstallationRoot /SILENT"; 74 | Remove-Item $rDest; 75 | 76 | # Download rmr and rhdfs libraries. 77 | $rmrDest = $env:temp + '\rmr2_3.1.2.zip'; 78 | Save-HDIFile -SrcUri $RmrSrc -DestFile $rmrDest; 79 | $rhdfsDest = $env:temp + '\rhdfs_1.0.8.zip'; 80 | Save-HDIFile -SrcUri $RhdfsSrc -DestFile $rhdfsDest; 81 | 82 | # Install RMR and RHDFS. 83 | [Environment]::SetEnvironmentVariable('PATH', $env:PATH + ';' + $rExecutableDir, 'Process'); 84 | $output = Invoke-HDICmdScript -CmdToExecute "RScript.exe -e ""install.packages(c('XML', 'getopt', 'dplyr', 'RCurl', 'rJava', 'Rcpp', 'RJSONIO', 'bitops', 'digest', 'functional', 'reshape2', 'stringr', 'plyr', 'caTools', 'stringdist', 'R.utils'), repos='http://ftp.heanet.ie/mirrors/cran.r-project.org/')"""; 85 | 86 | Write-HDILog $output; 87 | $output = Invoke-HDICmdScript -CmdToExecute "R.exe CMD INSTALL $rmrDest"; 88 | Write-HDILog $output; 89 | $output = Invoke-HDICmdScript -CmdToExecute "R.exe CMD INSTALL $rhdfsDest"; 90 | Write-HDILog $output; 91 | 92 | # (TIP) Please clean up temporary files when no longer needed. 93 | Remove-Item $rmrDest; 94 | Remove-Item $rhdfsDest; 95 | 96 | # Config environment variables. 
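These machine-level variables are what the R packages look up at load time: rmr2 reads HADOOP_CMD and HADOOP_STREAMING to find the Hadoop launcher and the streaming jar it submits jobs through, rhdfs uses HADOOP_CMD as well, and HDFS_CMD similarly points at the hdfs launcher; PATH is extended so that RScript.exe and the JVM server DLL resolve on every node.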
97 | [Environment]::SetEnvironmentVariable('PATH', $env:PATH + ';' + $rExecutableDir + ';' + $env:JAVA_HOME + '\jre\bin\server', 'Machine'); 98 | [Environment]::SetEnvironmentVariable('HADOOP_CMD', $env:HADOOP_HOME + '\bin\hadoop', 'Machine'); 99 | [Environment]::SetEnvironmentVariable('HDFS_CMD', $env:HADOOP_HOME + '\bin\hdfs', 'Machine'); 100 | [Environment]::SetEnvironmentVariable('HADOOP_STREAMING', (gci ($env:HADOOP_HOME + '\share\hadoop\tools\lib') -filter *streaming* | Select-Object -First 1 | % { $_.FullName }), 'Machine'); 101 | 102 | # Restart nodemanager to pick up environment variable changes. 103 | if (Get-HDIService -ServiceName nodemanager) 104 | { 105 | Restart-Service nodemanager; 106 | } 107 | 108 | Write-HDILog "Done with R installation at: $(Get-Date)"; -------------------------------------------------------------------------------- /hdinsight/remove.ps1: -------------------------------------------------------------------------------- 1 | $ErrorActionPreference = "Stop" 2 | 3 | # Load settings 4 | . .\settings.ps1 5 | 6 | # Delete cluster 7 | Remove-AzureHDInsightCluster -Name $settings.HDInsightClusterName -------------------------------------------------------------------------------- /hdinsight/settings.ps1: -------------------------------------------------------------------------------- 1 | # Use a unique prefix for cluster resources 2 | $prefix = "andrier" 3 | 4 | $settings = New-Object PSObject -Property @{ 5 | # Subscription 6 | SubscriptionName = "Visual Studio Ultimate with MSDN" 7 | 8 | # Storage account 9 | StorageAccountName = "$($prefix)hadooptutorial" 10 | StorageAccountLabel = "RHadoop-tutorial" 11 | StorageAccountLocation = "West Europe" 12 | 13 | # Cluster 14 | HDInsightClusterName = "$($prefix)-r-hadoop-tutorial" 15 | HDInsightContainerName = "$($prefix)-r-hadoop-tutorial-hdfs" 16 | HDInsightUsername = "admin" 17 | HDInsightPassword = "RHadoopTutorial2015!" 18 | HDInsightClusterSizeInNodes = 2 19 | HDInsightHeadNodeVMSize = "Large" 20 | 21 | # Custom version of the script referenced at: 22 | # Install and use R on HDInsight Hadoop clusters 23 | # https://azure.microsoft.com/en-us/documentation/articles/hdinsight-hadoop-r-scripts 24 | RInstallerScriptUri = "https://raw.githubusercontent.com/StanislawSwierc/RHadoop-tutorial/master/hdinsight/r-installer.ps1" 25 | #RInstallerScriptUri = "https://raw.githubusercontent.com/$($user)/RHadoop-tutorial/master/hdinsight/r-installer.ps1" 26 | } 27 | -------------------------------------------------------------------------------- /hdinsight/setup.ps1: -------------------------------------------------------------------------------- 1 | $ErrorActionPreference = "Stop" 2 | 3 | # Load settings 4 | . .\settings.ps1 5 | 6 | # Check if the Azure PowerShell module is available 7 | if (Get-Module -ListAvailable Azure) { 8 | Write-Host "Importing Azure Powershell module" 9 | Import-Module -Name Azure 10 | } else { 11 | throw "Azure module not available.
Please refer to: How to install and configure Azure PowerShell " + 12 | "(https://azure.microsoft.com/en-us/documentation/articles/powershell-install-configure)" 13 | } 14 | 15 | # Make sure the Azure account and its subscriptions are available in the current session 16 | Add-AzureAccount 17 | 18 | # Select subscription in case there are several subscriptions connected to the account 19 | Select-AzureSubscription $settings.SubscriptionName 20 | 21 | # Check if the storage account exists and create it if it does not 22 | if (!(Get-AzureStorageAccount -StorageAccountName $settings.StorageAccountName -ErrorAction SilentlyContinue)) { 23 | Write-Host "Creating new storage account: $($settings.StorageAccountName)" 24 | New-AzureStorageAccount ` 25 | -StorageAccountName $settings.StorageAccountName ` 26 | -Label $settings.StorageAccountLabel ` 27 | -Location $settings.StorageAccountLocation 28 | } 29 | 30 | # Select storage account 31 | Set-AzureSubscription ` 32 | -SubscriptionName $settings.SubscriptionName ` 33 | -CurrentStorageAccountName $settings.StorageAccountName 34 | 35 | # Get the storage account key 36 | $storageAccountKey = (Get-AzureStorageKey $settings.StorageAccountName).Primary 37 | 38 | # Create cluster configuration 39 | $hdinsightConfig = New-AzureHDInsightClusterConfig ` 40 | -HeadNodeVMSize $settings.HDInsightHeadNodeVMSize ` 41 | -ClusterSizeInNodes $settings.HDInsightClusterSizeInNodes | 42 | Set-AzureHDInsightDefaultStorage ` 43 | -StorageAccountName $settings.StorageAccountName ` 44 | -StorageAccountKey $storageAccountKey ` 45 | -StorageContainerName $settings.HDInsightContainerName | 46 | Add-AzureHDInsightScriptAction ` 47 | -Name "Install R" ` 48 | -ClusterRoleCollection HeadNode,DataNode ` 49 | -Uri $settings.RInstallerScriptUri 50 | 51 | # Convert plain text user name and password to PSCredential object 52 | $hdinsightPasswordSecureString = ConvertTo-SecureString -String $settings.HDInsightPassword -AsPlainText -Force 53 | $hdinsightCredential = New-Object -TypeName System.Management.Automation.PSCredential ` 54 | -ArgumentList $settings.HDInsightUsername, $hdinsightPasswordSecureString 55 | 56 | # Create cluster 57 | $hdinsightCluster = New-AzureHDInsightCluster ` 58 | -Name $settings.HDInsightClusterName ` 59 | -Config $hdinsightConfig ` 60 | -Credential $hdinsightCredential ` 61 | -Location $settings.StorageAccountLocation 62 | 63 | # Check cluster state 64 | Get-AzureHDInsightCluster -Name $settings.HDInsightClusterName 65 | 66 | 67 | -------------------------------------------------------------------------------- /hive/1-create-external-table.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS taxi_sample; 2 | CREATE EXTERNAL TABLE taxi_sample(medallion STRING, hack_license STRING, vendor_id STRING, rate_code INT, store_and_fwd_flag STRING, pickup_datetime STRING, dropoff_datetime STRING, passenger_count INT, trip_time_in_secs INT, trip_distance FLOAT, pickup_longitude FLOAT, pickup_latitude FLOAT, dropoff_longitude FLOAT, dropoff_latitude FLOAT) ROW FORMAT 3 | DELIMITED FIELDS TERMINATED BY ',' 4 | LINES TERMINATED BY '\n' 5 | STORED AS TEXTFILE 6 | LOCATION '/user/andrie.devries/taxi/sample'; 7 | -------------------------------------------------------------------------------- /hive/2-hive-queries.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # @knitr configure-rhive -------------------------------------------------- 4 | # install.packages("RHive") 5 | library(RHive) 6 |
dirHive <- "/user/hive" 7 | Sys.setenv(HIVE_HOME = "/usr/lib/hive") 8 | rhive.init() 9 | rhive.connect(host = "127.0.0.1", hiveServer2 = TRUE) 10 | 11 | 12 | 13 | 14 | # @knitr row-count -------------------------------------------------------- 15 | query_count <- "SELECT COUNT(*) FROM taxi_sample" 16 | 17 | 18 | # @knitr build-row-count -------------------------------------------------- 19 | name_table <- "taxi_sample" 20 | query_count <- sprintf("SELECT COUNT(*) FROM %s", 21 | name_table) 22 | cat(query_count) 23 | 24 | 25 | 26 | # @knitr run-row-count ---------------------------------------------------- 27 | table_count <- rhive.query(query_count) 28 | head(table_count) 29 | 30 | 31 | 32 | # @knitr define-hour ------------------------------------------------------ 33 | query_hour <- " 34 | SELECT pickup_datetime, substring(pickup_datetime, 12, 2) AS hour 35 | FROM taxi_sample LIMIT 100" 36 | 37 | 38 | 39 | # @knitr build-define-hour ------------------------------------------------ 40 | field_time <- "pickup_datetime" 41 | field_hour <- sprintf("substring(%s, 12, 2)", 42 | field_time) 43 | query_hour <- sprintf( 44 | "SELECT %s, %s AS hour 45 | FROM %s LIMIT 100", 46 | field_time, field_hour, name_table) 47 | cat(query_hour) 48 | 49 | 50 | 51 | # @knitr run-define-hour -------------------------------------------------- 52 | head(rhive.query(query_hour)) 53 | 54 | 55 | 56 | # @knitr count-by-hour ---------------------------------------------------- 57 | query_count <- " 58 | SELECT substring(pickup_datetime, 12, 2) AS hour, COUNT(*) AS count 59 | FROM taxi_sample 60 | GROUP BY substring(pickup_datetime, 12, 2)" 61 | 62 | 63 | # @knitr build-count-by-hour ---------------------------------------------- 64 | query_count <- sprintf( 65 | "SELECT %s AS hour, COUNT(*) AS count 66 | FROM %s 67 | GROUP BY %s", 68 | field_hour, name_table, field_hour) 69 | cat(query_count) 70 | 71 | 72 | # @knitr run-count-by-hour ------------------------------------------------ 73 | head(rhive.query(query_count)) 74 | 75 | 76 | -------------------------------------------------------------------------------- /images/SSCP-matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/SSCP-matrix.png -------------------------------------------------------------------------------- /images/cluster-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/cluster-structure.png -------------------------------------------------------------------------------- /images/dilbert-big-data-in-the-cloud.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/dilbert-big-data-in-the-cloud.png -------------------------------------------------------------------------------- /images/img-hadoop-logical-flow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/img-hadoop-logical-flow.png -------------------------------------------------------------------------------- /images/img-rmr2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/img-rmr2.png -------------------------------------------------------------------------------- /images/indeed-job-trend-stats.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/indeed-job-trend-stats.png -------------------------------------------------------------------------------- /images/jfk-times-square.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/jfk-times-square.png -------------------------------------------------------------------------------- /images/mapreduce-weekdays-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/mapreduce-weekdays-0.png -------------------------------------------------------------------------------- /images/mapreduce-weekdays-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/mapreduce-weekdays-1.png -------------------------------------------------------------------------------- /images/mapreduce-weekdays-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/mapreduce-weekdays-2.png -------------------------------------------------------------------------------- /images/r-for-dummies.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/r-for-dummies.jpg -------------------------------------------------------------------------------- /images/r-machine-learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/r-machine-learning.png -------------------------------------------------------------------------------- /images/taxi-tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/taxi-tweet.png -------------------------------------------------------------------------------- /images/xkcd-my-job-is-compiling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/xkcd-my-job-is-compiling.png -------------------------------------------------------------------------------- /images/xkcd-wordcount.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrie/RHadoop-tutorial/87eb3a7609f141d073358dc8b04cbf38fd5b0918/images/xkcd-wordcount.png -------------------------------------------------------------------------------- /taxi/taxi-1-upload.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | 
library(rhdfs) 3 | hdfs.init() 4 | 5 | taxifile <- "data/trip_data_1_sample.csv" 6 | file.exists(taxifile) 7 | 8 | list.files("data") 9 | hdfs.ls("data") 10 | 11 | hdfs.ls("taxi") 12 | hdfs.put("data/trip_data_1_sample.csv", "taxi/trip_data_1_sample.csv") 13 | hdfs.ls("taxi") 14 | 15 | # Put taxi data in dfs 16 | hdfs.ls(".") 17 | hdfs.mkdir("taxi") 18 | 19 | hdfs.put(taxifile, file.path("taxi", basename(taxifile))) 20 | hdfs.ls(".") 21 | -------------------------------------------------------------------------------- /taxi/taxi-2-rmr-local.R: -------------------------------------------------------------------------------- 1 | ## @knitr load-packages --------------------------------------------------- 2 | library(rmr2) 3 | rmr.options(backend = "local") 4 | 5 | ## @knitr make.input.format ----------------------------------------------- 6 | taxi.format <- make.input.format("csv", sep = ",", 7 | colClasses = "character", 8 | stringsAsFactors = FALSE 9 | ) 10 | 11 | 12 | 13 | 14 | ## @knitr from.dfs-1 ------------------------------------------------------ 15 | 16 | taxi.hdp <- "data/trip_data_1_sample.csv" 17 | x <- from.dfs(taxi.hdp, format = taxi.format) 18 | str(x) 19 | 20 | 21 | ## @knitr from.dfs-2 ------------------------------------------------------ 22 | 23 | x <- from.dfs(taxi.hdp, format = taxi.format) 24 | head( 25 | values(x) 26 | ) 27 | 28 | ## @knitr make.input.format-with-colnames-1 ------------------------------- 29 | 30 | headerInfo <- read.csv("data/dictionary_trip_data.csv", stringsAsFactors = FALSE) 31 | headerInfo 32 | colClasses <- as.character(as.vector(headerInfo[1, ])) 33 | names(headerInfo) 34 | colClasses 35 | 36 | ## @knitr make.input.format-with-colnames-2 ------------------------------- 37 | 38 | taxi.format <- make.input.format(format = "csv", sep = ",", 39 | col.names = names(headerInfo), 40 | colClasses = colClasses, 41 | stringsAsFactors = FALSE 42 | ) 43 | 44 | x <- from.dfs(taxi.hdp, format = taxi.format) 45 | str(values(x)) 46 | 47 | 48 | ## @knitr mapreduce-1-a --------------------------------------------------- 49 | 50 | m <- mapreduce(taxi.hdp, input.format = taxi.format) 51 | m 52 | m() 53 | 54 | ## @knitr mapreduce-1-b --------------------------------------------------- 55 | m <- mapreduce(taxi.hdp, input.format = taxi.format) 56 | head( 57 | values(from.dfs(m)) 58 | ) 59 | 60 | 61 | 62 | ## @knitr mapreduce-2 ----------------------------------------------------- 63 | taxi.map <- function(k, v){ 64 | original <- v[[6]] 65 | original 66 | } 67 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 68 | map = taxi.map 69 | ) 70 | head( 71 | values(from.dfs(m)) 72 | ) 73 | 74 | ## @knitr mapreduce-3 ----------------------------------------------------- 75 | taxi.map <- function(k, v){ 76 | original <- v[[6]] 77 | date <- as.Date(original, origin = "1970-01-01") 78 | wkday <- weekdays(date) 79 | keyval(wkday, 1) 80 | } 81 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 82 | map = taxi.map 83 | ) 84 | head( 85 | keys(from.dfs(m)), 86 | 20 87 | ) 88 | head( 89 | values(from.dfs(m)), 90 | 20 91 | ) 92 | 93 | ## @knitr mapreduce-4 ----------------------------------------------------- 94 | taxi.map <- function(k, v){ 95 | original <- v[[6]] 96 | date <- as.Date(original, origin = "1970-01-01") 97 | wkday <- weekdays(date) 98 | keyval(wkday, 1) 99 | } 100 | taxi.reduce <- function(k, v){ 101 | keyval(k, sum(v)) 102 | } 103 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 104 | map = taxi.map, 105 | reduce = taxi.reduce 106 | ) 107 | head( 108 | 
keys(from.dfs(m)) 109 | ) 110 | head( 111 | values(from.dfs(m)) 112 | ) 113 | 114 | ## @knitr mapreduce-5 ----------------------------------------------------- 115 | taxi.map <- function(k, v){ 116 | original <- v[[6]] 117 | date <- as.Date(original, origin = "1970-01-01") 118 | wkday <- weekdays(date) 119 | dat <- data.frame(date, wkday) 120 | z <- aggregate(date ~ wkday, dat, FUN = length) 121 | keyval(z[[1]], z[[2]]) 122 | } 123 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 124 | map = taxi.map 125 | ) 126 | keys(from.dfs(m)) 127 | values(from.dfs(m)) 128 | 129 | ## @knitr mapreduce-6 ----------------------------------------------------- 130 | taxi.map <- function(k, v){ 131 | original <- v[[6]] 132 | date <- as.Date(original, origin = "1970-01-01") 133 | wkday <- weekdays(date) 134 | dat <- data.frame(date, wkday) 135 | z <- aggregate(date ~ wkday, dat, FUN = length) 136 | keyval(z[[1]], z[[2]]) 137 | } 138 | taxi.reduce <- function(k, v){ 139 | data.frame(weekday = k, trips = sum(v), row.names = k) 140 | } 141 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 142 | map = taxi.map, 143 | reduce = taxi.reduce 144 | ) 145 | keys(from.dfs(m)) 146 | values(from.dfs(m)) 147 | 148 | ## @knitr mapreduce-7-a --------------------------------------------------- 149 | taxi.map <- function(k, v){ 150 | original <- v[[6]] 151 | date <- as.Date(original, origin = "1970-01-01") 152 | wkday <- weekdays(date) 153 | hour <- format(as.POSIXct(original), "%H") 154 | dat <- data.frame(date, hour) 155 | z <- aggregate(date ~ hour, dat, FUN = length) 156 | keyval(z[[1]], z[[2]]) 157 | } 158 | 159 | taxi.reduce <- function(k, v){ 160 | data.frame(hour = k, trips = sum(v), row.names = k) 161 | } 162 | 163 | ## @knitr mapreduce-7-b --------------------------------------------------- 164 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 165 | map = taxi.map, 166 | reduce = taxi.reduce 167 | ) 168 | keys(from.dfs(m)) 169 | dat <- values(from.dfs(m)) 170 | dat 171 | 172 | ## @knitr mapreduce-7-plot-1 ---------------------------------------------- 173 | 174 | library("ggplot2") 175 | p <- ggplot(dat, aes(x = hour, y = trips, group = 1)) + 176 | geom_smooth(method = loess, span = 0.5, 177 | col = "grey50", fill = "yellow") + 178 | geom_line(col = "blue") + 179 | expand_limits(y = 0) + 180 | ggtitle("Sample of taxi trips in New York") 181 | 182 | 183 | ## @knitr mapreduce-7-plot-2 ---------------------------------------------- 184 | p 185 | -------------------------------------------------------------------------------- /taxi/taxi-2-rmr-lon-lat.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | 3 | 4 | # Define compute context -------------------------------------------------- 5 | 6 | ### local 7 | rmr.options(backend = "local") 8 | taxi.hdp <- "data/trip_data_1_sample.csv" 9 | 10 | ### hadoop 11 | rmr.options(backend = "hadoop") 12 | homeFolder <- file.path("/user", Sys.getenv("USER")) 13 | taxi.hdp <- file.path(homeFolder, "taxi", "sample") 14 | rmr.options(backend.parameters = list( 15 | "mapreduce.map.java.opts=-Xmx800M", "mapreduce.reduce.java.opts=-Xmx800M")) 16 | 17 | 18 | 19 | # Define input format ----------------------------------------------------- 20 | 21 | headerInfo <- read.csv("data/dictionary_trip_data.csv", stringsAsFactors = FALSE) 22 | colClasses <- as.character(as.vector(headerInfo[1, ])) 23 | 24 | taxi.format <- make.input.format(format = "csv", sep = ",", 25 | col.names = names(headerInfo), 26 | colClasses = colClasses, 27 
| stringsAsFactors = FALSE 28 | ) 29 | 30 | 31 | # Helper functions to compute great circle distance ----------------------- 32 | 33 | 34 | # Calculates the geodesic distance between two points specified by 35 | # radian latitude/longitude using the Spherical Law of Cosines (slc) 36 | # Source: http://www.r-bloggers.com/great-circle-distance-calculations-in-r/ 37 | gcd.slc <- function(long1, lat1, long2, lat2) { 38 | R <- 6371 # Earth mean radius [km] 39 | d <- acos(sin(lat1)*sin(lat2) + cos(lat1)*cos(lat2) * cos(long2-long1)) * R 40 | return(d) # Distance in km 41 | } 42 | 43 | # Convert degrees to radians 44 | deg2rad <- function(deg) return(deg*pi/180) 45 | 46 | 47 | # Mapper: compute trip time for trips from JFK to Times Square ------------ 48 | 49 | taxi.map <- function(k, v){ 50 | # browser() 51 | lon <- deg2rad(v$pickup_longitude) 52 | lat <- deg2rad(v$pickup_latitude) 53 | jfk_lon <- deg2rad(-73.779564) 54 | jfk_lat <- deg2rad(40.646908) 55 | distToJfk <- gcd.slc(lon, lat, jfk_lon, jfk_lat) 56 | 57 | lon <- deg2rad(v$dropoff_longitude) 58 | lat <- deg2rad(v$dropoff_latitude) 59 | ts_lon <- deg2rad(-73.985131) 60 | ts_lat <- deg2rad(40.758895) 61 | distToTimesSquare <- gcd.slc(lon, lat, ts_lon, ts_lat) 62 | 63 | original <- v[distToJfk < 1.6 & distToTimesSquare < 1.6, ] 64 | time <- difftime(as.POSIXct(original$dropoff_datetime), as.POSIXct(original$pickup_datetime), units = "secs") 65 | time <- as.numeric(time) 66 | 67 | keep <- time > 600 # 10 minutes artificial threshold - noisy data 68 | original <- original[keep, ] 69 | time <- time[keep] 70 | 71 | if(nrow(original) == 0){ 72 | z <- data.frame(wkday="None", hour="00", 73 | time = matrix(c(time.1=0, time.2=0), nrow=1), 74 | stringsAsFactors = FALSE 75 | ) 76 | } else { 77 | date <- as.Date(original[[6]], origin = "1970-01-01") 78 | wkday <- weekdays(date) 79 | hour <- format(as.POSIXct(original[[6]]), "%H") 80 | # browser() 81 | dat <- data.frame(wkday, hour, time) 82 | z <- aggregate(time ~ wkday + hour, dat, 83 | FUN = function(x)cbind(sum(x), length(x))) 84 | } 85 | keyval(z[, 1:2], z[, 3]) 86 | } 87 | 88 | # Reducer ----------------------------------------------------------------- 89 | 90 | taxi.reduce <- function(k, v){ 91 | # browser() 92 | time <- sum(v[, 1]) 93 | count <- sum(v[, 2]) 94 | cbind(k, duration = time/count / 60) # convert seconds to minutes 95 | } 96 | 97 | 98 | 99 | # Mapreduce --------------------------------------------------------------- 100 | 101 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 102 | map = taxi.map, 103 | reduce = taxi.reduce 104 | ) 105 | keys(from.dfs(m)) 106 | dat <- values(from.dfs(m)) 107 | dat 108 | 109 | 110 | # Plot results ------------------------------------------------------------ 111 | 112 | library("ggplot2") 113 | ggplot(dat, aes(x = hour, y = duration, group = wkday)) + 114 | geom_point(col = "blue") + 115 | geom_line() + 116 | expand_limits(y = 0) + 117 | facet_grid(wkday ~ .)
+ 118 | ggtitle("Sample of taxi trips in New York") 119 | 120 | 121 | -------------------------------------------------------------------------------- /taxi/taxi-rmr-3-hadoop.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | rmr.options(backend = "hadoop") 6 | 7 | hdfs.ls("taxi")$file 8 | # taxi.hdp <- "/user/andrie.devries/taxi/sample" 9 | homeFolder <- file.path("/user", Sys.getenv("USER")) 10 | taxi.hdp <- file.path(homeFolder, "taxi/sample") 11 | 12 | 13 | headerInfo <- read.csv("data/dictionary_trip_data.csv", stringsAsFactors = FALSE) 14 | colClasses <- as.character(as.vector(headerInfo[1, ])) 15 | 16 | taxi.format <- make.input.format(format = "csv", sep = ",", 17 | col.names = names(headerInfo), 18 | colClasses = colClasses, 19 | stringsAsFactors = FALSE 20 | ) 21 | 22 | taxi.map <- function(k, v){ 23 | original <- v[[6]] 24 | date <- as.Date(original, origin = "1970-01-01") 25 | wkday <- weekdays(date) 26 | hour <- format(as.POSIXct(original), "%H") 27 | dat <- data.frame(date, hour) 28 | z <- aggregate(date ~ hour, dat, FUN = length) 29 | keyval(z[[1]], z[[2]]) 30 | } 31 | 32 | taxi.reduce <- function(k, v){ 33 | data.frame(hour = k, trips = sum(v), row.names = k) 34 | } 35 | 36 | m <- mapreduce(taxi.hdp, input.format = taxi.format, 37 | map = taxi.map, 38 | reduce = taxi.reduce 39 | ) 40 | 41 | dat <- values(from.dfs(m)) 42 | 43 | library("ggplot2") 44 | p <- ggplot(dat, aes(x = hour, y = trips, group = 1)) + 45 | geom_smooth(method = loess, span = 0.5, 46 | col = "grey50", fill = "yellow") + 47 | geom_line(col = "blue") + 48 | expand_limits(y = 0) + 49 | ggtitle("Sample of taxi trips in New York") 50 | 51 | 52 | p 53 | -------------------------------------------------------------------------------- /test.txt: -------------------------------------------------------------------------------- 1 | test 2 | -------------------------------------------------------------------------------- /utils/put-taxi-data-to-dfs.R: -------------------------------------------------------------------------------- 1 | ## @knitr rhdfs ----------------------------------------------------------- 2 | library(rhdfs) 3 | hdfs.init() 4 | 5 | localFiles <- dir("data", pattern = "_sample.csv", full.names = TRUE) 6 | localFiles 7 | hdfs.mkdir("taxi") 8 | hdfs.put(localFiles, "taxi") 9 | hdfs.ls("taxi") 10 | 11 | hdfs.ls("taxi")$file 12 | -------------------------------------------------------------------------------- /utils/sample-taxi-data.R: -------------------------------------------------------------------------------- 1 | infolder <- "C:/Users/adevries/downloads/Taxi/Foil2013" 2 | outfolder <- "C:/Users/adevries/documents/github/RHadoop-tutorial/RHadoop-tutorial/data" 3 | 4 | zips <- list.files(infolder, pattern = "trip_data.*", full.names = TRUE) 5 | 6 | downSample <- function(infile, outfile, n = 1000, keepHeader = FALSE){ 7 | message(basename(outfile)) 8 | con <- file(infile, open = "r") 9 | conout <- file(outfile, open = "a") 10 | 11 | on.exit({ 12 | close(con) 13 | close(conout) 14 | }) 15 | 16 | if(keepHeader){ 17 | header <- readLines(con, n = 1) 18 | writeLines(header, con = conout) 19 | } 20 | 21 | eof <- FALSE 22 | i <- 0 23 | while(!eof){ 24 | dat <- readLines(con, n = n) 25 | if(length(dat) != n) eof <- TRUE 26 | if(length(dat) == 0) break; keep <- sample(dat, 1) # guard: sample() fails on an empty chunk 27 | writeLines(keep, con = conout) 28 | i <- i + 1 29 | if (i %% n == 0) message(i/n) 30 | } 31 | 32 | } 33 | 34 | #
------------------------------------------------------------------------ 35 | 36 | 37 | for(infile in zips){ 38 | outfile <- file.path(outfolder, gsub("\\.csv$", "_sample.csv", basename(infile))) 39 | downSample(infile, outfile) 40 | } 41 | 42 | # file.remove(outfile) 43 | 44 | 45 | dat <- readLines(outfile) 46 | length(dat) 47 | head(dat) 48 | tail(dat) 49 | 50 | --------------------------------------------------------------------------------