├── README
└── airline
    └── src
        └── deptdelay_by_month
            └── R
                ├── hive
                │   ├── README
                │   └── hive.R
                ├── rhipe
                │   ├── README
                │   └── rhipe.R
                ├── rmr
                │   ├── README
                │   ├── deptdelay-rmr.R
                │   └── deptdelay-rmr12.R
                └── streaming
                    ├── README
                    ├── map.R
                    └── reduce.R


/README:
--------------------------------------------------------------------------------
 1 | Examples of integrating Hadoop and R. This directory contains the following:
 2 | 
 3 | airline/
 4 | 
 5 | Examples which use the flight arrival and departure data available here: 
 6 |  
 7 |   http://stat-computing.org/dataexpo/2009/the-data.html
 8 | 
 9 | Note that this is the same data set used for many of the examples in the RHIPE documentation. 
10 | 
11 | The following examples are in this directory:
12 | 
13 | airline/src/deptdelay_by_month/R/streaming/ - Example that uses the Hadoop streaming MapReduce interface to calculate average departure delay by month for each airline.
14 | 
15 | airline/src/deptdelay_by_month/R/hive - Example using Hadoop Interactive for running MapReduce code to calculate average departure delay by month for each airline.
16 | 
17 | airline/src/deptdelay_by_month/R/rhipe - Example using RHIPE to run MapReduce code that calculates average departure delay by month for each airline and then visualize the results.
18 | 
19 | airline/src/deptdelay_by_month/R/rmr - Example using Revolution Analytics rmr package to calculate average departure delay by month for each airline.
20 | 
21 | Instructions for running the code can be found with each example.


--------------------------------------------------------------------------------
/airline/src/deptdelay_by_month/R/hive/README:
--------------------------------------------------------------------------------
 1 | Example script using Hadoop Interactive (hive) to execute a MapReduce job to
 2 | calculate average departure delays per month for each airline. This example
 3 | uses the airline arrival and departure data available here:
 4 |   
 5 |   http://stat-computing.org/dataexpo/2009/the-data.html
 6 | 
 7 | This example assumes the data has been uploaded to "/data/airline/" in HDFS.
 8 | If you've loaded this data into a different directory in HDFS then change the
 9 | "input" value in the hive_stream() function.
10 | 
11 | This example also assumes you've installed Hadoop, R and Hadoop Interactive. This code has been tested with CDH3 (Hadoop 0.20.2), R 2.12.2 and Hadoop Interactive 0.1-10 on Ubuntu 10.10.
12 | 
13 | Running the code:
14 | 
15 | * Set an environment variable pointing to the Hadoop install directory
16 |     $ export HADOOP_HOME=/path/to/hadoop
17 | * Execute the script:
18 |     $ ./hive.R
19 | 


--------------------------------------------------------------------------------
/airline/src/deptdelay_by_month/R/hive/hive.R:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env Rscript
 2 | 
 3 | mapper <- function() {
 4 |   # Streaming map step passed to hive_stream(). For each usable record in the
 5 |   # airline dataset read from stdin, output a new record consisting of
 6 |   # "CARRIER|YEAR|MONTH \t DEPARTURE_DELAY"
 7 |   # (cat() separates its arguments with single spaces, so the tab and the
 8 |   # delay are written space-padded).
 9 | 
10 |   con <- file("stdin", open = "r")
11 |   while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
12 |     fields <- unlist(strsplit(line, "\\,"))
13 |     # Skip bad records and header lines. The length test runs first and `&&`
14 |     # short-circuits, so fields[[1]] is never evaluated for a blank line
15 |     # (strsplit() yields character(0) there and [[1]] would be an error):
16 |     if (length(fields) == 29 && !identical(fields[[1]], "Year")) {
17 |       deptDelay <- fields[[16]]
18 |       # Skip records where departure delay is "NA":
19 |       if (!identical(deptDelay, "NA")) {
20 |         # fields[[9]] is carrier, fields[[1]] is year, fields[[2]] is month:
21 |         cat(paste(fields[[9]], "|", fields[[1]], "|", fields[[2]], sep = ""),
22 |             "\t", deptDelay, "\n")
23 |       }
24 |     }
25 |   }
26 |   close(con)
27 | }
28 | 
29 | reducer <- function() {
30 |   # Streaming reduce step passed to hive_stream(). Reads "KEY \t DELAY" records
31 |   # from stdin, where KEY is "CARRIER|YEAR|MONTH", and for each run of
32 |   # consecutive records sharing a key outputs
33 |   # YEAR \t MONTH \t RECORD_COUNT \t AIRLINE \t AVG_DEPT_DELAY
34 | 
35 |   # Output the summary record for one key from its record count and delay sum:
36 |   emit <- function(key, count, total) {
37 |     keySplit <- unlist(strsplit(key, "\\|"))
38 |     cat(keySplit[[2]], "\t", keySplit[[3]], "\t", count, "\t",
39 |         keySplit[[1]], "\t", total / count, "\n")
40 |   }
41 | 
42 |   con <- file("stdin", open = "r")
43 |   lastKey <- ""
44 |   # Keep a running count and sum per key instead of appending every delay to
45 |   # a vector, which copies the whole vector on each append. count stays an
46 |   # integer so cat() never switches it to scientific notation:
47 |   count <- 0L
48 |   total <- 0
49 |   while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
50 |     parts <- unlist(strsplit(line, "\t"))
51 |     key <- parts[[1]]
52 |     deptDelay <- as.numeric(parts[[2]])
53 | 
54 |     # Start of a new key, so output results for previous key:
55 |     if (!identical(lastKey, "") && !identical(lastKey, key)) {
56 |       emit(lastKey, count, total)
57 |       count <- 0L
58 |       total <- 0
59 |     }
60 |     lastKey <- key
61 |     count <- count + 1L
62 |     total <- total + deptDelay
63 |   }
64 |   close(con)
65 | 
66 |   # We're done, output last record. With no input at all there is no key to
67 |   # report, and indexing the split of the empty lastKey would be an error:
68 |   if (count > 0L) {
69 |     emit(lastKey, count, total)
70 |   }
71 | }
72 | 
73 | library(hive)
74 | # Remove the job's output directory, run the streaming job, then read the
75 | # part-r-00000 output file back into R:
76 | DFS_dir_remove("/dept-delay-month", recursive = TRUE, henv = hive())
77 | hive_stream(mapper = mapper, reducer = reducer,
78 |             input = "/data/airline/", output = "/dept-delay-month")
79 | results <- DFS_read_lines("/dept-delay-month/part-r-00000", henv = hive())
80 | 


--------------------------------------------------------------------------------
/airline/src/deptdelay_by_month/R/rhipe/README:
--------------------------------------------------------------------------------
 1 | Example RHIPE script to calculate average departure delays per month for each
 2 | airline. This example uses the airline arrival and departure data available
 3 | here:
 4 |   
 5 |   http://stat-computing.org/dataexpo/2009/the-data.html
 6 | 
 7 | This example assumes the data has been uploaded to "/data/airline/" in HDFS.
 8 | If you've loaded this data into a different directory in HDFS then change the
 9 | "ifolder" value in the rhmr() function.
10 | 
11 | This example also assumes you've installed Hadoop, R, and RHIPE. This code has been tested with CDH3 (Hadoop 0.20.2), R 2.12.2, and RHIPE 0.65.4 on Ubuntu 10.10.
12 | 
13 | Running the code:
14 | 
15 | * Set an environment variable pointing to the Hadoop install directory
16 |     $ export HADOOP=/path/to/hadoop
17 | * Execute the script:
18 |     $ ./rhipe.R
19 | 
20 | When the script completes you should see an Rplots.pdf file containing the plot
21 | generated from the output data.
22 | 
23 | 


--------------------------------------------------------------------------------
/airline/src/deptdelay_by_month/R/rhipe/rhipe.R:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env Rscript
 2 | 
 3 | # Calculate average departure delays by year and month for each airline in the
 4 | # airline data set (http://stat-computing.org/dataexpo/2009/the-data.html)
 5 | 
 6 | library(Rhipe)
 7 | rhinit(TRUE, TRUE)
 8 | 
 9 | # Output from map is:
10 | # "CARRIER|YEAR|MONTH \t DEPARTURE_DELAY"
11 | map <- expression({
12 |   # For each input record, parse out required fields and output new record:
13 |   extractDeptDelays <- function(line) {
14 |     fields <- unlist(strsplit(line, "\\,"))
15 |     # Skip bad records and header lines. The length test runs first and `&&`
16 |     # short-circuits, so fields[[1]] is never evaluated for a blank line
17 |     # (strsplit() yields character(0) there and [[1]] would be an error):
18 |     if (length(fields) == 29 && !identical(fields[[1]], "Year")) {
19 |       deptDelay <- fields[[16]]
20 |       # Skip records where departure delay is "NA":
21 |       if (!identical(deptDelay, "NA")) {
22 |         # fields[[9]] is carrier, fields[[1]] is year, fields[[2]] is month:
23 |         rhcollect(paste(fields[[9]], "|", fields[[1]], "|", fields[[2]], sep = ""),
24 |                   deptDelay)
25 |       }
26 |     }
27 |   }
28 |   # Process each record in map input:
29 |   lapply(map.values, extractDeptDelays)
30 | })
31 | 
32 | # Output from reduce is:
33 | # YEAR \t MONTH \t RECORD_COUNT \t AIRLINE \t AVG_DEPT_DELAY
34 | reduce <- expression(
35 |   pre = {
36 |     # Empty vector of departure delays to accumulate into:
37 |     delays <- numeric(0)
38 |   },
39 |   reduce = {
40 |     # Depending on size of input, reduce will get called multiple times
41 |     # for each key, so accumulate intermediate values in delays vector:
42 |     delays <- c(delays, as.numeric(reduce.values))
43 |   },
44 |   post = {
45 |     # Process all the intermediate values for key:
46 |     keySplit <- unlist(strsplit(reduce.key, "\\|"))
47 |     count <- length(delays)
48 |     avg <- mean(delays)
49 |     # Key the output by year; the value carries the remaining columns:
50 |     rhcollect(keySplit[[2]],
51 |               paste(keySplit[[3]], count, keySplit[[1]], avg, sep = "\t"))
52 |   }
53 | )
54 | 
55 | inputPath <- "/data/airline/"
56 | outputPath <- "/dept-delay-month"
57 | 
58 | # Create job object:
59 | z <- rhmr(map = map, reduce = reduce,
60 |           ifolder = inputPath, ofolder = outputPath,
61 |           inout = c("text", "text"), jobname = "Avg Departure Delay By Month",
62 |           mapred = list(mapred.reduce.tasks = 2))
63 | # Run it:
64 | rhex(z)
65 | 
66 | library(lattice)
67 | 
68 | # Get the results from HDFS and use to create a dataframe:
69 | results <- rhread(paste(outputPath, "/part-*", sep = ""), type = "text")
70 | write(results, file = "deptdelays.dat")
71 | # Spell out FALSE: F is an ordinary variable that can be reassigned.
72 | deptdelays.monthly.full <- read.delim("deptdelays.dat", header = FALSE)
73 | names(deptdelays.monthly.full) <- c("Year", "Month", "Count", "Airline", "Delay")
74 | deptdelays.monthly.full$Year <- as.character(deptdelays.monthly.full$Year)
75 | 
76 | # Visualize results: histogram of the average delays, one panel per year:
77 | h <- histogram(~Delay | Year, data = deptdelays.monthly.full, layout = c(5, 5))
78 | update(h)
79 | 


--------------------------------------------------------------------------------
/airline/src/deptdelay_by_month/R/rmr/README:
--------------------------------------------------------------------------------
 1 | Example rmr script to calculate average departure delays per month for each
 2 | airline. This example uses the airline arrival and departure data available
 3 | here:
 4 |   
 5 |   http://stat-computing.org/dataexpo/2009/the-data.html
 6 | 
 7 | This example assumes the data has been uploaded to "/data/airline/" in HDFS.
 8 | If you've loaded this data into a different directory in HDFS then change the
 9 | input argument in the call to deptdelay().
10 | 
11 | This example also assumes you've installed Hadoop, R, and rmr. This code has 
12 | been tested with CDH3 (Hadoop 0.20.2), R 2.12.2, and rmr 1.0 on Ubuntu 10.10.
13 | 
14 | Running the code:
15 | 
16 | * Set environment variables pointing to the Hadoop install and Hadoop conf
17 | directories:
18 |     $ export HADOOP_HOME=/path/to/hadoop
19 |     $ export HADOOP_CONF=/path/to/hadoop/conf
20 | * Execute the script:
21 |     $ ./deptdelay-rmr.R
22 | 
23 | When the script completes, output should be in the "/dept-delay-month-orig"
24 | directory in HDFS ("/dept-delay-month-rmr12" when running deptdelay-rmr12.R).
25 | 
26 | 


--------------------------------------------------------------------------------
/airline/src/deptdelay_by_month/R/rmr/deptdelay-rmr.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Calculate average departure delays by year and month for each airline in the
 4 | # airline data set (http://stat-computing.org/dataexpo/2009/the-data.html).
 5 | # Requires rmr package (https://github.com/RevolutionAnalytics/RHadoop/wiki).
 6 | 
 7 | library(rmr)
 8 | 
 9 | # Input format: split a line of text on commas and return the fields as a
10 | # keyval() with a NULL key and the character vector of fields as its value.
11 | csvtextinputformat <- function(line) keyval(NULL, unlist(strsplit(line, "\\,")))
12 | 
13 | # Run the departure delay job with mapreduce(), reading airline CSV data
14 | # from `input` and writing to `output`; returns mapreduce()'s result.
15 | #   map:    keys each usable record's departure delay by
16 | #           c(CARRIER, YEAR, MONTH).
17 | #   reduce: for each key emits YEAR as key and
18 | #           c(MONTH, RECORD_COUNT, CARRIER, AVG_DEPT_DELAY) as value.
19 | deptdelay <- function(input, output) {
20 |   mapreduce(input = input,
21 |             output = output,
22 |             textinputformat = csvtextinputformat,
23 |             map = function(k, fields) {
24 |               # Skip bad records and header lines. The length test runs first
25 |               # and `&&` short-circuits, so fields[[1]] is never evaluated
26 |               # for an empty field vector (where [[1]] would be an error):
27 |               if (length(fields) == 29 && !identical(fields[[1]], "Year")) {
28 |                 deptDelay <- fields[[16]]
29 |                 # Skip records where departure delay is "NA":
30 |                 if (!identical(deptDelay, "NA")) {
31 |                   # fields[[9]] is carrier, fields[[1]] is year,
32 |                   # fields[[2]] is month:
33 |                   keyval(c(fields[[9]], fields[[1]], fields[[2]]), deptDelay)
34 |                 }
35 |               }
36 |             },
37 |             reduce = function(keySplit, vv) {
38 |               keyval(keySplit[[2]],
39 |                      c(keySplit[[3]], length(vv), keySplit[[1]],
40 |                        mean(as.numeric(vv))))
41 |             })
42 | }
43 | 
44 | #from.dfs(deptdelay("/data/airline/1987.csv", "/dept-delay-month"))
45 | from.dfs(deptdelay("/data/airline/", "/dept-delay-month-orig"))
46 | 


--------------------------------------------------------------------------------
/airline/src/deptdelay_by_month/R/rmr/deptdelay-rmr12.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Calculate average departure delays by year and month for each airline in the
 4 | # airline data set (http://stat-computing.org/dataexpo/2009/the-data.html).
 5 | # Requires rmr package (https://github.com/RevolutionAnalytics/RHadoop/wiki).
 6 | 
 7 | #
 8 | # This file is updated to work with the new version 1.2 of rmr:
 9 | # 
10 | # Lots changed in rmr 1.2. The focus of the release was on adding flexibility
11 | # to the I/O, such as adding support for binary files, etc.
12 | # Partially as a result, there are some incompatibilities in calling mapreduce(),
13 | # particularly related to I/O. Relevant changes are noted below.
14 | # 
15 | # --jbreen 2/28/12
16 | #
17 | 
18 | library(rmr)
19 | 
20 | # first, let's adapt our input function to look like one from make.input.format()
21 | # (using the result of make.input.format('csv') would require changes to our mapper )
22 | 
23 | csvtextinputformat <- list(mode = "text",
24 |                            format = function(line) {
25 |                              # Split the line on commas; the fields become
26 |                              # the value of a keyval() with a NULL key:
27 |                              keyval(NULL, unlist(strsplit(line, "\\,")))
28 |                            },
29 |                            streaming.format = NULL)
30 | 
31 | # Run the departure delay job with mapreduce(), reading airline CSV data
32 | # from `input` and writing to `output`; returns mapreduce()'s result.
33 | #   map:    keys each usable record's departure delay by
34 | #           c(CARRIER, YEAR, MONTH).
35 | #   reduce: for each key emits YEAR as key and
36 | #           c(MONTH, RECORD_COUNT, CARRIER, AVG_DEPT_DELAY) as value.
37 | deptdelay <- function(input, output) {
38 |   mapreduce(input = input,
39 |             output = output,
40 |             input.format = csvtextinputformat,
41 |             map = function(k, fields) {
42 |               # Skip bad records and header lines. The length test runs first
43 |               # and `&&` short-circuits, so fields[[1]] is never evaluated
44 |               # for an empty field vector (where [[1]] would be an error):
45 |               if (length(fields) == 29 && !identical(fields[[1]], "Year")) {
46 |                 deptDelay <- fields[[16]]
47 |                 # Skip records where departure delay is "NA":
48 |                 if (!identical(deptDelay, "NA")) {
49 |                   # fields[[9]] is carrier, fields[[1]] is year,
50 |                   # fields[[2]] is month:
51 |                   keyval(c(fields[[9]], fields[[1]], fields[[2]]), deptDelay)
52 |                 }
53 |               }
54 |             },
55 |             reduce = function(keySplit, vv) {
56 |               keyval(keySplit[[2]],
57 |                      c(keySplit[[3]], length(vv), keySplit[[1]],
58 |                        mean(as.numeric(vv))))
59 |             })
60 | }
61 | 
62 | # Run the job, fetch its output as a data frame, label the columns and print.
63 | # TRUE is spelled out because T is an ordinary variable that can be reassigned.
64 | df <- from.dfs(deptdelay("/data/airline/", "/dept-delay-month-rmr12"),
65 |                to.data.frame = TRUE)
66 | colnames(df) <- c("year", "month", "count", "airline", "mean.delay")
67 | print(df)
68 | 


--------------------------------------------------------------------------------
/airline/src/deptdelay_by_month/R/streaming/README:
--------------------------------------------------------------------------------
 1 | Example MapReduce script using the Hadoop streaming interface to calculate
 2 | average departure delays per month for each airline. This example uses the
 3 | airline arrival and departure data available here:
 4 |   
 5 |   http://stat-computing.org/dataexpo/2009/the-data.html
 6 | 
 7 | This example assumes the data has been uploaded to "/data/airline/" in HDFS.
 8 | If you've loaded this data into a different directory in HDFS then change the
 9 | "-input" value in the command line.
10 | 
11 | This example also assumes you've installed Hadoop and R. This code has been tested with CDH3 (Hadoop 0.20.2) and R 2.12.2 on Ubuntu 10.10.
12 | 
13 | Running the code:
14 | 
15 | The following is an example command line to run this example:
16 | 
17 | $ $HADOOP_HOME/bin/hadoop jar \
18 | > $HADOOP_HOME/contrib/streaming/hadoop-streaming-*.jar \
19 | > -input /data/airline/test.dat -output /dept-delay-month \
20 | > -mapper map.R -reducer reduce.R -file map.R -file reduce.R
21 | 


--------------------------------------------------------------------------------
/airline/src/deptdelay_by_month/R/streaming/map.R:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env Rscript
 2 | 
 3 | # Hadoop streaming mapper. For each usable record in the airline dataset
 4 | # read from stdin, output a new record consisting of
 5 | # "CARRIER|YEAR|MONTH \t DEPARTURE_DELAY"
 6 | # (cat() separates its arguments with single spaces, so the tab and the
 7 | # delay are written space-padded).
 8 | 
 9 | con <- file("stdin", open = "r")
10 | while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
11 |   fields <- unlist(strsplit(line, "\\,"))
12 |   # Skip bad records and header lines. The length test runs first and `&&`
13 |   # short-circuits, so fields[[1]] is never evaluated for a blank line
14 |   # (strsplit() yields character(0) there and [[1]] would be an error):
15 |   if (length(fields) == 29 && !identical(fields[[1]], "Year")) {
16 |     deptDelay <- fields[[16]]
17 |     # Skip records where departure delay is "NA":
18 |     if (!identical(deptDelay, "NA")) {
19 |       # fields[[9]] is carrier, fields[[1]] is year, fields[[2]] is month:
20 |       cat(paste(fields[[9]], "|", fields[[1]], "|", fields[[2]], sep = ""),
21 |           "\t", deptDelay, "\n")
22 |     }
23 |   }
24 | }
25 | close(con)
26 | 
27 | 


--------------------------------------------------------------------------------
/airline/src/deptdelay_by_month/R/streaming/reduce.R:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | 
 3 | # Hadoop streaming reducer. Reads "KEY \t DELAY" records from stdin, where
 4 | # KEY is "CARRIER|YEAR|MONTH", and for each run of consecutive records
 5 | # sharing a key outputs a record composed of
 6 | # YEAR \t MONTH \t RECORD_COUNT \t AIRLINE \t AVG_DEPT_DELAY
 7 | 
 8 | # Output the summary record for one key from its record count and delay sum:
 9 | emit <- function(key, count, total) {
10 |   keySplit <- unlist(strsplit(key, "\\|"))
11 |   cat(keySplit[[2]], "\t", keySplit[[3]], "\t", count, "\t",
12 |       keySplit[[1]], "\t", total / count, "\n")
13 | }
14 | 
15 | con <- file("stdin", open = "r")
16 | lastKey <- ""
17 | # Keep a running count and sum per key instead of appending every delay to a
18 | # vector, which copies the whole vector on each append. count stays an
19 | # integer so cat() never switches it to scientific notation:
20 | count <- 0L
21 | total <- 0
22 | while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
23 |   parts <- unlist(strsplit(line, "\t"))
24 |   key <- parts[[1]]
25 |   deptDelay <- as.numeric(parts[[2]])
26 | 
27 |   # Start of a new key, so output results for previous key:
28 |   if (!identical(lastKey, "") && !identical(lastKey, key)) {
29 |     emit(lastKey, count, total)
30 |     count <- 0L
31 |     total <- 0
32 |   }
33 |   lastKey <- key
34 |   count <- count + 1L
35 |   total <- total + deptDelay
36 | }
37 | close(con)
38 | 
39 | # We're done, output last record. With no input at all there is no key to
40 | # report, and indexing the split of the empty lastKey would be an error:
41 | if (count > 0L) {
42 |   emit(lastKey, count, total)
43 | }
44 | 
45 | 


--------------------------------------------------------------------------------