├── .gitattributes
├── .gitignore
├── Makefile
├── README.md
├── build
│   ├── rmr2_2.3.0.tar.gz
│   ├── rmr2_2.3.0.zip
│   ├── rmr2_3.0.0.tar.gz
│   ├── rmr2_3.0.0.zip
│   ├── rmr2_3.1.0.tar.gz
│   ├── rmr2_3.1.0.zip
│   ├── rmr2_3.1.1.tar.gz
│   ├── rmr2_3.1.2.tar.gz
│   ├── rmr2_3.1.2.zip
│   ├── rmr2_3.2.0.tar.gz
│   ├── rmr2_3.2.0.zip
│   ├── rmr2_3.3.0.tar.gz
│   └── rmr2_3.3.0.zip
├── docs
│   ├── IO-speed-tests.Rmd
│   ├── IO-speed-tests.md
│   ├── Makefile
│   ├── OCHUG-presentation
│   │   ├── presentation.Rmd
│   │   └── presentation.md
│   ├── benchmark-slides
│   │   ├── benchmark-slides.Rmd
│   │   └── benchmark-slides.md
│   ├── getting-data-in-and-out.Rmd
│   ├── getting-data-in-and-out.md
│   ├── kmeans.gif
│   ├── new-in-this-release.Rmd
│   ├── new-in-this-release.md
│   ├── readme.Rmd
│   ├── readme.md
│   ├── resources
│   │   ├── Mapreduce.png
│   │   ├── R.png
│   │   ├── hadoop-logo.gif
│   │   ├── hadoop-logo.jpg
│   │   ├── revo-home.png
│   │   ├── revolution.jpeg
│   │   ├── rhadoop.png
│   │   └── rhadoop.svg
│   ├── testing.Rmd
│   ├── testing.md
│   ├── trulia-presentation
│   │   ├── presentation.Rmd
│   │   ├── presentation.md
│   │   ├── summary.Rmd
│   │   └── summary.md
│   ├── tutorial-slides
│   │   ├── tutorial-slides.Rmd
│   │   └── tutorial-slides.md
│   ├── tutorial.Rmd
│   └── tutorial.md
├── hadoopy_hbase
│   ├── README
│   ├── hadoopy_hbase
│   │   ├── Hbase.thrift
│   │   ├── __init__.py
│   │   └── hbase
│   │       ├── Hbase-remote
│   │       ├── Hbase.py
│   │       ├── __init__.py
│   │       ├── constants.py
│   │       └── ttypes.py
│   ├── java
│   │   ├── build.sh
│   │   ├── build.xml
│   │   ├── build_cdh4.sh
│   │   ├── build_linux.sh
│   │   ├── common.xml
│   │   └── src
│   │       └── java
│   │           └── com
│   │               └── dappervision
│   │                   └── hbase
│   │                       └── mapred
│   │                           ├── TypedBytesTableInputFormat.java
│   │                           ├── TypedBytesTableInputFormatBase.java
│   │                           ├── TypedBytesTableRecordReader.java
│   │                           └── TypedBytesTableRecordReaderSingleValue.java
│   ├── setup.py
│   └── tests
│       ├── auth.py
│       ├── flickr_count.py
│       ├── flickr_count_hadoop.py
│       ├── flickr_count_job.py
│       ├── flickr_crawl.py
│       ├── hbase_test.py
│       ├── hbase_test_job.py
│       ├── hbase_test_job2.py
│       ├── server.py
│       ├── thrift_bench.py
│       └── thrift_example.py
└── pkg
    ├── DESCRIPTION
    ├── NAMESPACE
    ├── R
    │   ├── IO.R
    │   ├── basic.R
    │   ├── extras.R
    │   ├── hdfs.R
    │   ├── keyval.R
    │   ├── local.R
    │   ├── mapreduce.R
    │   ├── parse-url.R
    │   ├── quickcheck-rmr.R
    │   └── streaming.R
    ├── examples
    │   ├── airline.R
    │   ├── avro.R
    │   ├── cluster.mr.R
    │   ├── collocations.R
    │   ├── counts.R
    │   ├── hbase.R
    │   ├── large-kmeans-test.R
    │   ├── mclust.mr.R
    │   └── ngram.R
    ├── man
    │   ├── bigdataobject.Rd
    │   ├── dfs.empty.Rd
    │   ├── equijoin.Rd
    │   ├── fromdfstodfs.Rd
    │   ├── hadoop-setting.Rd
    │   ├── keyval.Rd
    │   ├── make.io.format.Rd
    │   ├── mapreduce.Rd
    │   ├── rmr-package.Rd
    │   ├── rmr.options.Rd
    │   ├── rmr.sample.Rd
    │   ├── rmr.str.Rd
    │   ├── scatter.Rd
    │   ├── status.Rd
    │   ├── tomaptoreduce.Rd
    │   └── vsum.Rd
    ├── src
    │   ├── Makevars
    │   ├── Makevars.win
    │   ├── catwin
    │   │   ├── Makefile
    │   │   └── catwin.c
    │   ├── extras.cpp
    │   ├── extras.h
    │   ├── hbase-io
    │   ├── hbase-to-df.cpp
    │   ├── hbase-to-df.h
    │   ├── keyval.cpp
    │   ├── keyval.h
    │   ├── t-list.cpp
    │   ├── t-list.h
    │   ├── typed-bytes.cpp
    │   └── typed-bytes.h
    ├── tests
    │   ├── IO.R
    │   ├── avro.R
    │   ├── basic-examples.R
    │   ├── basic.R
    │   ├── benchmarks.R
    │   ├── getting-data-in-and-out.R
    │   ├── keyval.R
    │   ├── kmeans.R
    │   ├── linear-least-squares.R
    │   ├── logistic-regression.R
    │   ├── mapreduce.R
    │   ├── naive-bayes.R
    │   └── wordcount.R
    └── tools
        └── whirr
            ├── README
            ├── hadoop-ec2-centos.properties
            ├── hadoop-ec2.properties
            ├── lzo-centos.sh
            ├── lzo-ubuntu.sh
            ├── rmr-dev.sh
            ├── rmr-master-centos.sh
            └── rmr-master.sh
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Set default behaviour, in case users don't have core.autocrlf set.
2 | * text=auto
3 |
4 | *.cpp text eol=lf
5 | *.h text eol=lf
6 | *.html text eol=lf
7 | *.java text eol=lf
8 | *.jpeg binary
9 | *.md text eol=lf
10 | *.o binary
11 | *.png binary
12 | *.properties text eol=lf
13 | *.R text eol=lf
14 | *.Rd text eol=lf
15 | *.Rmd text eol=lf
16 | *.sh text eol=lf
17 | *.so binary
18 | *.xml text eol=lf
19 | .gitignore text eol=lf
20 | Makevars* text eol=lf
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #for Mac users
2 | .DS_Store
3 | #for R users
4 | .RData
5 | .Rhistory
6 | *.Rcheck
7 | Rprof.out
8 | #for RStudio users
9 | .Rproj.user
10 | *.Rproj
11 | #for rmr users
12 | rmr-*-env*
13 | rmr-streaming-*
14 | #for emacs users
15 | *~
16 | #for whirr users
17 | whirr.log*
18 | #for emerge users
19 | *.orig
20 | rhbase/pkg/config.log
21 | #Compilation artifacts
22 | src-i386
23 | src-x86_64
24 | *.o
25 | *.so
26 | *.rds
27 | Rprof.out
28 | *.aup
29 | *.au
30 | *.swp
31 | *.gz
32 | !build/*.gz
33 | out/*
34 | *.Rout
35 | file*
36 | quickcheck*
37 | *.html
38 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | TESTS := $(wildcard pkg/tests/*.R)
2 | OUTPUT := $(addprefix out/,$(notdir $(TESTS:.R=.out)))
3 |
4 | check: $(OUTPUT)
5 |
6 | out/%.out:pkg/tests/%.R
7 | R CMD BATCH --vanilla --slave $< $@
8 |
9 | clean:
10 | rm -f $(OUTPUT)
11 | rm -rf rmr-* job_local*
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | rmr2
2 | ====
3 |
4 | A package that allows R developers to use Hadoop MapReduce, developed as part of the RHadoop project. Please see the [RHadoop wiki](https://github.com/RevolutionAnalytics/RHadoop/wiki) for more information.
5 |
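A minimal sketch of what a job looks like (an editorial illustration, not taken from the package documentation; it assumes `rmr2` is installed and either Hadoop is configured or the local backend is selected):

```r
library(rmr2)
rmr.options(backend = "local")          # optional: experiment without a Hadoop cluster
small.ints = to.dfs(1:1000)             # write a vector to the backend file system
squares = mapreduce(
  input = small.ints,
  map = function(k, v) keyval(v, v^2))  # emit (value, value squared) pairs
head(values(from.dfs(squares)))         # read the result back into R
```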
--------------------------------------------------------------------------------
/build/rmr2_2.3.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_2.3.0.tar.gz
--------------------------------------------------------------------------------
/build/rmr2_2.3.0.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_2.3.0.zip
--------------------------------------------------------------------------------
/build/rmr2_3.0.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.0.0.tar.gz
--------------------------------------------------------------------------------
/build/rmr2_3.0.0.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.0.0.zip
--------------------------------------------------------------------------------
/build/rmr2_3.1.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.1.0.tar.gz
--------------------------------------------------------------------------------
/build/rmr2_3.1.0.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.1.0.zip
--------------------------------------------------------------------------------
/build/rmr2_3.1.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.1.1.tar.gz
--------------------------------------------------------------------------------
/build/rmr2_3.1.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.1.2.tar.gz
--------------------------------------------------------------------------------
/build/rmr2_3.1.2.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.1.2.zip
--------------------------------------------------------------------------------
/build/rmr2_3.2.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.2.0.tar.gz
--------------------------------------------------------------------------------
/build/rmr2_3.2.0.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.2.0.zip
--------------------------------------------------------------------------------
/build/rmr2_3.3.0.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.3.0.tar.gz
--------------------------------------------------------------------------------
/build/rmr2_3.3.0.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.3.0.zip
--------------------------------------------------------------------------------
/docs/IO-speed-tests.Rmd:
--------------------------------------------------------------------------------
1 | Knit document for some timing results:
2 |
3 | ```{r, echo=TRUE}
4 | zz = rmr2:::interleave(1:10^6, 1:10^6)
5 | con = file("/tmp/n-test", "wb")
6 | system.time({rmr2:::typedbytes.writer(zz, con, TRUE)})
7 | close(con)
8 | con = file("/tmp/tb-test", "wb")
9 | system.time({rmr2:::typedbytes.writer(zz, con, FALSE)})
10 | close(con)
11 | system.time({save(zz, file= "/tmp/save-test")})
12 | system.time({rmr2:::make.typedbytes.input.format()(file("/tmp/n-test", "rb"), 10^6)})
13 | system.time({rmr2:::make.typedbytes.input.format()(file("/tmp/tb-test", "rb"), 10^6)})
14 | system.time({load(file="/tmp/save-test")})
15 | ```
--------------------------------------------------------------------------------
/docs/IO-speed-tests.md:
--------------------------------------------------------------------------------
1 | Knit document for some timing results:
2 |
3 |
4 | ```r
5 | zz = rmr2:::interleave(1:10^6, 1:10^6)
6 | con = file("/tmp/n-test", "wb")
7 | system.time({
8 | rmr2:::typedbytes.writer(zz, con, TRUE)
9 | })
10 | ```
11 |
12 | ```
13 | ## user system elapsed
14 | ## 0.582 0.033 0.615
15 | ```
16 |
17 | ```r
18 | close(con)
19 | con = file("/tmp/tb-test", "wb")
20 | system.time({
21 | rmr2:::typedbytes.writer(zz, con, FALSE)
22 | })
23 | ```
24 |
25 | ```
26 | ## user system elapsed
27 | ## 0.295 0.023 0.317
28 | ```
29 |
30 | ```r
31 | close(con)
32 | system.time({
33 | save(zz, file = "/tmp/save-test")
34 | })
35 | ```
36 |
37 | ```
38 | ## user system elapsed
39 | ## 2.365 0.022 2.390
40 | ```
41 |
42 | ```r
43 | system.time({
44 | rmr2:::make.typedbytes.input.format()(file("/tmp/n-test", "rb"), 10^6)
45 | })
46 | ```
47 |
48 | ```
49 | ## user system elapsed
50 | ## 9.229 0.374 9.603
51 | ```
52 |
53 | ```r
54 | system.time({
55 | rmr2:::make.typedbytes.input.format()(file("/tmp/tb-test", "rb"), 10^6)
56 | })
57 | ```
58 |
59 | ```
60 | ## Warning: closing unused connection 4 (/tmp/n-test)
61 | ```
62 |
63 | ```
64 | ## user system elapsed
65 | ## 7.387 0.328 7.716
66 | ```
67 |
68 | ```r
69 | system.time({
70 | load(file = "/tmp/save-test")
71 | })
72 | ```
73 |
74 | ```
75 | ## Warning: closing unused connection 4 (/tmp/tb-test)
76 | ```
77 |
78 | ```
79 | ## user system elapsed
80 | ## 0.652 0.001 0.653
81 | ```
82 |
83 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | htmls = $(wildcard *.html)
2 | pdfs = $(htmls:.html=.pdf)
3 | pdf: $(pdfs)
4 |
5 | %.pdf: %.html
6 | /Applications/wkhtmltopdf.app/Contents/MacOS/wkhtmltopdf $< $@
7 |
--------------------------------------------------------------------------------
/docs/benchmark-slides/benchmark-slides.Rmd:
--------------------------------------------------------------------------------
1 | `r read_chunk('../../pkg/tests/benchmarks.R')`
2 | `r opts_chunk$set(echo=TRUE, eval=FALSE, cache=FALSE, tidy=FALSE)`
3 |
4 | ##
5 |
6 |
7 |
8 | ```{r write}
9 | ```
10 |
11 | ```{r read}
12 | ```
13 |
14 |
15 | ##
16 | ```{r pass-through}
17 | ```
18 | ##
19 |
20 |
21 | ```{r predicate }
22 | ```
23 |
24 | ```{r filter }
25 | ```
26 |
27 | ##
28 |
29 |
30 | ```{r select-input }
31 | ```
32 |
33 | ```{r select }
34 | ```
35 |
36 | ##
37 |
38 |
39 | ```{r bigsum-input}
40 | ```
41 |
42 | ```{r bigsum }
43 | ```
44 |
45 | ##
46 | ```{r group-aggregate-input}
47 | ```
48 | ##
49 | ```{r group-aggregate-functions}
50 | ```
51 | ##
52 | ```{r group-aggregate}
53 | ```
54 |
--------------------------------------------------------------------------------
/docs/benchmark-slides/benchmark-slides.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | ##
5 |
6 |
7 |
8 |
9 | ```r
10 | input = to.dfs(1:input.size)
11 | ```
12 |
13 |
14 |
15 | ```r
16 | from.dfs(input)
17 | ```
18 |
19 |
20 |
21 | ##
22 |
23 | ```r
24 | mapreduce(
25 | input,
26 | map = function(k,v) keyval(k,v))
27 | ```
28 |
29 | ##
30 |
31 |
32 |
33 | ```r
34 | predicate =
35 | function(.,v) unlist(v)%%2 == 0
36 | ```
37 |
38 |
39 |
40 | ```r
41 | mapreduce(
42 | input,
43 | map =
44 | function(k,v) {
45 | filter = predicate(k,v);
46 | keyval(k[filter], v[filter])})
47 | ```
48 |
49 |
50 | ##
51 |
52 |
53 |
54 | ```r
55 | input.select =
56 | to.dfs(
57 | data.frame(
58 | a = rnorm(input.size),
59 | b = 1:input.size,
60 | c = sample(as.character(1:10),
61 | input.size,
62 | replace=TRUE)))
63 | ```
64 |
65 |
66 |
67 | ```r
68 | mapreduce(input.select,
69 | map = function(.,v) v$b)
70 | ```
71 |
72 |
73 | ##
74 |
75 |
76 |
77 | ```r
78 | set.seed(0)
79 | big.sample = rnorm(input.size)
80 | input.bigsum = to.dfs(big.sample)
81 | ```
82 |
83 |
84 |
85 | ```r
86 | mapreduce(
87 | input.bigsum,
88 | map =
89 | function(.,v) keyval(1, sum(v)),
90 | reduce =
91 | function(., v) keyval(1, sum(v)),
92 | combine = TRUE)
93 | ```
94 |
95 |
96 | ##
97 |
98 | ```r
99 | input.ga =
100 | to.dfs(
101 | keyval(
102 | 1:input.size,
103 | rnorm(input.size)))
104 | ```
105 |
106 | ##
107 |
108 | ```r
109 | group = function(k,v) k%%100
110 | aggregate = function(x) sum(x)
111 | ```
112 |
113 | ##
114 |
115 | ```r
116 | mapreduce(
117 | input.ga,
118 | map =
119 | function(k,v)
120 | keyval(group(k,v), v),
121 | reduce =
122 | function(k, vv)
123 | keyval(k, aggregate(vv)),
124 | combine = TRUE)
125 | ```
126 |
127 |
--------------------------------------------------------------------------------
/docs/kmeans.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/kmeans.gif
--------------------------------------------------------------------------------
/docs/new-in-this-release.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What's new in `rmr2` x.y.z"
3 | output:
4 | html_document:
5 | keep_md: true
6 | ---
7 |
8 | We switched to using the github [release page](https://github.com/RevolutionAnalytics/rmr2/releases). Please head there and update your links or bookmarks.
--------------------------------------------------------------------------------
/docs/new-in-this-release.md:
--------------------------------------------------------------------------------
1 | # What's new in `rmr2` x.y.z
2 |
3 | We switched to using the github [release page](https://github.com/RevolutionAnalytics/rmr2/releases). Please head there and update your links or bookmarks.
4 |
--------------------------------------------------------------------------------
/docs/readme.Rmd:
--------------------------------------------------------------------------------
1 | Each document is present in three formats. Markdown, extension **.md, is the one you want to click on**. R Markdown, extension .Rmd, is the original format (see the package `knitr` for details) and is the only one that should be edited. HTML is not used at this time.
2 |
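A minimal sketch of how an .md file is regenerated from its .Rmd source with `knitr` (the file name is illustrative):

```r
library(knitr)
knit("tutorial.Rmd")  # writes tutorial.md next to the source
```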
--------------------------------------------------------------------------------
/docs/readme.md:
--------------------------------------------------------------------------------
1 | Each document is present in three formats. Markdown, extension **.md, is the one you want to click on**. R Markdown, extension .Rmd, is the original format (see the package `knitr` for details) and is the only one that should be edited. HTML is not used at this time.
2 |
--------------------------------------------------------------------------------
/docs/resources/Mapreduce.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/Mapreduce.png
--------------------------------------------------------------------------------
/docs/resources/R.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/R.png
--------------------------------------------------------------------------------
/docs/resources/hadoop-logo.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/hadoop-logo.gif
--------------------------------------------------------------------------------
/docs/resources/hadoop-logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/hadoop-logo.jpg
--------------------------------------------------------------------------------
/docs/resources/revo-home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/revo-home.png
--------------------------------------------------------------------------------
/docs/resources/revolution.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/revolution.jpeg
--------------------------------------------------------------------------------
/docs/resources/rhadoop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/rhadoop.png
--------------------------------------------------------------------------------
/docs/resources/rhadoop.svg:
--------------------------------------------------------------------------------
(SVG markup not preserved in this text extraction.)
/docs/testing.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Testing for rmr2 3.3.0"
3 | output:
4 | html_document:
5 | keep_md: true
6 | ---
7 |
8 | The table at the bottom collects testing results for rmr on specific combinations of R, OS and Hadoop releases. We record both positive and negative results when available; the absence of a combination from the table does not imply lack of compatibility. Negative results are recorded, but there is no guarantee that they will be fixed, although a fix is likely for current and common setups. In the early days `rmr` required a specific list of patches to be present in Hadoop to work. Currently, we expect it to work on any current or recent distribution from the Apache Software Foundation, Hortonworks, Cloudera and MapR.
9 |
10 | Testing is conducted by running `R CMD check path-to-rmr`; it requires an additional dependency, quickcheck, also downloadable from our wiki. Failures in producing documentation in legacy formats are ignored. Notes and warnings do not by themselves determine success or failure, but it may be helpful to report them in the issue tracker. Please contribute additional testing reports.
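As a hedged sketch, the same checks can be scripted from an R session (assuming the working directory is the repository root, where the package sources live under `pkg`):

```r
# full package check, as described above
system("R CMD check pkg")
# or run a single test script, similar to what the top-level Makefile does
system("R CMD BATCH --vanilla pkg/tests/wordcount.R")
```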
11 |
12 | If you are interested in the testing conducted on other releases, choose one from the drop down menu on the top left, under tags and find this document again (under docs).
13 |
14 | | Hadoop       | R                                    | OS         | Notes | Reporter   |
15 | |--------------|--------------------------------------|------------|-------|------------|
16 | | Hadoop 2.4.0 | R 3.1.1 (Revolution R Open 8.0 beta) | CentOS 6.4 |       | Revolution |
17 |
--------------------------------------------------------------------------------
/docs/testing.md:
--------------------------------------------------------------------------------
1 | # Testing for rmr2 3.3.0
2 |
3 | The table at the bottom collects testing results for rmr on specific combinations of R, OS and Hadoop releases. We record both positive and negative results when available; the absence of a combination from the table does not imply lack of compatibility. Negative results are recorded, but there is no guarantee that they will be fixed, although a fix is likely for current and common setups. In the early days `rmr` required a specific list of patches to be present in Hadoop to work. Currently, we expect it to work on any current or recent distribution from the Apache Software Foundation, Hortonworks, Cloudera and MapR.
4 |
5 | Testing is conducted by running `R CMD check path-to-rmr`; it requires an additional dependency, quickcheck, also downloadable from our wiki. Failures in producing documentation in legacy formats are ignored. Notes and warnings do not by themselves determine success or failure, but it may be helpful to report them in the issue tracker. Please contribute additional testing reports.
6 |
7 | If you are interested in the testing conducted on other releases, choose one from the drop down menu on the top left, under tags and find this document again (under docs).
8 | | Hadoop       | R                                    | OS         | Notes | Reporter   |
9 | |--------------|--------------------------------------|------------|-------|------------|
10 | | Hadoop 2.4.0 | R 3.1.1 (Revolution R Open 8.0 beta) | CentOS 6.4 |       | Revolution |
11 |
--------------------------------------------------------------------------------
/docs/trulia-presentation/summary.Rmd:
--------------------------------------------------------------------------------
1 | # Scalable Analytics in R with rmr
2 |
3 | *RHadoop* is an open source project started by Revolution Analytics to provide data scientists using R access to Hadoop’s scalability without giving up their favorite language’s flexibility and convenience.
4 |
5 | So far it has three main packages:
6 |
7 | * rhdfs provides file level manipulation for HDFS, the Hadoop file system
8 | * rhbase provides access to HBase, the Hadoop database
9 | * rmr makes it possible to write mapreduce programs in R. This will be the focus of this presentation.
10 |
11 | rmr allows R developers to program in the mapreduce framework and offers all developers an alternative way to implement mapreduce programs, one that strikes a delicate compromise between power and usability. It lets you write general mapreduce programs with the full power and ecosystem of an existing, established programming language. It doesn’t force you to replace the R interpreter with a special run-time: it is just a library. You can write logistic regression in half a page and even understand it. It feels and behaves almost like the usual R iteration and aggregation primitives. It comprises a handful of functions with a modest number of arguments and sensible defaults that combine in many useful ways. But there is no way to prove that an API works: one can only show examples of what it makes possible, and we will do that, covering a few from machine learning and statistics. Finally, we will discuss how to get involved.
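As a taste of the API, here is a sketch of a word counting job, loosely modeled on `pkg/tests/wordcount.R` (not quoted verbatim):

```r
library(rmr2)
wordcount =
  function(input, pattern = " ")
    mapreduce(
      input = input,
      input.format = "text",       # read plain text, one line per record
      map =
        function(., lines)
          keyval(unlist(strsplit(lines, split = pattern)), 1),  # one pair per word
      reduce =
        function(word, counts)
          keyval(word, sum(counts)),                            # total occurrences
      combine = TRUE)
```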
--------------------------------------------------------------------------------
/docs/trulia-presentation/summary.md:
--------------------------------------------------------------------------------
1 | # Scalable Analytics in R with rmr
2 |
3 | *RHadoop* is an open source project started by Revolution Analytics to provide data scientists using R access to Hadoop’s scalability without giving up their favorite language’s flexibility and convenience.
4 |
5 | So far it has three main packages:
6 |
7 | * rhdfs provides file level manipulation for HDFS, the Hadoop file system
8 | * rhbase provides access to HBase, the Hadoop database
9 | * rmr makes it possible to write mapreduce programs in R. This will be the focus of this presentation.
10 |
11 | rmr allows R developers to program in the mapreduce framework and offers all developers an alternative way to implement mapreduce programs, one that strikes a delicate compromise between power and usability. It lets you write general mapreduce programs with the full power and ecosystem of an existing, established programming language. It doesn’t force you to replace the R interpreter with a special run-time: it is just a library. You can write logistic regression in half a page and even understand it. It feels and behaves almost like the usual R iteration and aggregation primitives. It comprises a handful of functions with a modest number of arguments and sensible defaults that combine in many useful ways. But there is no way to prove that an API works: one can only show examples of what it makes possible, and we will do that, covering a few from machine learning and statistics. Finally, we will discuss how to get involved.
12 |
--------------------------------------------------------------------------------
/docs/tutorial-slides/tutorial-slides.Rmd:
--------------------------------------------------------------------------------
1 | ```{r}
2 | library(knitr)
3 | read_chunk('../../pkg/tests/benchmarks.R')
4 | read_chunk('../../pkg/tests/basic-examples.R')
5 | read_chunk('../../pkg/tests/wordcount.R')
6 | read_chunk('../../pkg/tests/logistic-regression.R')
7 | read_chunk('../../pkg/tests/linear-least-squares.R')
8 | read_chunk('../../pkg/tests/kmeans.R')
9 | opts_chunk$set(echo=TRUE, eval=FALSE, cache=FALSE, tidy=FALSE)
10 | ```
11 |
12 |
13 | ## RHadoop Tutorial
14 | ### Revolution Analytics
15 | #### Antonio Piccolboni
16 | #### rhadoop@revolutionanalytics.com
17 | #### antonio@piccolboni.info
18 |
19 | # RHadoop
20 |
21 | ##
22 |
23 | - R + Hadoop
24 | - OSS
25 | -
26 | -
27 | - rhdfs
28 | - rhbase
29 | - rmr2
30 |
31 | # Mapreduce
32 |
33 | ##
34 |
35 |
36 |
37 | ```{r lapply}
38 | ```
39 |
40 | ```{r lapply-mapreduce}
41 | ```
42 |
43 |
44 | ##
45 |
46 |
47 |
48 | ```{r tapply}
49 | ```
50 |
51 | ```{r tapply-mapreduce}
52 | ```
53 |
54 |
55 | # rmr-ABC
56 |
57 | ##
58 |
59 |
60 |
61 | ```{r write}
62 | ```
63 |
64 | ```{r read}
65 | ```
66 |
67 |
68 | ##
69 | ```{r pass-through}
70 | ```
71 | ##
72 |
73 |
74 | ```{r predicate }
75 | ```
76 |
77 | ```{r filter }
78 | ```
79 |
80 | ##
81 |
82 |
83 | ```{r select-input }
84 | ```
85 |
86 | ```{r select }
87 | ```
88 |
89 | ##
90 |
91 |
92 | ```{r bigsum-input}
93 | ```
94 |
95 | ```{r bigsum }
96 | ```
97 |
98 | ##
99 | ```{r group-aggregate-input}
100 | ```
101 | ##
102 | ```{r group-aggregate-functions}
103 | ```
104 | ##
105 | ```{r group-aggregate}
106 | ```
107 |
108 | # Wordcount
109 | ##
110 |
111 |
112 |
113 | ```{r wordcount-signature}
114 | ```
115 |
116 | ```{r wordcount-mapreduce}
117 | ```
118 |
119 |
120 | ##
121 |
122 |
123 |
124 | ```{r wordcount-map}
125 | ```
126 |
127 | ```{r wordcount-reduce}
128 | ```
129 |
130 |
131 |
132 |
133 | # Logistic Regression
134 |
135 | ##
136 |
137 |
138 |
139 | ```{r logistic.regression-signature}
140 | ```
141 |
142 | ```{r logistic.regression-main}
143 | ```
144 |
145 |
146 | ##
147 |
148 |
149 |
150 | ```{r logistic.regression-map}
151 | ```
152 |
153 | ```{r logistic.regression-reduce}
154 | ```
155 |
156 |
157 | ##
158 |
159 |
160 |
161 |
162 | ```{r logistic.regression-data}
163 | ```
164 |
165 | ```{r logistic.regression-run}
166 | ```
167 |
168 |
169 |
170 | # K-means
171 |
172 | ##
173 |
174 | ```{r kmeans-dist.fun}
175 | ```
176 |
177 | ##
178 |
179 | ```{r kmeans.map}
180 | ```
181 |
182 | ##
183 |
184 | ```{r kmeans.reduce}
185 | ```
186 |
187 | ##
188 |
189 | ```{r kmeans-signature}
190 | ```
191 |
192 | ##
193 |
194 | ```{r kmeans-main-1}
195 | ```
196 |
197 | ##
198 |
199 | ```{r kmeans-main-2}
200 | ```
201 |
202 | ##
203 |
204 |
205 |
206 | ```{r kmeans-data}
207 | ```
208 |
209 | ```{r kmeans-run}
210 | ```
211 |
212 |
213 | # Linear Least Squares
214 |
215 | ##
216 |
217 | $$ \mathbf{X b = y}$$
218 |
219 | ```
220 | solve(t(X)%*%X, t(X)%*%y)
221 | ```
222 |
223 | ##
224 |
225 | ```{r LLS-sum}
226 | ```
227 |
228 | ##
229 | ```{r LLS-XtX}
230 | ```
231 |
232 | ##
233 | ```{r LLS-Xty}
234 | ```
235 |
236 | ##
237 |
238 |
239 |
240 | ```{r LLS-solve}
241 | ```
242 |
243 |
244 | ```{r LLS-data}
245 | ```
246 |
--------------------------------------------------------------------------------
/hadoopy_hbase/README:
--------------------------------------------------------------------------------
1 | Hadoopy HBase
2 | License: Apache V2 (http://www.apache.org/licenses/LICENSE-2.0)
3 |
4 |
5 | How to Run Tests
6 | sudo /usr/lib/hbase/bin/start-hbase.sh
7 | sudo /etc/init.d/hadoop-hbase-thrift start
8 | cd java
9 | sudo bash build.sh
10 | cd ../tests
11 | python thrift_example.py
12 | python hbase_test.py
13 |
14 |
15 |
16 |
17 | Acknowledgements: Lasthbase (https://github.com/tims/lasthbase) inspired this module but is incompatible with newer HBase versions, which made it necessary to start from scratch. Its build scripts and general layout were used as a starting point.
18 |
19 |
20 |
--------------------------------------------------------------------------------
/hadoopy_hbase/hadoopy_hbase/__init__.py:
--------------------------------------------------------------------------------
1 | import hadoopy
2 | from thrift.transport.TSocket import TSocket
3 | from thrift.transport.TTransport import TBufferedTransport
4 | from thrift.protocol import TBinaryProtocol
5 | from hbase import Hbase
6 | from hbase.ttypes import ColumnDescriptor, Mutation, BatchMutation, TScan
7 | import hadoopy_hbase
8 | import hashlib
9 | import base64
10 |
11 |
12 | def connect(server='localhost', port=9090):
13 | transport = TBufferedTransport(TSocket(server, int(port)))
14 | transport.open()
15 | protocol = TBinaryProtocol.TBinaryProtocol(transport)
16 | client = Hbase.Client(protocol)
17 | return client
18 |
19 |
20 | def scanner_create_id(client, table, columns=None, start_row=None, stop_row=None, filter=None, caching=None):
21 | return client.scannerOpenWithScan(table, TScan(startRow=start_row, stopRow=stop_row, columns=columns if columns else [], caching=caching, filterString=filter))
22 |
23 |
24 | def scanner_from_id(client, table, sc, per_call=1, close=True):
25 | try:
26 | if per_call == 1:
27 | scanner = lambda : client.scannerGet(sc)
28 | else:
29 | scanner = lambda : client.scannerGetList(sc, per_call)
30 | while True:
31 | outs = scanner()
32 | if outs:
33 | for out in outs:
34 | yield (out.row, dict((x, y.value) for x, y in out.columns.items()))
35 | else:
36 | break
37 | finally:
38 | if sc is not None and close:
39 | client.scannerClose(sc)
40 |
41 |
42 | def scanner(client, table, per_call=1, close=True, **kw):
43 | sc = scanner_create_id(client, table, **kw)
44 | return scanner_from_id(client, table, sc, per_call, close)
45 |
46 |
47 | def scanner_row_column(client, table, column, **kw):
48 | scanner = hadoopy_hbase.scanner(client, table, columns=[column], **kw)
49 | for row, cols in scanner:
50 | yield row, cols[column]
51 |
52 |
53 | def scanner_column(*args, **kw):
54 | return (y for x, y in scanner_row_column(*args, **kw))
55 |
56 |
57 | def _launch_args(hbase_in, hbase_out, columns, start_row, stop_row, single_value, kw):
58 | if hbase_in:
59 | kw['input_format'] = 'com.dappervision.hbase.mapred.TypedBytesTableInputFormat'
60 | if hbase_out:
61 | kw['output_format'] = 'com.dappervision.hbase.mapred.TypedBytesTableOutputFormat'
62 | jobconfs = hadoopy._runner._listeq_to_dict(kw.get('jobconfs', []))
63 | jobconfs['hbase.mapred.tablecolumnsb64'] = ' '.join(map(base64.b64encode, columns))
64 | if start_row is not None:
65 | jobconfs['hbase.mapred.startrowb64'] = base64.b64encode(start_row)
66 | if stop_row is not None:
67 | jobconfs['hbase.mapred.stoprowb64'] = base64.b64encode(stop_row)
68 | if single_value:
69 | jobconfs['hbase.mapred.valueformat'] = 'singlevalue'
70 | kw['jobconfs'] = jobconfs
71 |
72 |
73 | def launch_frozen(in_name, out_name, script_path, hbase_in=True, hbase_out=False, columns=(), start_row=None, stop_row=None, single_value=None, **kw):
74 | _launch_args(hbase_in, hbase_out, columns, start_row, stop_row, single_value, kw)
75 | hadoopy.launch_frozen(in_name, out_name, script_path, **kw)
76 |
77 |
78 | def launch(in_name, out_name, script_path, hbase_in=True, hbase_out=False, columns=(), start_row=None, stop_row=None, single_value=None, **kw):
79 | _launch_args(hbase_in, hbase_out, columns, start_row, stop_row, single_value, kw)
80 | hadoopy.launch(in_name, out_name, script_path, **kw)
81 |
82 |
83 | class HBaseColumnDict(object):
84 |
85 | def __init__(self, table, row, cf, db=None, **kw):
86 | if db is None:
87 | self._db = hadoopy_hbase.connect(**kw)
88 | else:
89 | self._db = db
90 | self._table = table
91 | self._row = row
92 | self._cf = cf + ':'
93 |
94 | def __setitem__(self, key, value):
95 | assert isinstance(key, str)
96 | assert isinstance(value, str)
97 | self._db.mutateRow(self._table, self._row, [hadoopy_hbase.Mutation(column=self._cf + key, value=value)])
98 |
99 | def __getitem__(self, key):
100 | assert isinstance(key, str)
101 | result = self._db.get(self._table, self._row, self._cf + key)
102 | if not result:
103 | raise KeyError
104 | return result[0].value
105 |
106 | def __delitem__(self, key):
107 | assert isinstance(key, str)
108 | self._db.mutateRow(self._table, self._row, [hadoopy_hbase.Mutation(column=self._cf + key, isDelete=True)])
109 |
110 | def items(self):
111 | result = self._db.getRow(self._table, self._row)
112 | if not result:
113 | return []
114 | return [(x, y.value) for x, y in result[0].columns.items()]
115 |
116 |
117 | class HBaseRowDict(object):
118 |
119 | def __init__(self, table, col, db=None, **kw):
120 | if db is None:
121 | self._db = hadoopy_hbase.connect(**kw)
122 | else:
123 | self._db = db
124 | self._table = table
125 | self._col = col
126 |
127 | def __setitem__(self, key, value):
128 | assert isinstance(key, str)
129 | assert isinstance(value, str)
130 | self._db.mutateRow(self._table, key, [hadoopy_hbase.Mutation(column=self._col, value=value)])
131 |
132 | def __getitem__(self, key):
133 | assert isinstance(key, str)
134 | result = self._db.get(self._table, key, self._col)
135 | if not result:
136 | raise KeyError
137 | return result[0].value
138 |
139 | def __delitem__(self, key):
140 | assert isinstance(key, str)
141 | self._db.mutateRow(self._table, key, [hadoopy_hbase.Mutation(column=self._col, isDelete=True)])
142 |
143 |
144 | def hash_key(*args, **kw):
145 | """Convenient key engineering function
146 |
147 | Allows for raw prefix/suffix, with other arguments md5 hashed and truncated.
148 | The key is only guaranteed to be unique if its prefix+suffix is unique. If
149 | being used to create a start key, you can leave off args/suffix but they must
150 | be done in order (e.g., if you leave off an arg you must also leave off suffix).
151 |
152 | Args:
153 | *args: List of arguments to hash in order using hash_bytes of md5
154 | prefix: Raw prefix of the string (default '')
155 | suffix: Raw suffix of the string (default '')
156 | delimiter: Raw delimiter of each field (default '')
157 | hash_bytes: Number of md5 bytes (binary not hex) for each of *args
158 |
159 | Returns:
160 | Combined key (binary)
161 | """
162 | prefix = kw.get('prefix', '')
163 | suffix = kw.get('suffix', '')
164 | delimiter = kw.get('delimiter', '')
165 | if args:
166 | try:
167 | hash_bytes = kw['hash_bytes']
168 | except KeyError:
169 | raise ValueError('hash_bytes keyword argument must be specified')
170 | return delimiter.join([prefix] + [hashlib.md5(x).digest()[:hash_bytes] for x in args] + [suffix])
171 | else:
172 | return delimiter.join([prefix, suffix])
173 |
174 |
--------------------------------------------------------------------------------
/hadoopy_hbase/hadoopy_hbase/hbase/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ['ttypes', 'constants', 'Hbase']
2 |
--------------------------------------------------------------------------------
/hadoopy_hbase/hadoopy_hbase/hbase/constants.py:
--------------------------------------------------------------------------------
1 | #
2 | # Autogenerated by Thrift Compiler (0.8.0)
3 | #
4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING
5 | #
6 | # options string: py
7 | #
8 |
9 | from thrift.Thrift import TType, TMessageType, TException
10 | from ttypes import *
11 |
12 |
--------------------------------------------------------------------------------
/hadoopy_hbase/java/build.sh:
--------------------------------------------------------------------------------
1 | echo "Note this assumes that the paths/versions are correct, make changes as necesssary"
2 | HADOOP_PATH="/usr/lib/hadoop"
3 | HBASE_PATH="/usr/lib/hbase"
4 |
5 | echo "Copying HBASE libs to Hadoop library path (simple way so that it can find them)"
6 | sudo cp -R ${HBASE_PATH}/lib/* ${HADOOP_PATH}/lib/
7 | sudo cp -R ${HBASE_PATH}/*.jar ${HADOOP_PATH}/lib/
8 |
9 | echo "Copying libs into local build directory"
10 | mkdir -p ./lib/
11 | cp ${HBASE_PATH}/lib/commons-logging* ./lib/
12 | cp ${HBASE_PATH}/hbase-* ./lib/
13 | cp ${HADOOP_PATH}/hadoop-*-core.jar ./lib/
14 | cp ${HADOOP_PATH}/contrib/streaming/hadoop-streaming-*.jar ./lib/
15 |
16 |
17 | echo "Building hadoopy_hbase.jar"
18 | ant
19 |
20 | echo "Copying hadoopy_hbase.jar into Hadoop library path"
21 | cp build/dist/hadoopy_hbase.jar ${HADOOP_PATH}/lib/hadoopy_hbase.jar
22 |
23 | echo "Restarting jobtracker and tasktracker"
24 | /etc/init.d/hadoop-0.20-jobtracker restart
25 | /etc/init.d/hadoop-0.20-tasktracker restart
--------------------------------------------------------------------------------
/hadoopy_hbase/java/build.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this text extraction.)
--------------------------------------------------------------------------------
/hadoopy_hbase/java/build_cdh4.sh:
--------------------------------------------------------------------------------
1 |
2 | #echo "Copying HBASE libs to Hadoop library path (simple way so that it can find them)"
3 | #sudo cp -R ${HBASE_PATH}/lib/* ${HADOOP_HOME}/lib/
4 | #sudo cp -R ${HBASE_PATH}/*.jar ${HADOOP_HOME}/lib/
5 |
6 | echo "Copying libs into local build directory"
7 | mkdir -p ./lib/
8 | echo $HBASE_HOME
9 | echo $HADOOP_HOME
10 | cp ${HBASE_HOME}/lib/commons-logging* ./lib/
11 | cp ${HBASE_HOME}/hbase-* ./lib/
12 | cp ${HADOOP_COMMONS_HOME}/*.jar ./lib/
13 | cp ${HADOOP_HOME}/hadoop-*-core.jar ./lib/
14 | cp ${HADOOP_HOME}/contrib/streaming/hadoop-streaming-*.jar ./lib/
15 | cp /usr/share/java/commons-codec.jar ./lib/
16 |
17 |
18 | echo "Building hadoopy_hbase.jar"
19 | ant
20 |
21 | echo "Copying hadoopy_hbase.jar into Hadoop library path"
22 | #cp build/dist/hadoopy_hbase.jar ${HADOOP_HOME}/lib/hadoopy_hbase.jar
23 |
24 | #echo "Restarting jobtracker and tasktracker"
25 | #/etc/init.d/hadoop-0.20-jobtracker restart
26 | #/etc/init.d/hadoop-0.20-tasktracker restart
--------------------------------------------------------------------------------
/hadoopy_hbase/java/common.xml:
--------------------------------------------------------------------------------
(XML content not preserved in this text extraction.)
--------------------------------------------------------------------------------
/hadoopy_hbase/java/src/java/com/dappervision/hbase/mapred/TypedBytesTableInputFormat.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2010 The Apache Software Foundation
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one
5 | * or more contributor license agreements. See the NOTICE file
6 | * distributed with this work for additional information
7 | * regarding copyright ownership. The ASF licenses this file
8 | * to you under the Apache License, Version 2.0 (the
9 | * "License"); you may not use this file except in compliance
10 | * with the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | */
20 | package com.dappervision.hbase.mapred;
21 |
22 | import java.io.IOException;
23 |
24 | import org.apache.commons.logging.Log;
25 | import org.apache.commons.logging.LogFactory;
26 | import org.apache.hadoop.fs.Path;
27 | import org.apache.hadoop.hbase.HBaseConfiguration;
28 | import org.apache.hadoop.hbase.client.HTable;
29 | import org.apache.hadoop.hbase.util.Bytes;
30 | import org.apache.hadoop.mapred.FileInputFormat;
31 | import org.apache.hadoop.mapred.JobConf;
32 | import org.apache.hadoop.mapred.JobConfigurable;
33 | import org.apache.hadoop.hbase.filter.Filter;
34 | import org.apache.hadoop.util.StringUtils;
35 | import com.dappervision.hbase.mapred.TypedBytesTableInputFormatBase;
36 | import org.apache.hadoop.hbase.filter.RowFilter;
37 | import org.apache.hadoop.hbase.filter.RegexStringComparator;
38 | import org.apache.hadoop.hbase.filter.CompareFilter;
39 | import java.io.UnsupportedEncodingException;
40 | import org.apache.commons.codec.binary.Base64;
41 |
42 | import java.io.IOException;
43 | import java.util.HashMap;
44 | import java.util.HashSet;
45 | import java.util.Map;
46 | import java.util.Set;
47 |
48 | import org.apache.commons.logging.Log;
49 | import org.apache.commons.logging.LogFactory;
50 | import org.apache.hadoop.fs.Path;
51 | import org.apache.hadoop.hbase.HBaseConfiguration;
52 | import org.apache.hadoop.hbase.HConstants;
53 | import org.apache.hadoop.hbase.KeyValue;
54 | import org.apache.hadoop.hbase.UnknownScannerException;
55 | import org.apache.hadoop.hbase.client.HTable;
56 | import org.apache.hadoop.hbase.client.Result;
57 | import org.apache.hadoop.hbase.client.ResultScanner;
58 | import org.apache.hadoop.hbase.client.Scan;
59 | import org.apache.hadoop.hbase.mapred.TableSplit;
60 | import org.apache.hadoop.hbase.regionserver.HRegion;
61 | import org.apache.hadoop.hbase.util.Bytes;
62 | import org.apache.hadoop.mapred.FileInputFormat;
63 | import org.apache.hadoop.mapred.InputFormat;
64 | import org.apache.hadoop.mapred.InputSplit;
65 | import org.apache.hadoop.mapred.JobConf;
66 | import org.apache.hadoop.mapred.JobConfigurable;
67 | import org.apache.hadoop.mapred.RecordReader;
68 | import org.apache.hadoop.mapred.Reporter;
69 | import org.apache.hadoop.record.Buffer;
70 | import org.apache.hadoop.typedbytes.TypedBytesWritable;
71 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
72 | import org.apache.hadoop.util.StringUtils;
73 | import com.dappervision.hbase.mapred.TypedBytesTableRecordReader;
74 | import com.dappervision.hbase.mapred.TypedBytesTableRecordReaderSingleValue;
75 |
76 |
77 |
78 |
79 | /**
80 | * Convert HBase tabular data into a format that is consumable by Map/Reduce.
81 | */
82 | @Deprecated
83 | public class TypedBytesTableInputFormat extends TypedBytesTableInputFormatBase implements
84 | JobConfigurable {
85 | private final Log LOG = LogFactory.getLog(TypedBytesTableInputFormat.class);
86 |
87 |
88 | /**
89 | * space delimited list of columns
90 | */
91 | public static final String COLUMN_LIST = "hbase.mapred.tablecolumnsb64";
92 | public static final String ROW_FILTER_REGEX = "hbase.mapred.rowfilter";
93 | public static final String START_ROW = "hbase.mapred.startrowb64";
94 | public static final String STOP_ROW = "hbase.mapred.stoprowb64";
95 | public static final String VALUE_FORMAT = "hbase.mapred.valueformat";
96 |
97 | private byte [][] inputColumns;
98 | private HTable table;
99 | private TypedBytesTableRecordReader tableRecordReader;
100 |
101 |
102 | /**
103 | * Builds a TableRecordReader. If no TableRecordReader was provided, uses
104 | * the default.
105 | *
106 | * @see org.apache.hadoop.mapred.InputFormat#getRecordReader(InputSplit,
107 | * JobConf, Reporter)
108 | */
109 |
110 | public void configure(JobConf job) {
111 | Path[] tableNames = FileInputFormat.getInputPaths(job);
112 | String colArg = job.get(COLUMN_LIST);
113 | String[] colNames = colArg.split(" ");
114 | byte [][] m_cols = new byte[colNames.length][];
115 | for (int i = 0; i < m_cols.length; i++) {
116 | m_cols[i] = Base64.decodeBase64(Bytes.toBytes(colNames[i]));
117 | }
118 | setInputColumns(m_cols);
119 | if (job.get(ROW_FILTER_REGEX) != null) {
120 | LOG.info("Row Regex Filter[" + job.get(ROW_FILTER_REGEX) + "]");
121 | setRowFilter(new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator(job.get(ROW_FILTER_REGEX))));
122 | }
123 | if (job.get(START_ROW) != null) {
124 | LOG.info("Start Row[" + job.get(START_ROW) + "]");
125 | try {
126 | setStartRow(Base64.decodeBase64(job.get(START_ROW).getBytes("US-ASCII")));
127 | } catch( UnsupportedEncodingException e){
128 | LOG.error("Start Row[" + job.get(START_ROW) + "] - Error");
129 | }
130 | }
131 | if (job.get(STOP_ROW) != null) {
132 | LOG.info("Stop Row[" + job.get(STOP_ROW) + "]");
133 | try {
134 | setStopRow(Base64.decodeBase64(job.get(STOP_ROW).getBytes("US-ASCII")));
135 | } catch( UnsupportedEncodingException e){
136 | LOG.error("Stop Row[" + job.get(STOP_ROW) + "] - Error");
137 | }
138 | }
139 | try {
140 | setHTable(new HTable(HBaseConfiguration.create(job), tableNames[0].getName()));
141 | } catch (Exception e) {
142 | LOG.error(StringUtils.stringifyException(e));
143 | }
144 | if (job.get(VALUE_FORMAT) != null && job.get(VALUE_FORMAT).equalsIgnoreCase("singlevalue")) {
145 | LOG.info("Value Format[" + job.get(VALUE_FORMAT) + "]");
146 | super.setTableRecordReader(new TypedBytesTableRecordReaderSingleValue());
147 | } else {
148 | LOG.info("Value Format[familiescolumns]");
149 | super.setTableRecordReader(new TypedBytesTableRecordReader());
150 | }
151 | }
152 |
153 | public void validateInput(JobConf job) throws IOException {
154 | // expecting exactly one path
155 | Path [] tableNames = FileInputFormat.getInputPaths(job);
156 | if (tableNames == null || tableNames.length > 1) {
157 | throw new IOException("expecting one table name");
158 | }
159 |
160 | // connected to table?
161 | if (getHTable() == null) {
162 | throw new IOException("could not connect to table '" +
163 | tableNames[0].getName() + "'");
164 | }
165 |
166 | // expecting at least one column
167 | String colArg = job.get(COLUMN_LIST);
168 | if (colArg == null || colArg.length() == 0) {
169 | throw new IOException("expecting at least one column");
170 | }
171 | }
172 | }
--------------------------------------------------------------------------------
/hadoopy_hbase/java/src/java/com/dappervision/hbase/mapred/TypedBytesTableRecordReader.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2010 The Apache Software Foundation
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one
5 | * or more contributor license agreements. See the NOTICE file
6 | * distributed with this work for additional information
7 | * regarding copyright ownership. The ASF licenses this file
8 | * to you under the Apache License, Version 2.0 (the
9 | * "License"); you may not use this file except in compliance
10 | * with the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | */
20 | package com.dappervision.hbase.mapred;
21 |
22 | import java.io.IOException;
23 |
24 | import org.apache.hadoop.hbase.client.HTable;
25 | import org.apache.hadoop.hbase.client.Result;
26 | import org.apache.hadoop.hbase.filter.Filter;
27 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
28 | import org.apache.hadoop.io.BytesWritable;
29 | import org.apache.hadoop.mapred.RecordReader;
30 | import org.apache.hadoop.typedbytes.TypedBytesOutput;
31 | import org.apache.hadoop.typedbytes.TypedBytesWritable;
32 | import org.apache.hadoop.hbase.mapred.TableRecordReaderImpl;
33 | import java.io.ByteArrayOutputStream;
34 | import java.io.DataOutputStream;
35 | import org.apache.hadoop.record.Buffer;
36 | import java.util.TreeMap;
37 | import java.util.Map;
38 | import java.util.NavigableMap;
39 |
40 | /**
41 | * Iterate over an HBase table data, return (Text, RowResult) pairs
42 | */
43 | public class TypedBytesTableRecordReader
44 | implements RecordReader<TypedBytesWritable, TypedBytesWritable> {
45 |
46 | protected TableRecordReaderImpl recordReaderImpl = new TableRecordReaderImpl();
47 |
48 | /**
49 | * Restart from survivable exceptions by creating a new scanner.
50 | *
51 | * @param firstRow
52 | * @throws IOException
53 | */
54 | public void restart(byte[] firstRow) throws IOException {
55 | this.recordReaderImpl.restart(firstRow);
56 | }
57 |
58 | /**
59 | * Build the scanner. Not done in constructor to allow for extension.
60 | *
61 | * @throws IOException
62 | */
63 | public void init() throws IOException {
64 | this.recordReaderImpl.init();
65 | }
66 |
67 | /**
68 | * @param htable the {@link HTable} to scan.
69 | */
70 | public void setHTable(HTable htable) {
71 | this.recordReaderImpl.setHTable(htable);
72 | }
73 |
74 | /**
75 | * @param inputColumns the columns to be placed in {@link TypedBytesWritable}.
76 | */
77 | public void setInputColumns(final byte [][] inputColumns) {
78 | this.recordReaderImpl.setInputColumns(inputColumns);
79 | }
80 |
81 | /**
82 | * @param startRow the first row in the split
83 | */
84 | public void setStartRow(final byte [] startRow) {
85 | this.recordReaderImpl.setStartRow(startRow);
86 | }
87 |
88 | /**
89 | *
90 | * @param endRow the last row in the split
91 | */
92 | public void setEndRow(final byte [] endRow) {
93 | this.recordReaderImpl.setEndRow(endRow);
94 | }
95 |
96 | /**
97 | * @param rowFilter the {@link Filter} to be used.
98 | */
99 | public void setRowFilter(Filter rowFilter) {
100 | this.recordReaderImpl.setRowFilter(rowFilter);
101 | }
102 |
103 | public void close() {
104 | this.recordReaderImpl.close();
105 | }
106 |
107 | /**
108 | * @return TypedBytesWritable
109 | *
110 | * @see org.apache.hadoop.mapred.RecordReader#createKey()
111 | */
112 | public TypedBytesWritable createKey() {
113 | //return this.recordReaderImpl.createKey();
114 | return new TypedBytesWritable();
115 | }
116 |
117 | /**
118 | * @return RowTypedBytesWritable
119 | *
120 | * @see org.apache.hadoop.mapred.RecordReader#createValue()
121 | */
122 | public TypedBytesWritable createValue() {
123 | //return this.recordReaderImpl.createValue();
124 | return new TypedBytesWritable();
125 | }
126 |
127 | public long getPos() {
128 |
129 | // This should be the ordinal tuple in the range;
130 | // not clear how to calculate...
131 | return this.recordReaderImpl.getPos();
132 | }
133 |
134 | public float getProgress() {
135 | // Depends on the total number of tuples and getPos
136 | return this.recordReaderImpl.getPos();
137 | }
138 |
139 | /**
140 | * @param key HStoreKey as input key.
141 | * @param value MapWritable as input value
142 | * @return true if there was more data
143 | * @throws IOException
144 | */
145 | public boolean next(TypedBytesWritable key, TypedBytesWritable value)
146 | throws IOException {
147 | ImmutableBytesWritable key0 = new ImmutableBytesWritable();
148 | Result value0 = new Result();
149 | boolean out = this.recordReaderImpl.next(key0, value0);
150 | if (out) {
151 | TreeMap tm = new TreeMap();
152 | for (Map.Entry<byte[], NavigableMap<byte[], byte[]>> entry : value0.getNoVersionMap().entrySet()) {
153 | TreeMap tm_inner = new TreeMap();
154 | for (Map.Entry<byte[], byte[]> entry0 : entry.getValue().entrySet()) {
155 | tm_inner.put(new Buffer(entry0.getKey()), new Buffer(entry0.getValue()));
156 | }
157 | tm.put(new Buffer(entry.getKey()), tm_inner);
158 | }
159 | key.setValue(new Buffer(key0.get()));
160 | value.setValue(tm);
161 | }
162 | return out;
163 |
164 | }
165 | }
--------------------------------------------------------------------------------
/hadoopy_hbase/java/src/java/com/dappervision/hbase/mapred/TypedBytesTableRecordReaderSingleValue.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright 2010 The Apache Software Foundation
3 | *
4 | * Licensed to the Apache Software Foundation (ASF) under one
5 | * or more contributor license agreements. See the NOTICE file
6 | * distributed with this work for additional information
7 | * regarding copyright ownership. The ASF licenses this file
8 | * to you under the Apache License, Version 2.0 (the
9 | * "License"); you may not use this file except in compliance
10 | * with the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | */
20 | package com.dappervision.hbase.mapred;
21 |
22 | import java.io.IOException;
23 |
24 | import org.apache.hadoop.hbase.client.HTable;
25 | import org.apache.hadoop.hbase.client.Result;
26 | import org.apache.hadoop.hbase.filter.Filter;
27 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
28 | import org.apache.hadoop.io.BytesWritable;
29 | import org.apache.hadoop.mapred.RecordReader;
30 | import org.apache.hadoop.typedbytes.TypedBytesOutput;
31 | import org.apache.hadoop.typedbytes.TypedBytesWritable;
32 | import org.apache.hadoop.hbase.mapred.TableRecordReaderImpl;
33 | import java.io.ByteArrayOutputStream;
34 | import java.io.DataOutputStream;
35 | import org.apache.hadoop.record.Buffer;
36 |
37 | public class TypedBytesTableRecordReaderSingleValue extends TypedBytesTableRecordReader {
38 | /**
39 | * @param key HStoreKey as input key.
40 | * @param value MapWritable as input value
41 | * @return true if there was more data
42 | * @throws IOException
43 | */
44 | public boolean next(TypedBytesWritable key, TypedBytesWritable value)
45 | throws IOException {
46 | ImmutableBytesWritable key0 = new ImmutableBytesWritable();
47 | Result value0 = new Result();
48 | boolean out = this.recordReaderImpl.next(key0, value0);
49 | if (out) {
50 | byte [] value_byte = value0.value();
51 | if (value_byte == null) {
52 | throw new IOException("SingleValue requires at least one column to be present for each row, this should not be possible!");
53 | }
54 | key.setValue(new Buffer(key0.get()));
55 | value.setValue(new Buffer(value_byte));
56 | }
57 | return out;
58 |
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/hadoopy_hbase/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 |
3 | setup(name='hadoopy_hbase',
4 | version='.01',
5 | packages=['hadoopy_hbase', 'hadoopy_hbase.hbase'])
6 |
--------------------------------------------------------------------------------
/hadoopy_hbase/tests/auth.py:
--------------------------------------------------------------------------------
1 | import bottle
2 | import base64
3 | import random
4 | import os
5 |
6 |
7 | AUTH_KEY = os.environ.get('AUTH_KEY')
8 |
9 |
10 | def _make_key(l=16):
11 | global AUTH_KEY
12 | s = hex(random.getrandbits(8 * l))[2:]
13 | if s[-1] == 'L':
14 | s = s[:-1]
15 | # Pad with zeros
16 | if len(s) != l * 2:
17 | s = '0' * (2 * l - len(s)) + s
18 | AUTH_KEY = base64.urlsafe_b64encode(s.decode('hex')).rstrip('=')
19 |
20 |
21 | def verify(func):
22 |
23 | def inner(*args, **kw):
24 | if not bottle.request.path.startswith('/%s/' % AUTH_KEY):
25 | bottle.abort(401)
26 | return func(*args, **kw)
27 | if AUTH_KEY is None:
28 | _make_key()
29 | print('AUTH_KEY: /%s/' % AUTH_KEY)
30 | return inner
31 |
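32 | # Usage sketch (mirroring tests/server.py): decorate a bottle route handler
33 | # whose path embeds the generated key, so requests without the key get a 401.
34 | #
35 | # @bottle.route('/:auth_key#[a-zA-Z0-9\_\-]+#/')
36 | # @verify
37 | # def main(auth_key):
38 | #     return 'ok'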
--------------------------------------------------------------------------------
/hadoopy_hbase/tests/flickr_count.py:
--------------------------------------------------------------------------------
1 | import hadoopy_hbase
2 | import time
3 |
4 | c = hadoopy_hbase.connect('localhost')
5 | cnt = 0
6 | st = time.time()
7 | N = 5000
8 | for x in hadoopy_hbase.scanner(c, 'flickr', per_call=N, columns=['metadata:license']):
9 | cnt += 1
10 | if cnt % N == 0:
11 | print(((time.time() - st) / N, cnt))
12 | st = time.time()
13 |
--------------------------------------------------------------------------------
/hadoopy_hbase/tests/flickr_count_hadoop.py:
--------------------------------------------------------------------------------
1 | import hadoopy
2 | import hadoopy_hbase
3 | import time
4 | import logging
5 | logging.basicConfig(level=logging.DEBUG)
6 |
7 | st = time.time()
8 |
9 | # NOTE(brandyn): If launch fails, you may need to use launch_frozen; see hadoopy.com for details (a sketch of that call is at the end of this file)
10 |
11 | out = 'out-%f/0' % st
12 | jobconfs = ['mapred.map.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec',
13 | 'mapred.compress.map.output=true',
14 | 'mapred.output.compression.type=BLOCK']
15 | hadoopy_hbase.launch('flickr', out, 'identity_hbase_job.py', libjars=['hadoopy_hbase.jar'],
16 | num_mappers=8, columns=['metadata:'], jobconfs=jobconfs)
17 | #results = dict(hadoopy.readtb(out))
18 | #print(results)
19 |
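20 | # Hedged sketch of the launch_frozen fallback mentioned in the NOTE above; it
21 | # assumes hadoopy_hbase exposes launch_frozen with the same signature as launch
22 | # (check hadoopy.com and the hadoopy_hbase source before relying on this):
23 | #
24 | # hadoopy_hbase.launch_frozen('flickr', out, 'identity_hbase_job.py',
25 | #                             libjars=['hadoopy_hbase.jar'], num_mappers=8,
26 | #                             columns=['metadata:'], jobconfs=jobconfs)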
--------------------------------------------------------------------------------
/hadoopy_hbase/tests/flickr_count_job.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import hadoopy
3 |
4 |
5 | def mapper(row, column_families):
6 | yield 'num_rows', 1
7 |
8 | def reducer(key, values):
9 | yield key, sum(values)
10 |
11 | if __name__ == '__main__':
12 | hadoopy.run(mapper, reducer)
13 |
--------------------------------------------------------------------------------
/hadoopy_hbase/tests/flickr_crawl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import gevent.monkey
3 | gevent.monkey.patch_all()
4 | import hadoopy_hbase
5 | from hadoopy_hbase import BatchMutation, Mutation, ColumnDescriptor
6 | from thrift_bench import random_string, remove_table
7 | import vision_data
8 | import random
9 | #import multiprocessing
10 | import time
11 |
12 | def main():
13 | #tags = ' animals architecture art asia australia autumn baby band barcelona beach berlin bike bird birds birthday black blackandwhite blue bw california canada canon car cat chicago china christmas church city clouds color concert dance day de dog england europe fall family fashion festival film florida flower flowers food football france friends fun garden geotagged germany girl graffiti green halloween hawaii holiday house india instagramapp iphone iphoneography island italia italy japan kids la lake landscape light live london love macro me mexico model museum music nature new newyork newyorkcity night nikon nyc ocean old paris park party people photo photography photos portrait raw red river rock san sanfrancisco scotland sea seattle show sky snow spain spring square squareformat street summer sun sunset taiwan texas thailand tokyo travel tree trees trip uk unitedstates urban usa vacation vintage washington water wedding white winter woman yellow zoo '.strip().split()
14 | tags = ['Pyramids Of Giza', 'Great Wall Of China', 'Terracotta Warriors', 'Statue Of Liberty', 'Edinburgh Castle', 'Stirling Castle', 'Empire State Building', 'Stonehenge', 'Blackpool Tower', 'London Bridge', 'Tower Bridge', 'Buckinghampalace', 'Sphinx', 'Eiffle Tower', 'Arc Du Triomph', 'Louvre', 'Cristo Redentor', 'CN Tower', 'Norte Dame', 'River Nile', 'Mount Rushmore', 'Pentagon', 'White House', 'Lincoln Memorial', 'Grand Canyon', 'Leaning Tower Of Piza', 'Easter Island Heads', 'Niagara Falls', 'Abbey Road', 'Ayers Rock', 'Evangeline Oak', 'Lone Cyprus', 'Golden Gate Bridge', 'Colosseum', 'Taj Mahal', 'Santorini']
15 | client = hadoopy_hbase.connect('localhost')
16 | random.shuffle(tags)
17 | flickr = vision_data.Flickr(max_iters=1)
18 | #remove_table(client, 'flickr')
19 | #client.createTable('flickr', [ColumnDescriptor('metadata:'), ColumnDescriptor('images:')])
20 | while True:
21 | for tag in tags:
22 | mutations = []
23 | try:
24 | for url_m, metadata in flickr.image_class_meta_url(tag):
25 | mutations.append(BatchMutation(row=url_m, mutations=[Mutation(column='metadata:%s' % x, value=y.encode('utf-8'))
26 | for x, y in metadata.items()]))
27 | except Exception, e:
28 | print(e)
29 | continue
30 | st = time.time()
31 | client.mutateRows('flickr', mutations)
32 | if mutations:
33 | print((tag, (time.time() - st) / len(mutations), len(mutations)))
34 | else:
35 | print((tag, 0., len(mutations)))
36 |
37 |
38 | def display():
39 | client = hadoopy_hbase.connect('localhost')
40 | for x in hadoopy_hbase.scanner(client, 'flickr', ['metadata:title']):
41 | print(x)
42 |
43 | if __name__ == '__main__':
44 | gevent.joinall([gevent.spawn(main) for x in range(30)])
45 |
--------------------------------------------------------------------------------
/hadoopy_hbase/tests/hbase_test.py:
--------------------------------------------------------------------------------
1 | import hadoopy
2 | import hadoopy_hbase
3 | import time
4 | import logging
5 | logging.basicConfig(level=logging.DEBUG)
6 |
7 | st = time.time()
8 |
9 | # NOTE(brandyn): If launch fails, you may need to use launch_frozen; see hadoopy.com for details
10 | #,
11 | #
12 | out = 'out-%f/3' % st
13 | hadoopy_hbase.launch('testtable', out, 'hbase_test_job.py', columns=['colfam1:'], libjars=['hadoopy_hbase.jar'], start_row='5', stop_row='52')
14 | results = hadoopy.readtb(out)
15 | print list(results)[:10]
16 |
17 | out = 'out-%f/1' % st
18 | hadoopy_hbase.launch('testtable', out, 'hbase_test_job.py', columns=['colfam1:'], libjars=['hadoopy_hbase.jar'], jobconfs={'hbase.mapred.rowfilter': '.*3'})
19 | results = hadoopy.readtb(out)
20 | print list(results)[:10]
21 |
22 | out = 'out-%f/0' % st
23 | hadoopy_hbase.launch('testtable', out, 'hbase_test_job.py', columns=['colfam1:'], libjars=['hadoopy_hbase.jar'])
24 | results = hadoopy.readtb(out)
25 | print list(results)[:10]
26 |
27 | out = 'out-%f/2' % st
28 | hadoopy_hbase.launch('testtable', out, 'hbase_test_job2.py', columns=['colfam1:'], libjars=['hadoopy_hbase.jar'])
29 | results = hadoopy.readtb(out)
30 | print list(results)[:10]
31 |
--------------------------------------------------------------------------------
/hadoopy_hbase/tests/hbase_test_job.py:
--------------------------------------------------------------------------------
1 | import hadoopy
2 |
3 | def mapper(k, v):
4 | #yield 'KEY[%s]' % k, 'VALUE[%s]' % v
5 | yield k, v
6 |
7 |
8 | if __name__ == '__main__':
9 | hadoopy.run(mapper)
10 |
--------------------------------------------------------------------------------
/hadoopy_hbase/tests/hbase_test_job2.py:
--------------------------------------------------------------------------------
1 | import hadoopy
2 |
3 |
4 | def mapper(row, column_families):
5 | for column_fam, columns in column_families.items():
6 | for column, data in columns.items():
7 | yield row, (column_fam, column, data)
8 |
9 | if __name__ == '__main__':
10 | hadoopy.run(mapper)
11 |
--------------------------------------------------------------------------------
/hadoopy_hbase/tests/server.py:
--------------------------------------------------------------------------------
1 | from gevent import monkey
2 | monkey.patch_all()
3 | import bottle
4 | import os
5 | import argparse
6 | import random
7 | import base64
8 | from auth import verify
9 | from flickr_crawl import setup, scanner
10 | import itertools
11 | import time
12 |
13 | START_ROW = ''
14 |
15 | @bottle.route('/:auth_key#[a-zA-Z0-9\_\-]+#/')
16 | @verify
17 | def main(auth_key):
18 | global START_ROW
19 | st = time.time()
20 | x = ''
21 | images = ['<img src="%s">' % y['metadata:url_s'] for x, y in itertools.islice(scanner(client, 'flickr', ['metadata:url_s'], per_call=100, start_row=START_ROW), 100)]
22 | START_ROW = x
23 | run_time = time.time() - st
24 | return ('%d-%f ' % (len(images), run_time)) + ' '.join(images)
25 |
26 |
27 |
28 | if __name__ == "__main__":
29 | parser = argparse.ArgumentParser(description="Serve a directory")
30 |
31 | # Server port
32 | parser.add_argument('--port', type=str, help='bottle.run webpy on this port',
33 | default='8080')
34 | ARGS = parser.parse_args()
35 | client = setup()
36 | bottle.run(host='0.0.0.0', port=ARGS.port, server='gevent')
37 |
--------------------------------------------------------------------------------
/hadoopy_hbase/tests/thrift_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from thrift.transport.TSocket import TSocket
3 | from thrift.transport.TTransport import TBufferedTransport
4 | from thrift.protocol import TBinaryProtocol
5 | from thrift_bench import random_string, remove_table
6 | import hadoopy_hbase
7 |
8 | client = hadoopy_hbase.connect('localhost')
9 | remove_table(client, 'testtable')
10 | client.createTable('testtable', [hadoopy_hbase.ColumnDescriptor('colfam1:')])
11 |
12 | for x in xrange(100):
13 | client.mutateRow('testtable', str(x), [hadoopy_hbase.Mutation(column='colfam1:col%d' % y, value=random_string(5)) for y in range(10)])
14 | print(client.getRow('testtable', '0'))
15 |
--------------------------------------------------------------------------------
/pkg/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: rmr2
2 | Type: Package
3 | Title: R and Hadoop Streaming Connector
4 | Version: 3.3.1
5 | Date: 2015-2-10
6 | Author: Revolution Analytics
7 | Depends: R (>= 2.6.0), methods
8 | Imports: Rcpp, RJSONIO (>= 0.8-2), digest, functional, reshape2, stringr, plyr, caTools (>= 1.16)
9 | Suggests: quickcheck (>= 3.1.0), ravro, rhdfs, testthat
10 | Collate: basic.R extras.R hdfs.R keyval.R IO.R local.R mapreduce.R parse-url.R quickcheck-rmr.R streaming.R
11 | Maintainer: Revolution Analytics
12 | Description: Supports the map reduce programming model on top of hadoop streaming
13 | License: Apache License (== 2.0)
14 | ByteCompile: TRUE
15 | BuildVignettes: FALSE
16 |
--------------------------------------------------------------------------------
/pkg/NAMESPACE:
--------------------------------------------------------------------------------
1 | useDynLib(rmr2)
2 | export(mapreduce)
3 | export(from.dfs, to.dfs)
4 | export(equijoin)
5 | export(scatter, gather, rmr.sample)
6 | export(dfs.empty, dfs.size, dfs.exists, dfs.rmr, dfs.mv, dfs.ls)
7 | export(rmr.options)
8 | export(keyval, keys, values, c.keyval)
9 | export(make.input.format, make.output.format)
10 | export(to.map, to.reduce)
11 | export(rmr.str)
12 | export(status, increment.counter)
13 | export(vsum)
14 |
15 | S3method(gorder, default)
16 | S3method(gorder, factor)
17 | S3method(gorder, data.frame)
18 | S3method(gorder, matrix)
19 | S3method(gorder, raw)
20 |
21 | S3method(deraw, data.frame)
22 | S3method(deraw, matrix)
23 | S3method(deraw, raw)
24 | S3method(deraw, default)
25 |
26 | importFrom(functional, Curry)
27 | importFrom(plyr, splat)
28 | importFrom(plyr, quickdf)
29 | import(Rcpp)
30 | importFrom(stringr, str_detect)
31 | importFrom(stringr, str_match)
32 | importFrom(stringr, str_replace)
33 | importFrom(stringr, str_split)
34 | importFrom(digest, digest)
35 | importFrom(reshape2, dcast)
36 | importFrom(caTools, base64encode)
37 | importFrom(RJSONIO, fromJSON)
38 | importFrom(RJSONIO, toJSON)
39 | import(methods)
40 |
--------------------------------------------------------------------------------
/pkg/R/basic.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | #string
16 |
17 | qw = function(...) as.character(match.call())[-1]
18 |
19 | #assignment
20 |
21 | default =
22 | function(x, value, bad.value = is.null) {
23 | test = if(is.function(bad.value)) bad.value(x) else identical(bad.value, x)
24 | if(test) value else x}
25 |
26 | #functional
27 |
28 | Make.single.arg =
29 | function(f)
30 | function(x) do.call(f, x)
31 |
32 | Make.multi.arg =
33 | function(f)
34 | function(...) f(list(...))
35 |
36 | Make.single.or.multi.arg = function(f, from = c("single", "multi")) {
37 | from = match.arg(from)
38 | if (from == "single") {
39 | f.single = f
40 | f.multi = Make.multi.arg(f)}
41 | else {
42 | f.single = Make.single.arg(f)
43 | f.multi = f}
44 | function(...) {
45 | args = list(...)
46 | if(length(args) == 1)
47 | f.single(args[[1]])
48 | else
49 | f.multi(...)}}
50 |
51 | all.predicate = function(x, P) all(sapply(x, P))
52 |
53 | #data structures
54 |
55 | make.fast.list = function(l = list()) {
56 | l1 = l
57 | l2 = list(NULL)
58 | i = 1
59 | function(els = NULL){
60 | if(missing(els)) c(l1, l2[!sapply(l2, is.null)])
61 | else{
62 | if(i + length(els) - 1 > length(l2)) {
63 | l1 <<- c(l1, l2[!sapply(l2, is.null)])
64 | i <<- 1
65 | l2 <<- rep(list(NULL), length(l1) + length(els))}
66 | l2[i:(i + length(els) - 1)] <<- els
67 | i <<- i + length(els)}}}
68 |
69 | named.slice = function(x, n) x[which(names(x) == n)]
70 |
71 | mapply.list =
72 | function(...) mapply(FUN = list, ..., SIMPLIFY = FALSE)
73 |
74 | t.list =
75 | function(l) {
76 | if(length(l) == 0) l
77 | else
78 | .Call(
79 | "t_list",
80 | if(!all(sapply.is.list(l)))
81 | lapply(l, as.list)
82 | else l,
83 | PACKAGE = "rmr2")}
84 |
85 | #data frame manip
86 |
87 | sane.c =
88 | function(...) {
89 | if(all(are.factor(list(...))))
90 | unlist(list(...))
91 | else
92 | c(...)}
93 |
94 | rbind.fill.fast =
95 | function(...) {
96 | xx = list(...)
97 | cols = unique(unlist(lapply(xx, names)))
98 | ll =
99 | lapply(
100 | cols,
101 | function(n)
102 | do.call(
103 | sane.c,
104 | lapply(
105 | xx,
106 | function(x){
107 | if(is.null(x[[n]]))
108 | rep(NA, nrow(x))
109 | else
110 | x[[n]]})))
111 | names(ll) = cols
112 | do.call(
113 | data.frame,
114 | c(
115 | lapply(
116 | ll,
117 | function(x)
118 | if (is.atomic(x)) x
119 | else I(x)),
120 | stringsAsFactors = FALSE))}
121 |
122 |
123 |
124 | every.second =
125 | function(pattern)
126 | function(x) {
127 | opt = options("warn")[[1]]
128 | options(warn = -1)
129 | y = x[pattern]
130 | options(warn = opt)
131 | y}
132 |
133 | odd = every.second(c(TRUE, FALSE))
134 | even = every.second(c(FALSE, TRUE))
135 |
136 | interleave =
137 | function(l1, l2) {
138 | l = list()
139 | l[2*(1:length(l1)) - 1] = l1
140 | l[2*(1:length(l1))] = l2
141 | l}
142 |
143 | #con
144 |
--------------------------------------------------------------------------------
/pkg/R/extras.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ## push a file through this to get as many partitions as possible (depending on system settings)
16 | ## data is unchanged
17 |
18 | scatter = function(input, output = NULL, ...)
19 | mapreduce(input,
20 | output,
21 | map = function(k, v) keyval(runif(1), v),
22 | reduce = function(k, vv) vv,
23 | ...)
24 |
25 | gather = function(input, output = NULL, ...) {
26 | backend.parameters = list(...)['backend.parameters']
27 | backend.parameters$hadoop = append(backend.parameters$hadoop, list(D='mapred.reduce.tasks=1'))
28 | mapreduce(input,
29 | output,
30 | backend.parameters = backend.parameters,
31 | ...)}
32 |
33 | #sampling
34 |
35 | rmr.sample = function(input, output = NULL, method = c("any", "Bernoulli"), ...) {
36 | method = match.arg(method)
37 | if (method == "any") {
38 | n = list(...)[['n']]
39 | some = function(k, v)
40 | keyval(
41 | if(is.null(k))
42 | list(NULL)
43 | else
44 | rmr.slice(k, 1:min(n, rmr.length(k))),
45 | rmr.slice(v, 1:min(n, rmr.length(v))))
46 | mapreduce(input,
47 | output,
48 | map = some,
49 | combine = TRUE,
50 | reduce = some)}
51 | else
52 | if(method == "Bernoulli"){
53 | p = list(...)[['p']]
54 | mapreduce(input,
55 | output,
56 | map = function(k, v) {
57 | filter = rbinom(rmr.length(v), 1, p) == 1
58 | keyval(rmr.slice(k, filter),
59 | rmr.slice(v, filter))})}}
60 |
61 | ## map and reduce generators
62 |
63 | partitioned.map =
64 | function(map, n)
65 | function(k,v) {
66 | kv = map(k,v)
67 | keyval(
68 | data.frame(
69 | sample(
70 | 1:n, size=length(k),
71 | replace = TRUE), k),
72 | v)}
73 |
74 | partitioned.combine =
75 | function(reduce)
76 | function(k,vv) {
77 | kv = reduce(k,vv)
78 | keyval(k[,-1], vv)}
79 |
80 | ## fast aggregate functions
81 |
82 | vsum =
83 | function(x) {
84 | if(is.list(x))
85 | .Call("vsum", x, PACKAGE = "rmr2")
86 | else
87 | stop(paste("can't vsum a ", class(x)))}
88 |
89 | ## dev support
90 |
91 | reload =
92 | function() {
93 | detach("package:rmr2", unload = TRUE)
94 | library.dynam.unload("rmr2",system.file(package="rmr2"))
95 | library(rmr2)}
96 |
97 | rmr.str =
98 | function(x, ...) {
99 | sc = sys.calls()
100 | message(
101 | paste(
102 | c(
103 | capture.output(
104 | str(sc)),
105 | match.call() [[2]],
106 | capture.output(str(x, ...))),
107 | collapse="\n"))
108 | x}
109 |
--------------------------------------------------------------------------------
/pkg/R/hdfs.R:
--------------------------------------------------------------------------------
1 | hdfs.ls =
2 | function(fname)
3 | read.table(
4 | textConnection(hdfs("ls", fname, intern = TRUE)),
5 | skip=1,
6 | col.names=c("permissions", "links", "owner", "group", "size", "date", "time", "path"),
7 | stringsAsFactors = FALSE)
8 | hdfs.exists =
9 | function(fname)
10 | hdfs("test -e", fname, test = TRUE)
11 | test.rmr =
12 | function() {
13 | length(
14 | suppressWarnings(
15 | hdfs("- 2>&1 | grep rmr", intern = TRUE))) > 0}
16 |
17 | hdfs.rmr =
18 | (function() {
19 | rmr = NULL
20 | function(fname) {
21 | if(is.null(rmr))
22 | rmr <<- test.rmr()
23 | if(rmr)
24 | hdfs("rmr", fname)
25 | else
26 | hdfs("rm -r", fname)}})()
27 | hdfs.isdir =
28 | function(fname) {
29 | if(.Platform$OS.type == "windows")
30 | length(grep(pattern = "^Found", hdfs("ls", fname, intern = TRUE))) == 1
31 | else
32 | hdfs("test -d", fname, test = TRUE)}
33 | hdfs.mv =
34 | function(src, dst)
35 | hdfs("mv", src, dst)
36 | hdfs.mkdir =
37 | function(fname)
38 | hdfs("mkdir", fname)
39 | hdfs.put =
40 | function(src, dst)
41 | hdfs("put", src, dst)
42 | hdfs.get =
43 | function(src, dst)
44 | hdfs("get", src, dst)
45 |
46 | hdfs =
47 | function(cmd, ..., intern = FALSE, test = FALSE) {
48 | retval =
49 | system(
50 | paste(
51 | hdfs.cmd(),
52 | "dfs",
53 | paste("-", cmd, sep = ""),
54 | paste(
55 | sapply(
56 | list(...),
57 | rmr.normalize.path),
58 | collapse=" ")),
59 | intern = intern)
60 | if(intern)
61 | retval
62 | else{
63 | if(test)
64 | retval == 0
65 | else {
66 | stopifnot(retval == 0)
67 | NULL }}}
--------------------------------------------------------------------------------
/pkg/R/local.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | mr.local = function(
16 | in.folder,
17 | out.folder,
18 | map,
19 | reduce,
20 | vectorized.reduce,
21 | combine,
22 | in.memory.combine,
23 | input.format,
24 | output.format,
25 | backend.parameters,
26 | verbose) {
27 |
28 | profile.nodes = rmr.options("profile.nodes")
29 | get.data =
30 | function(fname) {
31 | environment(input.format$format) =
32 | list2env(as.list(environment(input.format$format)))
33 | kv = from.dfs(fname, format = input.format)
34 | kv}
35 | map.out =
36 | c.keyval(
37 | do.call(
38 | c,
39 | lapply(
40 | in.folder,
41 | function(fname) {
42 | kv = get.data(fname)
43 | Sys.setenv(mapreduce_map_input_file = fname)
44 | lkv = length.keyval(kv)
45 | unname(
46 | tapply(
47 | 1:lkv,
48 | ceiling((1:lkv)/(lkv/(object.size(kv)/10^6))), #make this constant configurable?
49 | function(r) {
50 | kvr = slice.keyval(kv, r)
51 | as.keyval(map(keys(kvr), values(kvr)))},
52 | simplify = FALSE))})))
53 | map.out = from.dfs(to.dfs(map.out))
54 | reduce.helper =
55 | function(kk, vv) as.keyval(reduce(rmr.slice(kk, 1), vv))
56 | reduce.out = {
57 | if(!is.null(reduce)){
58 | if(!vectorized.reduce){
59 | c.keyval(
60 | reduce.keyval(
61 | map.out,
62 | reduce.helper))}
63 | else{
64 | as.keyval(
65 | reduce(
66 | keys(map.out),
67 | values(map.out)))}}
68 | else
69 | map.out}
70 | to.dfs(reduce.out, out.folder, format = output.format)
71 | NULL}
72 |
--------------------------------------------------------------------------------
/pkg/R/parse-url.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | # Factored out from the httr package https://github.com/hadley/httr
3 | # Originally under the MIT license
4 | # Original author Hadley Wickham
5 | # No Copyright information found in the original.
6 |
7 | # Licensed under the Apache License, Version 2.0 (the "License");
8 | # you may not use this file except in compliance with the License.
9 | # You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 |
19 |
20 | parse_url <- function(url) {
21 |
22 | url <- as.character(url)
23 | stopifnot(length(url) == 1)
24 |
25 | pull_off <- function(pattern) {
26 | if (!str_detect(url, pattern)) return(NULL)
27 |
28 | piece <- str_match(url, pattern)[, 2]
29 | url <<- str_replace(url, pattern, "")
30 |
31 | piece
32 | }
33 |
34 | fragment <- pull_off("#(.*)$")
35 | scheme <- pull_off("^([[:alpha:]+.-]+):")
36 | netloc <- pull_off("^//([^/]*)/?")
37 |
38 | if (!is.null(netloc)) {
39 |
40 | pieces <- str_split(netloc, "@")[[1]]
41 | if (length(pieces) == 1) {
42 | username <- NULL
43 | password <- NULL
44 |
45 | host <- pieces
46 | } else {
47 | user_pass <- str_split(pieces[[1]], ":")[[1]]
48 | username <- user_pass[1]
49 | password <- user_pass[2]
50 |
51 | host <- pieces[2]
52 | }
53 |
54 | host_pieces <- str_split(host, ":")[[1]]
55 | hostname <- host_pieces[1]
56 | port <- if (length(host_pieces) > 1) host_pieces[2]
57 | } else {
58 | port <- username <- password <- hostname <- NULL
59 | }
60 |
61 | structure(list(
62 | scheme = scheme, hostname = hostname, port = port, path = url,
63 | username = username, password = password),
64 | class = "url")
65 | }
66 |
--------------------------------------------------------------------------------
/pkg/R/quickcheck-rmr.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ##app-specific generators
16 | if(require(quickcheck)){
17 |
18 | curry.size =
19 | function(gen, size) {
20 | force(gen)
21 | Curry(gen, size = size)}
22 |
23 | curry.nrow =
24 | function(gen, nrow) {
25 | force(gen)
26 | Curry(gen, nrow = nrow, ncol = c(min = 1))}
27 |
28 | rrmr.data =
29 | function(size = c(min = 0, max = quickcheck::default(vector.size %||% 5 * severity)))
30 | quickcheck::mixture(
31 | generators =
32 | c(
33 | lapply(
34 | list(
35 | quickcheck::rlogical,
36 | quickcheck::rinteger,
37 | quickcheck::rdouble,
38 | quickcheck::rcharacter,
39 | quickcheck::rraw,
40 | quickcheck::rfactor,
41 | quickcheck::rlist),
42 | curry.size, size = size),
43 | lapply(
44 | list(
45 | quickcheck::rmatrix,
46 | quickcheck::rdata.frame),
47 | curry.nrow, nrow = size)))()
48 |
49 | rdata.frame.simple =
50 | function(
51 | nrow = c(min = 1, max = quickcheck::default(data.frame.nrow %||% 5 * severity)),
52 | ncol = c(min = 1, max = quickcheck::default(data.frame.ncol %||% severity)))
53 | rdata.frame(
54 | generator =
55 | mixture(
56 | generators =
57 | list(
58 | quickcheck::rlogical,
59 | quickcheck::rinteger,
60 | quickcheck::rdouble,
61 | quickcheck::rcharacter)),
62 | nrow = nrow,
63 | ncol = ncol)
64 |
65 | rkeyval =
66 | function(k = rrmr.data(size = c(min = 1)), v = rrmr.data(size = c(min = 1)))
67 | keyval(k, v)
68 |
69 | rkeyvalsimple = function() keyval(runif(1), runif(1)) #we can do better than this
70 |
71 | ## generic sorting for normalized comparisons
72 | gorder = function(...) UseMethod("gorder")
73 | gorder.default = order
74 | gorder.factor = function(x) order(as.character(x))
75 | gorder.data.frame =
76 | function(x) splat(gorder)(lapply(x, function(x) if(is.factor(x)) as.character(x) else if(is.list(x) || is.raw(x)) sapply(x, digest) else x))
77 | gorder.matrix = function(x) gorder(as.data.frame(x))
78 | gorder.raw = gorder.list = function(x) gorder(sapply(x, digest))
79 |
80 | reorder = function(x, o) if(has.rows(x)) x[o, , drop = FALSE] else x[o]
81 |
82 | gsort = function(x) reorder(x, gorder(x))
83 |
84 | gsort.keyval =
85 | function(kv) {
86 | k = keys(kv)
87 | v = values(kv)
88 | o = {
89 | if(is.null(k)) gorder(v)
90 | else
91 | gorder(
92 | data.frame(
93 | if(is.list(k) && !is.data.frame(k)) sapply(k, digest) else k,
94 | if(is.list(v) && !is.data.frame(v)) sapply(v, digest) else v))}
95 | keyval(reorder(k, o), reorder(v, o))}
96 |
97 | ## keyval compare
98 | kv.cmp = function(kv1, kv2)
99 | isTRUE(all.equal(gsort.keyval(kv1), gsort.keyval(kv2), tolerance=1e-4, check.attributes=FALSE))
100 |
101 | }
--------------------------------------------------------------------------------
/pkg/examples/airline.R:
--------------------------------------------------------------------------------
1 | library(rmr2)
2 | from.dfs(
3 | mapreduce(
4 | input = '../RHadoop.data/airline.1000',
5 | input.format = make.input.format("csv", sep = ","),
6 | map = function(., data) {
7 | # filter out non-numeric values (header and NA)
8 | filter = !is.na(data[,16])
9 | data = data[filter,]
10 | # emit composite key (airline|year|month) and delay
11 | keyval(
12 | data[,c(9,1,2)],
13 | data[,16, drop = FALSE])},
14 | reduce = function(k,delays) {
15 | keyval(k, mean(delays[,1]))}))
--------------------------------------------------------------------------------
/pkg/examples/avro.R:
--------------------------------------------------------------------------------
1 | # known limitations: these formats work only with mapreduce, not with from.dfs or to.dfs, nor do they work on the local backend
2 | # as a workaround, use a simple conversion job
3 | # from.dfs(mapreduce(some.input, input.format = avroIF)) or mapreduce(to.dfs(some.data), output.format = avroOF)
4 | # avroOF uses a fixed schema "bytes" containing the JSON representation of the data.
5 |
6 | avro.jar = "/Users/antonio/Downloads/avro-mapred-1.7.4-hadoop1.jar"
7 |
8 | paste.fromJSON =
9 | function(...)
10 | fromJSON(paste("[", paste(..., sep = ","), "]"))
11 |
12 | mapply.fromJSON =
13 | function(...)
14 | mapply(paste.fromJSON, ..., SIMPLIFY = FALSE)
15 |
16 | avro.input.format =
17 | function(con) {
18 | lines = readLines(con = con, n = 1000)
19 | if (length(lines) == 0) NULL
20 | else
21 | do.call(
22 | keyval,
23 | unname(
24 | do.call(
25 | mapply.fromJSON,
26 | strsplit(
27 | lines,
28 | "\t"))))}
29 |
30 | avroIF =
31 | make.input.format(
32 | format = avro.input.format,
33 | mode = "text",
34 | streaming.format = "org.apache.avro.mapred.AvroAsTextInputFormat",
35 | backend.parameters =
36 | list(
37 | hadoop =
38 | list(
39 | libjars = avro.jar)))
40 |
41 |
42 | avro.output.format =
43 | function(kv, con)
44 | writeLines(
45 | unlist(
46 | rmr2:::reduce.keyval(
47 | kv,
48 | function(k, v)
49 | paste(
50 | toJSON(k, .escapeEscapes = TRUE),
51 | toJSON(v, .escapeEscapes = TRUE),
52 | sep = "\t"))),
53 | con = con)
54 |
55 | avroOF =
56 | make.output.format(
57 | format = avro.output.format,
58 | mode = "text",
59 | streaming.format = "org.apache.avro.mapred.AvroTextOutputFormat",
60 | backend.parameters =
61 | list(
62 | hadoop =
63 | list(
64 | libjars = avro.jar)))
65 |
66 |
67 | avro.test =
68 | mapreduce(
69 | to.dfs(keyval(1:2, 1:10)),
70 | output.format = avroOF)
71 |
72 | from.dfs(
73 | mapreduce(
74 | avro.test,
75 | input.format = avroIF))
--------------------------------------------------------------------------------
/pkg/examples/cluster.mr.R:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 |
17 | ## @knitr cluster-napply
18 | library(cluster)
19 | napply = function(ll, a.name) lapply(ll, function(l) l[[a.name]])
20 |
21 | ## @knitr cluster-mr
22 | cluster.mr =
23 | function(data, subcluster, merge)
24 | mapreduce(
25 | data,
26 | map =
27 | function(., data.chunk)
28 | keyval(1, list(subcluster(data.chunk))),
29 | combine = TRUE,
30 | reduce =
31 | function(., clusterings)
32 | keyval(1, list(merge(clusterings))))
33 |
34 | ## @knitr cluster-subclara
35 | subclara =
36 | function(data, n.centers) {
37 | clust =
38 | clara(
39 | data,
40 | n.centers,
41 | keep.data = FALSE)
42 | list(
43 | size = nrow(data),
44 | sample = data[clust$sample,],
45 | medoids = clust$medoids)}
46 |
47 | ## @knitr cluster-merge-clara
48 | merge.clara =
49 | function(clusterings, n.centers){
50 | sizes = unlist(napply(clusterings, 'size'))
51 | total.size = sum(sizes)
52 | size.range = range(sizes)
53 | size.ratio = max(size.range)/min(size.range)
54 | resample =
55 | function(x)
56 | x$sample[
57 | sample(
58 | 1:nrow(x$sample),
59 | round(nrow(x$sample) * size.ratio),
60 | replace = TRUE)]
61 | clust =
62 | subclara(
63 | do.call(
64 | rbind,
65 | lapply(
66 | clusterings,
67 | resample)),
68 | n.centers)
69 | clust$size = total.size
70 | clust}
71 |
72 | ## @knitr cluster-clara
73 | clara.mr =
74 | function(data, n.centers)
75 | values(
76 | from.dfs(
77 | cluster.mr(
78 | data,
79 | Curry(subclara, n.centers = n.centers),
80 | Curry(merge.clara, n.centers = n.centers))))[[1]]
--------------------------------------------------------------------------------
/pkg/examples/collocations.R:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | library(rmr2)
16 |
17 | ngram.format =
18 | make.input.format(
19 | format="csv",
20 | quote = NULL,
21 | sep = "\t",
22 | comment.char = "",
23 | col.names = c("ngram", "year", "count", "pages", "books"),
24 | stringsAsFactors = FALSE)
25 |
26 | ngram.parse =
27 | function(ngram.data) {
28 | ngram.split =
29 | suppressWarnings(
30 | do.call(
31 | rbind,
32 | strsplit(
33 | paste(ngram.data$ngram, " "),
34 | " "))
35 | [,1:5])
36 | filter = ngram.split[,ncol(ngram.split)] != ""
37 | cbind(
38 | ngram.data[,-1],
39 | ngram.split,
40 | stringsAsFactors = FALSE)
41 | [filter,]}
42 |
43 | map.fun =
44 | function(k, v) {
45 | data = ngram.parse(v)
46 | keyval(
47 | as.matrix(data[, c("year", "1", names(data)[ncol(data)])]),
48 | data$count)}
49 |
50 | reduce.fun =
51 | function(k,vv) {
52 | vv = split(vv, as.data.frame(k), drop = TRUE)
53 | keyval(names(vv), vsum(vv))}
54 | #keyval(names(vv), sapply(vv, sum))}
55 | #this alone changes the runtime from 49' to 1h 27'
56 | #on a 5 node cluster with 10 reducer slots
57 |
58 | system.time({
59 | zz =
60 | mapreduce(
61 | "/user/ngrams/",
62 | #"../RHadoop.data/ngrams/10000000.csv",
63 | input.format = ngram.format,
64 | map = map.fun,
65 | reduce = reduce.fun,
66 | vectorized.reduce = TRUE,
67 | in.memory.combine = FALSE,
68 | combine = FALSE)})
--------------------------------------------------------------------------------
/pkg/examples/counts.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | ## @knitr counts
17 | count =
18 | function(data, ...) {
19 | map.count =
20 | function(.,data) {
21 | counts = apply(data,2,function(x) aggregate(x,list(x),length))
22 | keyval(names(counts), counts)}
23 | reduce.count =
24 | function(colname, counts) {
25 | counts = do.call(rbind, counts)
26 | keyval(
27 | colname,
28 | list(aggregate(counts$x, list(as.character(counts$Group.1)), sum)))}
29 | from.dfs(
30 | mapreduce(
31 | data,
32 | map = map.count,
33 | reduce = reduce.count,
34 | combine = TRUE,
35 | ...))}
36 | ## @knitr end
--------------------------------------------------------------------------------
/pkg/examples/hbase.R:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ## @knitr hbase-blogposts
16 | from.dfs(
17 | mapreduce(
18 | input="blogposts",
19 | input.format =
20 | make.input.format(
21 | "hbase",
22 | family.columns =
23 | list(
24 | image= list("bodyimage"),
25 | post = list("author", "body")),
26 | key.deserialize = "raw",
27 | cell.deserialize = "raw",
28 | dense = TRUE,
29 | atomic = TRUE)))
30 |
31 | ## @knitr hbase-freebase.input.format
32 | freebase.input.format =
33 | make.input.format(
34 | "hbase",
35 | family.columns =
36 | list(
37 | name = "",
38 | freebase = "types"),
39 | key.deserialize = "raw",
40 | cell.deserialize = "raw",
41 | dense = FALSE,
42 | atomic = FALSE)
43 |
44 | ## @knitr hbase-freebase-mapreduce
45 | from.dfs(
46 | mapreduce(
47 | input = "freebase",
48 | input.format = freebase.input.format,
49 | map = function(k,v) keyval(k[1,], v[1,])))
50 | ## @knitr end
--------------------------------------------------------------------------------
/pkg/examples/large-kmeans-test.R:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | input.1000 = mapreduce (input = to.dfs(1:1000),
16 | map = function(k, v) keyval(rnorm(1), v),
17 | reduce = to.reduce(identity))
18 |
19 | input.10e6 = mapreduce (input = input.1000,
20 | map = function(k, v) lapply(1:1000, function(i) keyval(rnorm(1), v)),
21 | reduce = to.reduce(identity))
22 |
23 | kmeans.input.10e6 = mapreduce(input.1000,
24 | map = function(k, v) keyval(rnorm(1), cbind(sample(0:2, recsize, replace = TRUE) +
25 | rnorm(recsize, sd = .1),
26 | sample(0:3, recsize, replace = TRUE) +
27 | rnorm(recsize, sd = .1))))
28 |
29 | kmeans.input.10e9 = mapreduce(input.10e6,
30 | map = function(k, v) keyval(rnorm(1), cbind(sample(0:2, recsize, replace = TRUE) +
31 | rnorm(recsize, sd = .1),
32 | sample(0:3, recsize, replace = TRUE) +
33 | rnorm(recsize, sd = .1))))
34 |
--------------------------------------------------------------------------------
/pkg/examples/mclust.mr.R:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | library(mclust)
16 |
17 | fast.mclust =
18 | function(data)
19 | Mclust(
20 | data,
21 | initialization =
22 | list(
23 | subset =
24 | sample(
25 | 1:nrow(data),
26 | size = min(100, nrow(data)))))
27 |
28 |
29 | mclust.mr =
30 | function(data, merge.dataset.size = 10000)
31 | mapreduce(
32 | data,
33 | map =
34 | function(.,data)
35 | keyval(1, list(fast.mclust(data)[c('n', 'modelName', 'parameters')])),
36 | reduce =
37 | function(., models) {
38 | shrink =
39 | merge.dataset.size/
40 | sum(sapply(models, function(m) m$n))
41 | model =
42 | fast.mclust(
43 | do.call(
44 | rbind,
45 | lapply(
46 | models,
47 | function(m)
48 | sim(
49 | modelName = m$modelName,
50 | parameters = m$parameters,
51 | n = round(m$n/shrink))[,-1])))
52 | keyval(
53 | 1,
54 | list(
55 | list(
56 | n = round(model$n*shrink),
57 | modelName = model$modelName,
58 | parameters = model$parameters)))})
--------------------------------------------------------------------------------
/pkg/examples/ngram.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Start cluster
16 | # $WHIRR_HOME/bin/whirr launch-cluster --config ~/Projects/Revolution/RHadoop/rmr/pkg/tools/whirr/hadoop-ec2-lzo.properties 2>&1
17 | # $WHIRR_HOME/bin/whirr run-script --script ~/Projects/Revolution/RHadoop/rmr/pkg/tools/whirr/rmr-1.3.sh --config ~/Projects/Revolution/RHadoop/rmr/pkg/tools/whirr/hadoop-ec2-lzo.properties
18 | # $WHIRR_HOME/bin/whirr run-script --script ~/Projects/Revolution/RHadoop/rmr/pkg/tools/whirr/lzo.sh --config ~/Projects/Revolution/RHadoop/rmr/pkg/tools/whirr/hadoop-ec2-lzo.properties
19 |
20 |
21 | ## @knitr fake-data
22 | fake.size = 2000000
23 | writeLines(
24 | apply(
25 | cbind(
26 | sample(sapply(1:20000, function(x) substr(digest(x),start=1,stop=3)), fake.size, replace = TRUE),
27 | sample(1800:1819, fake.size, replace = TRUE),
28 | sample (1:200, fake.size, replace = TRUE),
29 | sample (1:200, fake.size, replace = TRUE),
30 | sample (1:200, fake.size, replace = TRUE)),
31 | 1,
32 | function(x)paste(x, collapse = "\t")),
33 | file("/tmp/fake-ngram-data", "w"))
34 |
35 | source = "/tmp/fake-ngram-data"
36 | # rmr.options(backend = "local")
37 |
38 | #Timing for 12 + 1 node EC2 cluster m1.large instances
39 | ## @knitr distcp
40 | # hadoop distcp s3n://$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY@datasets.elasticmapreduce/ngrams/books/20090715/eng-all/1gram/ hdfs:///user/antonio/
41 | ## @knitr scatter
42 | ## source = scatter("/user/antonio/1gram/data")
43 | # 33 mins
44 | ## @knitr ngram.format
45 | ngram.format = function(lines){
46 | data =
47 | as.data.frame(
48 | do.call(rbind, strsplit(lines, "\t"))[,1:3],
49 | stringsAsFactors = FALSE)
50 | names(data) = c("ngram", "year", "count")
51 | data$year = as.integer(data$year)
52 | data$count = as.integer(data$count)
53 | data}
54 |
55 | ## @knitr filter.map
56 | filter.map = function(., lines) {
57 | ngram.data = ngram.format(lines)
58 | ngram.data[
59 | regexpr(
60 | "^[A-Za-z]+$",
61 | ngram.data$ngram) > -1 &
62 | ngram.data$year > 1800,]}
63 | ## @knitr end
64 |
65 | # use
66 | # input.format = "text"
67 | # on fake data
68 |
69 | ## @knitr filtered.data
70 | source = "/user/antonio/1gram/data"
71 | library(rmr2)
72 | filtered.data =
73 | mapreduce(input = source,
74 | map = filter.map)
75 | ## @knitr end
76 | #20 mins,
77 | ## @knitr sample-data
78 | from.dfs(rmr.sample(filtered.data, method="any", n = 50))
79 | ## @knitr end
80 | #5 mins
81 |
82 | ## @knitr totals.map
83 | totals.map =
84 | function(., ngram.data) {
85 | total = tapply(as.numeric(ngram.data$count), ngram.data$year, sum, na.rm = TRUE)
86 | keyval(names(total), as.vector(total))}
87 |
88 | ## @knitr totals.reduce
89 | totals.reduce =
90 | function(year, count)
91 | keyval(year, sum(count, na.rm = TRUE))
92 |
93 | ## @knitr year.totals
94 | year.totals.kv =
95 | from.dfs(
96 | mapreduce(
97 | input = filtered.data,
98 | map = totals.map,
99 | reduce = totals.reduce,
100 | combine = TRUE))
101 | ## @knitr end
102 | #9 mins
103 |
104 | ## @knitr year.totals-finish
105 | year.totals = c()
106 | year.totals[keys(year.totals.kv)] = values(year.totals.kv)
107 | ## @knitr outlier.map
108 | library(bitops)
109 | outlier.map =
110 | function(., ngram.data) {
111 | k = ngram.data$year + cksum(ngram.data$ngram)%%100/100
112 | c.keyval(
113 | keyval(k, ngram.data),
114 | keyval(k + 1, ngram.data))}
115 |
116 | ## @knitr outlier.reduce
117 | library(robustbase)
118 | library(reshape2)
119 | outlier.reduce =
120 | function(., ngram.data) {
121 | years = range(ngram.data$year)
122 | if(years[1] == years[2])
123 | NULL
124 | else {
125 | ngram.data = dcast(ngram.data, ngram ~ year, fill = 0)
126 | tryCatch({
127 | filter =
128 | !adjOutlyingness(
129 | log(
130 | t(
131 | t(ngram.data[,2:3] + 1)/
132 | as.vector(
133 | year.totals[as.character(years)] + 1))),
134 | alpha.cutoff = .95)$nonOut
135 | as.character(ngram.data[filter,'ngram'])},
136 | error = function(e) NULL)}}
137 | ## @knitr end
138 |
139 | # watch out: the next step doesn't seem to work beyond 10^5 ngrams
140 | # the problem is inefficient assignment, still investigating
141 | ## @knitr outlier.ngram
142 | outlier.ngram =
143 | unique(
144 | values(
145 | from.dfs(
146 | mapreduce(
147 | input = filtered.data,
148 | output = "/user/antonio/1gram/outlier-ngram",
149 | map = outlier.map,
150 | reduce = outlier.reduce))))
151 |
152 | ## @knitr end
153 | # 8 hours
154 |
155 | ## @knitr plot.data
156 | plot.data =
157 | values(
158 | from.dfs(
159 | mapreduce(
160 | input = filtered.data,
161 | output = "/user/antonio/1gram/plot-data-ngram",
162 | map =
163 | function(., ngram.data)
164 | ngram.data[
165 | is.element(
166 | as.character(ngram.data$ngram),
167 | outlier.ngram),])))
168 | ## @knitr end
169 | # 5 mins
170 |
171 | ## @knitr plot.data.frame
172 | plot.data =
173 | melt(
174 | dcast(
175 | plot.data, ngram ~ year, fill = 0),
176 | variable.name="year",
177 | value.name = "count")
178 | plot.data$freq =
179 | (plot.data$count + 0.1)/
180 | year.totals[as.character(plot.data$year)]
181 | plot.data =
182 | plot.data[order(plot.data$ngram, plot.data$year),]
183 | plot.data =
184 | cbind(
185 | plot.data[-nrow(plot.data),],
186 | plot.data[-1,])
187 | plot.data =
188 | plot.data[
189 | plot.data[,1] == plot.data[,5],
190 | c(1,2,4,8)]
191 | names(plot.data) =
192 | c("id","time","freq", "freq.prev")
193 | plot.data$average =
194 | sqrt(plot.data$freq*plot.data$freq.prev)
195 | plot.data$ratio =
196 | plot.data$freq/plot.data$freq.prev
197 | plot.data$time = as.integer(as.character(plot.data$time))
198 | ## @knitr end
199 |
200 | ## save and reload, this is not necessary unless you take a break
201 | ##save(plot.data, file = "../RHadoop.data/ngram.plot.data")
202 | ##load("../RHadoop.data/ngram.plot.data")
203 | ## throw away some data points -- graphics can only use so many
204 |
205 | ## @knitr trim
206 | plot.data = plot.data[log(plot.data$average) > -10, ]
207 | summary(plot.data)
208 | ## @knitr end
209 |
210 | ## @knitr plot
211 | suppressPackageStartupMessages(library(googleVis))
212 | motion.chart =
213 | gvisMotionChart(
214 | plot.data[,c("id","time","average","ratio")],
215 | options = list(height = 768, width = 1024))
216 | plot(motion.chart)
217 | ## @knitr end
218 | print(motion.chart, "chart")
--------------------------------------------------------------------------------
/pkg/man/bigdataobject.Rd:
--------------------------------------------------------------------------------
1 | \name{big.data.object}
2 | \alias{big.data.object}
3 |
4 | \title{
5 | The big data object.}
6 |
7 | \description{
8 | A stub representing data on disk that can be manipulated by other functions in rmr2. "Stub" means that the data is not actually "there", or, more concretely, that it is not held in memory in the current process. This is a technique used in several programming languages when remote resources need to be made available. Here the rationale is that we need to process large data sets whose size is not compatible with holding them in memory all at once; nonetheless it is convenient to be able to refer to the complete data set in the language, even if the set of operations we can perform on it is limited. Big data objects are returned by \code{\link{to.dfs}}, \code{\link{mapreduce}}, \code{\link{scatter}}, \code{\link{gather}}, \code{\link{equijoin}} and \code{\link{rmr.sample}}, and accepted as input by all of the above except \code{\link{to.dfs}}, and also by \code{\link{from.dfs}}. Big data objects are NOT persistent, meaning that they are not meant to be saved beyond the limits of a session. They use temporary space, which is reclaimed as soon as possible once the data cannot be referred to any more, or at the end of a session. For data that needs to be accessible outside the current R session, you need to use paths to the file or directory where the data is or should be written. Valid paths can be used interchangeably wherever big data objects are accepted}
9 |
10 |
11 | \examples{
12 | some.big.data = to.dfs(1:10)
13 | path = "/tmp/some/big/data"
14 | if(dfs.exists(path))
15 | dfs.rmr(path)
16 | to.dfs(1:10, path)}
17 |
--------------------------------------------------------------------------------
/pkg/man/dfs.empty.Rd:
--------------------------------------------------------------------------------
1 | \name{dfs.empty}
2 | \alias{dfs.empty}
3 | \alias{dfs.exists}
4 | \alias{dfs.size}
5 | \alias{dfs.mv}
6 | \alias{dfs.rmr}
7 | \alias{dfs.ls}
8 | \title{Backend-independent file manipulation}
9 |
10 | \description{Check whether an item exists or is empty, return its size, list its contents, move it, or remove it (recursively). Here an item is a valid path or \code{\link{big.data.object}}}
11 |
12 | \usage{
13 | dfs.empty(fname)
14 | dfs.exists(fname)
15 | dfs.size(fname)
16 | dfs.mv(from, to)
17 | dfs.rmr(fname)
18 | dfs.ls(fname)
19 | }
20 |
21 | \arguments{
22 | \item{fname}{A valid path or \code{\link{big.data.object}}}
23 | \item{from, to}{A valid path}
24 | }
25 |
26 | \value{For \code{dfs.size} a number of bytes; for \code{dfs.empty} and \code{dfs.exists}, a logical; for \code{dfs.ls} a data.frame}
27 |
28 | \details{
29 | The size of a directory, for the sake of these commands, is the size of the files contained therein, with the exception of hidden files starting with "." and "_". This is not well documented in Hadoop, but there is a private call that implements this pattern. }
30 |
31 | \examples{
32 | dfs.empty(mapreduce(to.dfs(1:10)))
33 | dfs.size(mapreduce(to.dfs(1:10)))
34 | }
--------------------------------------------------------------------------------
/pkg/man/equijoin.Rd:
--------------------------------------------------------------------------------
1 | \name{equijoin}
2 |
3 | \alias{equijoin}
4 |
5 | \title{
6 | Equijoins using map reduce
7 | }
8 |
9 | \description{
10 | A generalized form of equijoin, hybrid between the SQL brethren and mapreduce
11 | }
12 |
13 | \usage{
14 | equijoin(
15 | left.input = NULL,
16 | right.input = NULL,
17 | input = NULL,
18 | output = NULL,
19 | input.format = "native",
20 | output.format = "native",
21 | outer = c("", "left", "right", "full"),
22 | map.left = to.map(identity),
23 | map.right = to.map(identity),
24 | reduce = reduce.default)}
25 | \arguments{\item{left.input}{The left side input to the join.}
26 | \item{right.input}{The right side input to the join.}
27 | \item{input}{The only input in case of a self join. Mutually exclusive with the previous two.}
28 | \item{output}{Where to write the output.}
29 | \item{input.format}{Input format specification, see \code{\link{make.input.format}}}
30 | \item{output.format}{Output format specification, see \code{\link{make.output.format}}}
31 | \item{outer}{Whether to perform an outer join, one of the usual three types, left, right or full.}
32 | \item{map.left}{Function to apply to each record from the left input, follows same conventions as any map function. The returned keys
33 | will become join keys.}
34 | \item{map.right}{Function to apply to each record from the right input, follows same conventions as any map function. The returned keys
35 | will become join keys.}
36 | \item{reduce}{Function to be applied, key by key, to the values associated with that key. Those values are in the arguments \code{vl} (left side) and \code{vr} (right side) and their type is determined by the type returned by the map functions, separately for the left side and the right side. The allowable return values are like those of any reduce function, see \code{\link{mapreduce}}. The default performs a \code{merge} with \code{by = NULL}, which yields a Cartesian product, unless lists are involved, in which case the arguments are simply returned in a list.}}
37 |
38 | \value{If output is specified, returns output itself. Otherwise, a \code{\link{big.data.object}}}
39 |
40 |
41 | \section{Warning}{Unlike \code{mapreduce}, this function does not accept multiple inputs}
42 |
43 |
44 | \examples{
48 | from.dfs(equijoin(left.input = to.dfs(keyval(1:10, 1:10^2)), right.input = to.dfs(keyval(1:10, 1:10^3))))
49 | }
50 |
51 |
52 |
--------------------------------------------------------------------------------
/pkg/man/fromdfstodfs.Rd:
--------------------------------------------------------------------------------
1 | \name{from.dfs}
2 | \alias{from.dfs}
3 | \alias{to.dfs}
4 |
5 | \title{Read or write R objects from or to the file system}
6 | \description{Functions that read or write R objects from or to the file system}
7 |
8 | \usage{
9 | to.dfs(kv, output = dfs.tempfile(), format = "native")
10 | from.dfs(input, format = "native")
11 | }
12 |
13 | \arguments{
14 | \item{kv}{A key-value pair; also, a vector, list, matrix or a data frame (in this case the keys will be set to NULL)}
15 | \item{input}{A valid path or a \code{\link{big.data.object}}}
16 | \item{output}{A valid path}
17 | \item{format}{For \code{from.dfs} either a string naming a format, the same as those allowed by \code{make.input.format}, or the value returned by \code{\link{make.input.format}}. The same is true for \code{to.dfs}, but refer to \code{\link{make.output.format}} instead.}}
18 |
19 | \details{ These functions allow one to move data from RAM to the file system and back. Keep in mind that the capacity of these two storage media is
20 | different by two or more orders of magnitude, so the conversion will make sense only in specific situations. These
21 | functions do not perform any size control, so the responsibility is on the user. For the local backend, file system means the local file system.
22 | For the Hadoop backend it means HDFS.}
23 |
24 | \value{\code{from.dfs} returns the object whose representation is contained in \code{input}. \code{to.dfs} returns the value of \code{output} or, when this is missing, a \code{\link{big.data.object}} }
25 |
26 | \examples{
27 | from.dfs(to.dfs(1:10))
28 | from.dfs(to.dfs(keyval(1, 1:10)))
29 | }
30 |
--------------------------------------------------------------------------------
/pkg/man/hadoop-setting.Rd:
--------------------------------------------------------------------------------
1 | \name{hadoop.settings}
2 | \alias{hadoop.settings}
3 |
4 | \title{Important Hadoop settings in relation to rmr2}
5 | \description{There are a few Hadoop settings that one should be aware of and know how to modify to allow the successful execution of mapreduce programs}
6 | \details{Since the transition to YARN and MR2, each mapreduce job needs to secure a container in order to execute. A container is a resource allocation unit, and the resource we are concerned with here is memory. At default settings, at least in a non-scientific sampling of deployments, the memory available to a container is used almost entirely by the map and reduce java processes, which is not compatible with the java process also executing an instance of the R interpreter, as rmr2 requires. Therefore, by default, rmr2 modifies per-job settings so that the java process uses 400MB of memory and leaves the rest for use by R. This assumes that the default container size is larger than 400MB and that R can work successfully in the remaining space. Under certain conditions, it is also possible that 400MB won't be enough for the java process. To solve these problems, the user has access to a number of properties that can be set using configuration files or on a per-job basis directly in rmr2 (see \code{\link{rmr.options}}, argument \code{backend.parameters}). Four important properties are \code{mapreduce.map.java.opts}, \code{mapreduce.reduce.java.opts}, \code{mapreduce.map.memory.mb} and \code{mapreduce.reduce.memory.mb}.
7 | The first two are set by \code{rmr2} to \code{-Xmx400M}, which sets the memory allocated to the map or reduce java task. The other two properties control the size of the container for the map and reduce phases, respectively, and rmr2 leaves them at default values, unless the user decides otherwise. There are many other properties that control the execution environment of mapreduce jobs, but they are out of scope for this help entry (you are referred to the documentation accompanying your Hadoop distribution). These four, in the experience of the RHadoop team, are the ones one needs to act upon most often.}
8 |
9 |
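10 | % A minimal usage sketch (illustrative values, not taken from the original page):
11 | % raising the map container size and java heap for a single job via the
12 | % backend.parameters argument of mapreduce.
13 | \examples{
14 | \dontrun{
15 | mapreduce(
16 |   input = to.dfs(1:10),
17 |   backend.parameters = list(
18 |     hadoop = list(
19 |       D = "mapreduce.map.memory.mb=2048",
20 |       D = "mapreduce.map.java.opts=-Xmx1024M")))
21 | }}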
--------------------------------------------------------------------------------
/pkg/man/keyval.Rd:
--------------------------------------------------------------------------------
1 | \name{keyval}
2 | \alias{keyval}
3 | \alias{keys}
4 | \alias{values}
5 | \alias{c.keyval}
6 |
7 | \title{Create, project or concatenate key-value pairs}
8 | \description{Create a key-value object (a collection of key-value pairs) from two R objects, extract keys or values from a key-value object, or concatenate multiple key-value objects}
9 |
10 | \usage{
11 | keys(kv)
12 | values(kv)
13 | keyval(key, val)
14 | c.keyval(...)
15 | }
16 |
17 | \arguments{
18 | \item{kv}{key-value pairs}
19 | \item{key}{the desired key or keys}
20 | \item{val}{the desired value or values}
21 | \item{...}{key-value pairs to concatenate, or a single list thereof}}
22 |
23 | \details{The \code{keyval} function is used to create return values for the map and reduce functions, themselves parameters to
24 | \code{\link{mapreduce}}. Key-value objects are also appropriate arguments for the \code{\link{to.dfs}} function and are returned by
25 | \code{\link{from.dfs}}. \code{keys} and \code{values} extract keys and values respectively from a key-value object. \code{c.keyval} concatenates two or more key-value objects by concatenating the keys and values separately after recycling the arguments. When invoked with a single argument, it considers it a list of key-value objects to concatenate. A key-value object should always be considered vectorized, meaning that it defines a collection of key-value pairs. For the purpose of forming key-value pairs, the length of an object is considered its number of rows when defined, that is for matrices and data frames, or its R \code{\link{length}} otherwise. Consistently with this definition, the n-th element of a key or value argument is its n-th row when rows are defined, or a subrange including only the n-th element otherwise. Data types are preserved, meaning that, for instance, if the \code{key} is a matrix its n-th element is a matrix with only one row, the n-th row of the larger matrix (the behavior of the \code{[]} operator with \code{drop = FALSE}). The same is true for data frames, lists and atomic vectors. When \code{key} and \code{val} have different lengths according to this definition, recycling is applied. The pairing between keys and values is positional, meaning that the n-th element of the key argument is associated with the n-th element of the val argument in a single key-value pair. Concatenation happens with \code{rbind} or variants thereof whenever keys or values have rows, \code{c} otherwise. Mixing and matching keys of different type, e.g. a matrix with a vector, is not supported, and the same is true for values, but key and value in the same keyval object do not need to be of the same type. When porting programs from rmr < 2 a list of non-vectorized key-value pairs can be converted with \code{c.keyval(keyval(list(k1), list(v1)), keyval(list(k2), list(v2)), ...)}. In many cases wrapping the keys and values in a \code{list} call is not necessary, but it is in the general case.}
26 |
27 | \examples{
28 | #single key-val
29 | keyval(1,2)
30 | keys(keyval(1,2))
31 | values(keyval(1,2))
32 | #10 kv pairs of the form (i,i)
33 | keyval(1:10, 1:10)
34 | #2 kv pairs (1, 2i-1) and (2, 2i) for i in 1:5
35 | keyval(1:2, 1:10)
36 | # mtcars is a data frame, each row is a value with key set to the value of column cyl
37 | keyval(mtcars$cyl, mtcars)
38 | # concatenate two keyval objects
39 | c.keyval(keyval(1,1:2), keyval(1,1:3))
40 | }
41 |
--------------------------------------------------------------------------------
/pkg/man/mapreduce.Rd:
--------------------------------------------------------------------------------
1 | \name{mapreduce}
2 | \alias{mapreduce}
3 |
4 | \title{MapReduce using Hadoop Streaming}
5 | \description{Defines and executes a map reduce job.
6 | }
7 |
8 | \usage{ mapreduce(
9 | input,
10 | output = NULL,
11 | map = to.map(identity),
12 | reduce = NULL,
13 | vectorized.reduce = FALSE,
14 | combine = NULL,
15 | in.memory.combine = FALSE,
16 | input.format = "native",
17 | output.format = "native",
18 | backend.parameters = list(),
19 | verbose = TRUE) }
20 |
21 | \arguments{
22 | \item{input}{Paths to the input folder(s) (on HDFS) or vector thereof
23 | or the return value of another \code{mapreduce} or a \code{\link{to.dfs}} call}
24 | \item{output}{A path to the destination folder (on HDFS); if missing, a \code{\link{big.data.object}} is returned, see "Value" below}
25 | \item{map}{An optional R function of two arguments, returning either NULL or the return value of \code{\link{keyval}}, that specifies the map operation to execute as part of a mapreduce job. The two arguments represent multiple key-value pairs according to the definition of the mapreduce model. They can be any of the following: list, vector, matrix, data frame or NULL (the last one only allowed for keys). Keys are matched to the corresponding values by position, according to the second dimension if it is defined (that is rows in matrices and data frames, position otherwise), analogous to the behavior of \code{cbind}, see \code{\link{keyval}} for details.}
26 | \item{reduce}{An optional R function of two arguments, a key and a data structure representing all the values associated with that key (the same type as returned by the map call, merged with \code{rbind} for matrices and data frames and \code{c} otherwise), returning either NULL or the return value of \code{\link{keyval}}, that specifies the reduce operation to execute as part of a mapreduce job. The default is no reduce phase, that is the output of the map phase is the output of the mapreduce job, see the \code{vectorized.reduce} argument for an alternate interface}
27 | \item{vectorized.reduce}{The argument to the reduce should be construed as a collection of keys and values associated with them by position (by row when 2-dimensional). Identical keys are consecutive, and all the records associated with a given key are passed to the same reduce call (complete group guarantee). This form of reduce has been introduced mostly for efficiency reasons when processing many small reduce groups, that is when records are small and few of them share the same key. This option affects the combiner too.}
28 | \item{combine}{A function with the same signature and possible return values as the reduce function, or TRUE, which means use the reduce function as combiner. NULL means no combiner is used.}
29 | \item{in.memory.combine}{Apply the combiner just after calling the map function, before returning the results to hadoop. This is useful to reduce the amount of I/O and (de)serialization work whenever combining small sets of records already has an effect (you may want to tune the input format to read more data for each map call together with this approach; see arguments \code{read.size} or \code{nrow} for a variety of formats)}
30 | \item{input.format}{Input format specification, see \code{\link{make.input.format}}}
31 | \item{output.format}{Output format specification, see \code{\link{make.output.format}}}
32 | \item{backend.parameters}{This option is for advanced users only and may be removed in the future. Specify additional, backend-specific
33 | options, as in \code{backend.parameters = list(hadoop = list(D = "mapred.reduce.tasks=1"), local = list())}. It is recommended not to use this argument to change the semantics of mapreduce (output should be independent of this argument). Each backend can only see the nested list named after the backend itself. The interpretation is the following: for the hadoop backend, generate an additional hadoop streaming command line argument for each element of the list, "-name value". If the value is TRUE generate "-name" only, if it is FALSE skip. One possible use is to specify the number of mappers and reducers on a per-job basis. It is not guaranteed that the generated streaming command will be a legal command. In particular, remember to put any generic options before any specific ones, as per hadoop streaming manual. For the local backend, the list is currently ignored.}
34 | \item{verbose}{Run hadoop in verbose mode. When \code{FALSE}, the job id and, on YARN, the application id are returned as attributes. No effect on the local backend}}
35 |
36 | \value{The value of \code{output}, or, when missing, a \code{\link{big.data.object}}}
37 |
38 | \details{Defines and executes a mapreduce job. Jobs can be chained together by simply providing the return value of one as input to the
39 | other. The map and reduce functions will run in an environment that is a close approximation of the environment of this
40 | call, even if the actual execution happens in a different interpreter on a different machine. Changes to the outer
41 | environments performed inside the map and reduce functions with the \code{<<-} operator will only affect a per-process copy of the
42 | environment, not the original one, in a departure from established but seldom used R semantics. This is unlikely to change in the future
43 | because of the challenges inherent in adopting reference semantics in a parallel environment. The map function should not read from standard input or write to standard output. Logging and debugging messages should be written to standard error, and will be redirected to the appropriate logs or to the console by the backend. If necessary, library functions that cannot be prevented from writing to standard output can be surrounded by a pair of \code{sink} calls as in \code{sink(stderr()); library.function(); sink(NULL)}. See also the Tutorial
44 | \url{https://github.com/RevolutionAnalytics/RHadoop/wiki/Tutorial}}
45 |
46 | \seealso{\code{\link{to.map}} and \code{\link{to.reduce}} can be used to convert other functions into suitable arguments for the map and
47 | reduce arguments; see the tests directory in the package for more examples}
48 |
49 |
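50 | % A minimal usage sketch (not taken from the package tests): two jobs chained
51 | % by passing the return value of one mapreduce call as the input of another,
52 | % with the reduce function also used as a combiner.
53 | \examples{
54 | \dontrun{
55 | from.dfs(
56 |   mapreduce(
57 |     mapreduce(
58 |       input = to.dfs(1:10),
59 |       map = function(k, v) keyval(v \%\% 2, v)),
60 |     reduce = function(k, vv) keyval(k, sum(vv)),
61 |     combine = TRUE))
62 | }}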
--------------------------------------------------------------------------------
/pkg/man/rmr-package.Rd:
--------------------------------------------------------------------------------
1 | \name{rmr}
2 | \alias{rmr}
3 | \docType{package}
4 | \title{A package to perform Map Reduce computations in R}
5 | \description{Running on top of Hadoop, this package makes it possible to define and run mapreduce jobs, including specifying the mapper and the reducer as R functions, and to move data between R and Hadoop in a mostly transparent way. The aim is to make writing mapreduce jobs very similar to, and just as easy as, writing an lapply and a tapply. Additional features provide easy job composition, transparent intermediate result management, support for different data formats and more.
6 | }
7 |
--------------------------------------------------------------------------------
/pkg/man/rmr.options.Rd:
--------------------------------------------------------------------------------
1 | \name{rmr.options}
2 | \alias{rmr.options}
3 | \title{Function to set and get package options}
4 | \description{Set and get package options}
5 | \usage{
6 | rmr.options(
7 | backend = c("hadoop", "local"),
8 | profile.nodes = c("off", "calls", "memory", "both"),
9 | hdfs.tempdir = "/tmp",
10 | exclude.objects = NULL,
11 | backend.parameters = list())
12 | }
13 | \arguments{
14 | \item{...}{Names of options to get values of, as length one character vectors}
15 | \item{backend}{One of "hadoop" or "local", the latter being implemented entirely in the current R interpreter, sequentially, for learning and debugging.}
16 | \item{profile.nodes}{Collect profiling and memory information when running additional R interpreters (besides the current one) on the cluster. No effect on the local backend, use Rprof instead. For backward compatibility, \code{"calls"} is equivalent to \code{TRUE} and \code{"off"} to \code{FALSE}}
17 | \item{hdfs.tempdir}{The directory to use for temporary files, including \code{\link{mapreduce}} intermediate results files, on the distributed file system (not used when running on the local backend).}
18 | \item{exclude.objects}{Objects in the Global environment that are not needed by the map or reduce functions, as character vector}
19 | \item{backend.parameters}{Parameters to pass directly to the backend. See equally named argument for the function \code{\link{mapreduce}}. Use this setting for backend parameters that need to be different from default but can be the same from job to job}
20 | }
21 | \details{
22 | While the main goal for rmr2 is to provide access to hadoop mapreduce, the package has a notion of a backend that can be swapped while preserving most features. One backend is of course hadoop itself; the other is called "local" and is implemented within the current interpreter, using the local file system. rmr2 programs run on the local backend are ordinary (non-distributed, single-threaded) programs, which is particularly useful for learning and debugging (\code{debug}, \code{recover} and \code{trace} work). Profiling data is collected in the following files: \code{file.path(rmr.options("dfs.tempdir"), "Rprof", , )} on each node (the details of how job id and attempt id are obtained depend upon the Hadoop distribution). The path is printed to stderr for your convenience and you will find it in the logs, specifically stderr, for each attempt. Then you need to ssh to the machine where that attempt ran to examine or retrieve the file. \code{keyval.length} is used as a hint, particularly as a lower bound hint, for how many records are actually processed by each map call.
23 | }
24 | \value{A named list with the options and their values, or just a value if only one requested. NULL when only setting options.}
25 |
26 | \examples{
27 | old.backend = rmr.options("backend")
28 | rmr.options(backend = "hadoop")
29 | rmr.options(backend = old.backend)
30 | \dontrun{
31 | rmr.options(
32 | hdfs.tempdir =
33 | file.path(
34 | "/user",
35 | system("whoami", TRUE),
36 | "tmp-rmr2",
37 | basename(tempdir())))
38 | }}
39 |
--------------------------------------------------------------------------------
/pkg/man/rmr.sample.Rd:
--------------------------------------------------------------------------------
1 | \name{rmr.sample}
2 | \alias{rmr.sample}
3 |
4 | \title{Sample large data sets}
5 |
6 | \description{Sample large data sets}
7 |
8 | \usage{rmr.sample(input, output = NULL, method = c("any", "Bernoulli"), ...)}
9 |
10 | \arguments{
11 | \item{input}{The data set to be sampled as a file path or \code{\link{mapreduce}} return value}
12 | \item{output}{Where to store the result. See \code{\link{mapreduce}}, output argument, for details}
13 | \item{method}{One of "any" or "Bernoulli". "any" will return some records out, optimized for speed, but with no statistical guarantees. "Bernoulli" implements independent sampling according to the Bernoulli distribution}
14 | \item{\dots}{Additional arguments to fully specify the sample; they depend on the method selected. If it is "any" then the size of the desired sample should be provided as the argument \code{n}. If it is "Bernoulli" the argument \code{p} specifies the probability of picking each record}}
15 |
16 | \value{
17 | The sampled data. See \code{\link{mapreduce}} for details.}
18 |
19 |
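20 | % A minimal usage sketch (illustrative sizes, not taken from the original page)
21 | % showing the method-specific arguments n and p described above.
22 | \examples{
23 | \dontrun{
24 | rmr.sample(to.dfs(1:1000), method = "any", n = 10)
25 | rmr.sample(to.dfs(1:1000), method = "Bernoulli", p = 0.01)
26 | }}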
--------------------------------------------------------------------------------
/pkg/man/rmr.str.Rd:
--------------------------------------------------------------------------------
1 | \name{rmr.str}
2 | \alias{rmr.str}
3 |
4 | \title{
5 | Print a variable's content}
6 | \description{
7 | One way to debug mappers and reducers on the "hadoop" backend is to print diagnostic messages. This function helps you do so safely by printing a summary of the stack, the name of the variable and its value onto standard error. This means that if Hadoop is running in standalone mode the message will appear in the console, otherwise in the logs.
8 | }
9 | \usage{
10 | rmr.str(x, ...)
11 | }
12 | \arguments{
13 | \item{x}{The variable to print}
14 | \item{...}{Additional arguments to \code{str} (called by \code{rmr.str})}
15 | }
16 |
17 | \value{x}
18 |
19 | \examples{
20 | mapreduce(to.dfs(1), map = function(k, v) rmr.str(v))
21 | }
22 |
--------------------------------------------------------------------------------
/pkg/man/scatter.Rd:
--------------------------------------------------------------------------------
1 | \name{scatter}
2 | \alias{scatter}
3 | \alias{gather}
4 | %- Also NEED an '\alias' for EACH other topic documented here.
5 | \title{
6 | Functions to split a file over several parts or to merge multiple parts into one}
7 | \description{
8 | \code{scatter} takes a file as input and pushes it through a mapreduce job that writes it out over a number of parts (system dependent, specifically depending on the number of reducers). This helps with parallelization of the next map phase. \code{gather} does the opposite.}
9 | \usage{
10 | scatter(input, output = NULL, ...)
11 | gather(input, output = NULL, ...)
12 | }
13 | \arguments{
14 | \item{input}{
15 | The input file}
16 | \item{output}{
17 | Output, defaults to the same as \code{\link{mapreduce}} output}
18 | \item{\dots}{Other options passed directly to mapreduce}
19 | }
20 |
21 | \value{
22 | Same as for \code{\link{mapreduce}}.
23 | }
24 |
25 | \section{Known Limitations}{Scatter discards keys. This is a limitation that should be addressed in the future}
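26 | % A minimal usage sketch (hypothetical, for illustration): split a data set
27 | % over several parts, then merge the parts back into one.
28 | \examples{
29 | \dontrun{
30 | gather(scatter(to.dfs(1:100)))
31 | }}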
--------------------------------------------------------------------------------
/pkg/man/status.Rd:
--------------------------------------------------------------------------------
1 | \name{status}
2 | \alias{status}
3 | \alias{increment.counter}
4 | \title{
5 | Set the status and define and increment counters for a Hadoop job
6 | }
7 | \description{
8 | These are Hadoop features used to monitor and debug jobs. They should be used with a grain of salt as far as scalability is concerned.
9 | }
10 | \usage{
11 | status(value)
12 | increment.counter(group, counter, increment = 1)
13 | }
14 | \arguments{
15 | \item{value}{The new value for the status of the job}
16 | \item{group}{The group for the counter}
17 | \item{counter}{The name for the counter}
18 | \item{increment}{By how much to increment the counter}
19 | }
20 | \details{
21 | \code{status} sets the status for the current job. \code{increment.counter} increments the counter named \code{counter} in group \code{group} by \code{increment}. If the counter doesn't exist yet, it is initialized to 0. Both calls work only within the map or reduce functions and, under the local backend, just write some messages to stderr. Unfortunately there is no API to query the value of either status or counters at this time, but you can examine them via the jobtracker web interface.
22 | }
23 | \value{
24 | NULL for both.
25 | }
26 |
27 | \examples{
28 | mapreduce(to.dfs(1:1000), map = function(k,v){status("mapping"); increment.counter("Calls", "Map", 1)})
29 | }
30 |
--------------------------------------------------------------------------------
/pkg/man/tomaptoreduce.Rd:
--------------------------------------------------------------------------------
1 | \name{to.map}
2 | \alias{to.map}
3 | \alias{to.reduce}
4 | \title{Create map and reduce functions from other functions}
5 |
6 | \description{These utility functions are meant to avoid the little boilerplate code necessary to convert ordinary functions into map and
7 | reduce functions.}
8 |
9 | \usage{
10 | to.map(fun1, fun2 = identity)
11 | to.reduce(fun1, fun2 = identity)
12 | }
13 |
14 | \arguments{
15 | \item{fun1}{A function to apply to the key, or to the key-value pair if the second argument is missing}
16 | \item{fun2}{A function to apply to the value}
17 | }
18 |
19 | \details{Sometimes there are functions that we could use almost directly as map or reduce functions but for a bit of boilerplate code, and
20 | we hate boilerplate code. That's where the functions documented herein can help. They take one or two functions of a single argument and
21 | return a valid map or reduce function. In the case of \code{to.map} when two functions are specified they are applied independently to the
22 | key and the value and the return values are returned as a key-value pair; when only one is, it is applied to the key-value pair. For
23 | \code{to.reduce} the behavior is the same. }
24 |
25 | \examples{
26 | ##The identity map:
27 | to.map(identity)
28 | ## equivalent to function(k, v) keyval(k, v)
29 | ##Replace key with mod 10 of the key and pass the value along:
30 | to.map(function(x) x \%\% 10, identity )
31 | ##Sum up all the values for the same key:
32 | to.reduce(identity, function(vv) sum(vv))
33 | }
34 |
--------------------------------------------------------------------------------
/pkg/man/vsum.Rd:
--------------------------------------------------------------------------------
1 | \name{vsum}
2 | \alias{vsum}
3 | \title{
4 | Fast small sums
5 | }
6 | \description{
7 | Returns the sum of a list of numeric vectors}
8 | \usage{
9 | vsum(x)
10 | }
11 | %- maybe also 'usage' for other objects documented here.
12 | \arguments{
13 | \item{x}{A list of numeric vectors
14 | }
15 | }
16 | \details{
17 | Equivalent to \code{sapply(x, sum)}, but about 30X faster in some use cases (many small sums). It is often useful in reducers, in the vectorized form.
18 | }
19 | \value{
20 | A numeric vector with the sum of each element of the list provided as argument}
21 |
22 | \note{
23 | See collocations.R in the examples directory.
24 | }
25 |
26 | \seealso{
27 | \code{\link{mapreduce}}
28 | }
29 |
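30 | % A minimal usage sketch (not taken from the original page): summing each
31 | % element of a list of numeric vectors.
32 | \examples{
33 | x = list(1:10, c(0.5, 1.5), rnorm(5))
34 | all.equal(vsum(x), sapply(x, sum))
35 | }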
--------------------------------------------------------------------------------
/pkg/src/Makevars:
--------------------------------------------------------------------------------
1 | BINDIR = inst/hbase-io
2 | .PHONY: all hbase-io
3 |
4 | PKG_CXXFLAGS=`${R_HOME}/bin/Rscript -e "Rcpp:::CxxFlags()"`
5 | PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"`
6 |
7 | all: $(SHLIB) hbase-io
8 |
9 | hbase-io:
10 | ((which hbase && (mkdir -p ../inst; cd hbase-io; sh build_linux.sh; cp build/dist/* ../../inst)) || echo "can't build hbase IO classes, skipping" >&2)
11 |
12 | clean:
13 | echo "not implemented yet"
14 |
--------------------------------------------------------------------------------
/pkg/src/Makevars.win:
--------------------------------------------------------------------------------
1 | BINDIR = inst/bin$(R_ARCH)
2 | .PHONY: all bin
3 |
4 | PKG_CXXFLAGS=`${R_HOME}/bin/Rscript -e "Rcpp:::CxxFlags()"`
5 | PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"`
6 |
7 | all: $(SHLIB) bin
8 |
9 | bin:
10 | # binaries
11 | make --no-print-directory -C catwin -f Makefile\
12 | CC="$(CC)" CFLAGS="$(CFLAGS)"\
13 | CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)"\
14 | BINDIR="$(BINDIR)"\
15 | install-bin
16 |
17 | clean:
18 | ( cd catwin; make clean )
19 | rm -rf ../$(BINDIR)
20 |
--------------------------------------------------------------------------------
/pkg/src/catwin/Makefile:
--------------------------------------------------------------------------------
1 | # see Makeconf for compiler settings
2 | TARGET = catwin
3 |
4 | default: $(TARGET)
5 |
6 | clean:
7 | rm -rf *~ *.o $(TARGET)
8 |
9 | catwin: catwin.c $(OBJS) $(HEADERS)
10 | $(CXX) $(CXXFLAGS) $(LDFLAGS) -o catwin catwin.c $(OBJS) $(LIBS)
11 |
12 | install-bin: $(TARGET)
13 | mkdir -p ../../$(BINDIR)
14 | cp $(TARGET) ../../$(BINDIR)
15 |
--------------------------------------------------------------------------------
/pkg/src/catwin/catwin.c:
--------------------------------------------------------------------------------
1 | /* Copyright 2011 Revolution Analytics
2 | * Copyright (c) 1989, 1993
3 | * The Regents of the University of California. All rights reserved.
4 | *
5 | * This code is derived from software contributed to Berkeley by
6 | * Kevin Fall.
7 | *
8 | * Redistribution and use in source and binary forms, with or without
9 | * modification, are permitted provided that the following conditions
10 | * are met:
11 | * 1. Redistributions of source code must retain the above copyright
12 | * notice, this list of conditions and the following disclaimer.
13 | * 2. Redistributions in binary form must reproduce the above copyright
14 | * notice, this list of conditions and the following disclaimer in the
15 | * documentation and/or other materials provided with the distribution.
16 | * 3. Neither the name of the University nor the names of its contributors
17 | * may be used to endorse or promote products derived from this software
18 | * without specific prior written permission.
19 | *
20 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 | * SUCH DAMAGE.
31 | */
32 |
33 | #include <stdio.h>
34 | #include <stdlib.h>
35 | #include <string.h>
36 | #include <fcntl.h>
37 | #include <io.h>
38 |
39 | #define CAT_BUFSIZ 4096
40 |
41 | int main(int argc, char* argv[])
42 | {
43 | int wfd;
44 | int rfd;
45 | ssize_t nr, nw, off;
46 | static char *buf = NULL;
47 | static char fb_buf[CAT_BUFSIZ];
48 | static size_t bsize;
49 |
50 | rfd = fileno(stdin);
51 | wfd = fileno(stdout);
52 |
53 | setmode(rfd, O_BINARY);
54 | setmode(wfd, O_BINARY);
55 |
56 | buf = fb_buf;
57 | bsize = CAT_BUFSIZ;
58 |
59 | while ((nr = read(rfd, buf, bsize)) > 0)
60 | for (off = 0; nr; nr -= nw, off += nw)
61 | nw = write(wfd, buf + off, (size_t)nr);
62 |
63 | fclose(stdout);
64 | return 0;
65 | }
66 |
--------------------------------------------------------------------------------
/pkg/src/extras.cpp:
--------------------------------------------------------------------------------
1 | //Copyright 2011 Revolution Analytics
2 | //
3 | //Licensed under the Apache License, Version 2.0 (the "License");
4 | //you may not use this file except in compliance with the License.
5 | //You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //Unless required by applicable law or agreed to in writing, software
10 | //distributed under the License is distributed on an "AS IS" BASIS,
11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //See the License for the specific language governing permissions and
13 | //limitations under the License.
14 |
15 | #include "extras.h"
16 | #include <Rcpp.h>
17 | #include <vector>
18 |
19 | SEXP vsum(SEXP xx) {
20 | Rcpp::List _xx (xx);
21 | std::vector<double> results(_xx.size());
22 | for(unsigned int i = 0; i < _xx.size(); i ++) {
23 | std::vector<double> x = Rcpp::as<std::vector<double> >(_xx[i]);
24 | for(unsigned int j = 0; j < x.size(); j++) {
25 | results[i] += x[j];}}
26 | return Rcpp::wrap(results);}
--------------------------------------------------------------------------------
/pkg/src/extras.h:
--------------------------------------------------------------------------------
1 | //Copyright 2011 Revolution Analytics
2 | //
3 | //Licensed under the Apache License, Version 2.0 (the "License");
4 | //you may not use this file except in compliance with the License.
5 | //You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //Unless required by applicable law or agreed to in writing, software
10 | //distributed under the License is distributed on an "AS IS" BASIS,
11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //See the License for the specific language governing permissions and
13 | //limitations under the License.
14 |
15 | #ifndef _RMR_EXTRAS_H
16 | #define _RMR_EXTRAS_H
17 |
18 | #include <Rcpp.h>
19 |
20 | RcppExport SEXP vsum(SEXP xx);
21 |
22 | #endif
23 |
--------------------------------------------------------------------------------
/pkg/src/hbase-io:
--------------------------------------------------------------------------------
1 | ../../hadoopy_hbase/java/
--------------------------------------------------------------------------------
/pkg/src/hbase-to-df.cpp:
--------------------------------------------------------------------------------
1 | //Copyright 2012 Revolution Analytics
2 | //
3 | //Licensed under the Apache License, Version 2.0 (the "License");
4 | //you may not use this file except in compliance with the License.
5 | //You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //Unless required by applicable law or agreed to in writing, software
10 | //distributed under the License is distributed on an "AS IS" BASIS,
11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //See the License for the specific language governing permissions and
13 | //limitations under the License.
14 |
15 | #include "hbase-to-df.h"
16 | typedef std::deque<unsigned char> raw;
17 |
18 | std::string raw_to_string(SEXP source) {
19 | Rcpp::RawVector raw_source(source);
20 | std::string retval(raw_source.size(), 'a');
21 | std::copy(raw_source.begin(), raw_source.end(), retval.begin());
22 | return retval;}
23 |
24 | SEXP raw_list_to_character(SEXP _source) {
25 | Rcpp::List source(_source);
26 | Rcpp::CharacterVector dest(source.size());
27 | for(unsigned int i = 0; i < source.size(); i++) {
28 | dest[i] = raw_to_string(source[i]);}
29 | return Rcpp::wrap(dest);}
30 |
31 | SEXP string_to_raw(std::string source) {
32 | Rcpp::RawVector retval(source.size());
33 | std::copy(source.begin(), source.end(), retval.begin());
34 | return Rcpp::wrap(retval);}
35 |
36 | SEXP p_string_to_raw(SEXP _source) {
37 | std::vector<std::string> source = Rcpp::as<std::vector<std::string> >(_source);
38 | Rcpp::List retval(source.size());
39 | for(unsigned int i = 0; i < source.size(); i++) {
40 | retval[i] = Rcpp::wrap(string_to_raw(source[i]));}
41 | return Rcpp::wrap(retval);}
42 |
43 | SEXP hbase_to_df(SEXP _source, SEXP _dest) {
44 | int l = 0;
45 |
46 | Rcpp::List dest(_dest);
47 | Rcpp::List dest_key = Rcpp::as<Rcpp::List>(dest["key"]);
48 | Rcpp::List dest_family = Rcpp::as<Rcpp::List>(dest["family"]);
49 | Rcpp::List dest_column = Rcpp::as<Rcpp::List>(dest["column"]);
50 | Rcpp::List dest_cell = Rcpp::as<Rcpp::List>(dest["cell"]);
51 |
52 | Rcpp::List source(_source);
53 | Rcpp::List key1 = Rcpp::as<Rcpp::List>(source["key"]);
54 | Rcpp::List val1 = Rcpp::as<Rcpp::List>(source["val"]);
55 |
56 | for(unsigned int i = 0; i < key1.size(); i ++) {
57 | Rcpp::List val1_i = Rcpp::as<Rcpp::List>(val1[i]);
58 | Rcpp::List key2 = Rcpp::as<Rcpp::List>(val1_i["key"]);
59 | Rcpp::List val2 = Rcpp::as<Rcpp::List>(val1_i["val"]);
60 | for(unsigned int j = 0; j < key2.size(); j++) {
61 | Rcpp::List val2_j = Rcpp::as<Rcpp::List>(val2[j]);
62 | Rcpp::List key3 = Rcpp::as<Rcpp::List>(val2_j["key"]);
63 | Rcpp::List val3 = Rcpp::as<Rcpp::List>(val2_j["val"]);
64 | for(unsigned int k = 0; k < key3.size(); k++) {
65 | dest_family[l] = Rcpp::wrap(key2[j]);
66 | dest_column[l] = Rcpp::wrap(key3[k]);
67 | dest_key[l] = Rcpp::wrap(key1[i]);
68 | dest_cell[l] = Rcpp::wrap(val3[k]);
69 | l++;}}}
70 | return Rcpp::wrap(
71 | Rcpp::List::create(
72 | Rcpp::Named("data.frame") = Rcpp::wrap(_dest),
73 | Rcpp::Named("nrows") = Rcpp::wrap(l)));}
74 |
--------------------------------------------------------------------------------
/pkg/src/hbase-to-df.h:
--------------------------------------------------------------------------------
1 | //Copyright 2012 Revolution Analytics
2 | //
3 | //Licensed under the Apache License, Version 2.0 (the "License");
4 | //you may not use this file except in compliance with the License.
5 | //You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //Unless required by applicable law or agreed to in writing, software
10 | //distributed under the License is distributed on an "AS IS" BASIS,
11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //See the License for the specific language governing permissions and
13 | //limitations under the License.
14 |
15 | #ifndef _RMR_HBASE_TO_DF_H
16 | #define _RMR_HBASE_TO_DF_H
17 |
18 | #include <Rcpp.h>
19 |
20 |
21 | RcppExport SEXP hbase_to_df(SEXP _source, SEXP _dest);
22 | RcppExport SEXP p_string_to_raw(SEXP _source);
23 | RcppExport SEXP raw_list_to_character(SEXP _source);
24 |
25 | #endif
26 |
--------------------------------------------------------------------------------
/pkg/src/keyval.cpp:
--------------------------------------------------------------------------------
1 | //Copyright 2013 Revolution Analytics
2 | //
3 | //Licensed under the Apache License, Version 2.0 (the "License");
4 | //you may not use this file except in compliance with the License.
5 | //You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //Unless required by applicable law or agreed to in writing, software
10 | //distributed under the License is distributed on an "AS IS" BASIS,
11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //See the License for the specific language governing permissions and
13 | //limitations under the License.
14 |
15 | #include "keyval.h"
16 | #include <vector>
17 | #include <string>
18 |
19 | using namespace Rcpp;
20 |
21 | SEXP null_purge(SEXP xx) {
22 | List _xx(xx);
23 | int n = _xx.size();
24 | int not_null_count = 0;
25 | for (int i = 0; i < n; i ++)
26 | if (!Rf_isNull(_xx[i])) not_null_count++;
27 | List yy(not_null_count);
28 | for (int i = 0, j = 0; i < n; i ++)
29 | if (!Rf_isNull(_xx[i])){
30 | yy[j] = _xx[i];
31 | j++;}
32 | return wrap(yy);}
33 |
34 | SEXP lapply_as_character(SEXP xx) {
35 | List _xx(xx);
36 | List yy(_xx.size());
37 | for (int i = 0; i < _xx.size(); i ++)
38 | yy[i] = Rf_asCharacterFactor(_xx[i]);
39 | return wrap(yy);}
40 |
41 | int rmr_length(SEXP x) {
42 | if(Rf_isMatrix(x))
43 | return Rf_nrows(x);
44 | RObject _x(x);
45 | if (_x.hasAttribute("class")) {
46 | if(as<std::string>(_x.attr("class")) == "data.frame") {
47 | List __x(x);
48 | if(Rf_length(__x) == 0) {
49 | return(0);}
50 | else {
51 | return(Rf_length(__x[0]));}}}
52 | return Rf_length(x);}
53 |
54 | SEXP sapply_rmr_length(SEXP xx) {
55 | List _xx(xx);
56 | std::vector<int> results(_xx.size());
57 | for(unsigned int i = 0; i < _xx.size(); i++) {
58 | results[i] = rmr_length(_xx[i]);}
59 | return(wrap(results));}
60 |
61 | SEXP sapply_rmr_length_lossy_data_frame(SEXP xx){
62 | List _xx(xx);
63 | std::vector<int> results(_xx.size());
64 | for(unsigned int i = 0; i < _xx.size(); i++) {
65 | List cols(as<List>(_xx[i]));
66 | results[i] = rmr_length(cols[0]);}
67 | return wrap(results);}
68 |
69 | int length_keyval(SEXP kv) {
70 | List kv_(kv);
71 | int kl = rmr_length(kv_["key"]);
72 | int vl = rmr_length(kv_["val"]);
73 | if (kl > vl) return(kl);
74 | return(vl);}
75 |
76 | SEXP sapply_length_keyval(SEXP kvs) {
77 | List _kvs(kvs);
78 | std::vector<int> results(_kvs.size());
79 | for(unsigned int i = 0; i < _kvs.size(); i++) {
80 | results[i] = length_keyval(_kvs[i]);}
81 | return(wrap(results));}
82 |
83 | SEXP sapply_null_keys(SEXP kvs) {
84 | List _kvs(kvs);
85 | std::vector<bool> results(_kvs.size());
86 | for(unsigned int i = 0; i < _kvs.size(); i++) {
87 | List kv(wrap(_kvs[i]));
88 | results[i] = Rf_isNull(kv["key"]);}
89 | return(wrap(results));}
90 |
91 | SEXP sapply_is_list(SEXP l) {
92 | List _l(l);
93 | std::vector<bool> results(_l.size());
94 | for(unsigned int i = 0; i < _l.size(); i++) {
95 | results[i] = (as<RObject>(_l[i]).sexp_type() == VECSXP);}
96 | return wrap(results);}
97 |
98 | SEXP lapply_key_val(SEXP kvs, std::string slot) {
99 | List _kvs(kvs);
100 | List results(_kvs.size());
101 | for(unsigned int i = 0; i < _kvs.size(); i++) {
102 | List kv(wrap(_kvs[i]));
103 | results[i] = kv[slot];}
104 | return wrap(results);}
105 |
106 | SEXP lapply_keys(SEXP kvs) {
107 | return lapply_key_val(kvs, "key");}
108 |
109 | SEXP lapply_values(SEXP kvs) {
110 | return lapply_key_val(kvs, "val");}
111 |
112 | SEXP are_factor(SEXP xx) {
113 | List _xx(xx);
114 | std::vector<bool> results(_xx.size());
115 | for(unsigned int i = 0; i < _xx.size(); i++) {
116 | results[i] = Rf_isFactor(_xx[i]);}
117 | return wrap(results);}
118 |
119 | bool is_data_frame(SEXP x) {
120 | RObject _x(x);
121 | if (_x.hasAttribute("class")) {
122 | if(as<std::string>(_x.attr("class")) == "data.frame") {
123 | return true;}}
124 | return false;}
125 |
126 | SEXP are_data_frame(SEXP xx) {
127 | List _xx(xx);
128 | std::vector<bool> results(_xx.size());
129 | for(unsigned int i = 0; i < _xx.size(); i++) {
130 | results[i] = is_data_frame(_xx[i]);}
131 | return wrap(results);}
132 |
133 | SEXP are_matrix(SEXP xx) {
134 | List _xx(xx);
135 | std::vector<bool> results(_xx.size());
136 | for(unsigned int i = 0; i < _xx.size(); i++) {
137 | results[i] = Rf_isMatrix(_xx[i]);}
138 | return wrap(results);}
--------------------------------------------------------------------------------
/pkg/src/keyval.h:
--------------------------------------------------------------------------------
1 | //Copyright 2013 Revolution Analytics
2 | //
3 | //Licensed under the Apache License, Version 2.0 (the "License");
4 | //you may not use this file except in compliance with the License.
5 | //You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //Unless required by applicable law or agreed to in writing, software
10 | //distributed under the License is distributed on an "AS IS" BASIS,
11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //See the License for the specific language governing permissions and
13 | //limitations under the License.
14 |
15 | #ifndef _RMR_KEYVAL_H
16 | #define _RMR_KEYVAL_H
17 |
18 | #include <Rcpp.h>
19 |
20 | RcppExport SEXP null_purge(SEXP xx);
21 | RcppExport SEXP lapply_as_character(SEXP xx);
22 | RcppExport SEXP sapply_rmr_length(SEXP xx);
23 | RcppExport SEXP sapply_rmr_length_lossy_data_frame(SEXP xx);
24 | RcppExport SEXP sapply_length_keyval(SEXP kvs);
25 | RcppExport SEXP sapply_null_keys(SEXP kvs);
26 | RcppExport SEXP sapply_is_list(SEXP l);
27 | RcppExport SEXP lapply_keys(SEXP kvs);
28 | RcppExport SEXP lapply_values(SEXP kvs);
29 | RcppExport SEXP are_factor(SEXP xx);
30 | RcppExport SEXP are_data_frame(SEXP xx);
31 | RcppExport SEXP are_matrix(SEXP xx);
32 |
33 | #endif
34 |
--------------------------------------------------------------------------------
/pkg/src/t-list.cpp:
--------------------------------------------------------------------------------
1 | //Copyright 2013 Revolution Analytics
2 | //
3 | //Licensed under the Apache License, Version 2.0 (the "License");
4 | //you may not use this file except in compliance with the License.
5 | //You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //Unless required by applicable law or agreed to in writing, software
10 | //distributed under the License is distributed on an "AS IS" BASIS,
11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //See the License for the specific language governing permissions and
13 | //limitations under the License.
14 |
15 | #include "t-list.h"
16 |
17 | using namespace Rcpp;
18 | using std::vector;
19 | using std::cerr;
20 | using std::endl;
21 |
22 |
23 | SEXP t_list(SEXP _ll) {
24 | List ll(_ll);
25 | List l_0(as<List>(ll[0]));
26 | List tll(l_0.size());
27 | for(unsigned int j = 0; j < tll.size(); j++)
28 | tll[j] = List(ll.size());
29 | for(unsigned int i = 0; i < ll.size(); i++) {
30 | List l_i(as<List>(ll[i]));
31 | for(unsigned int j = 0; j < tll.size(); j++) {
32 | as<List>(tll[j])[i] = l_i[j];};}
33 | return wrap(tll);}
--------------------------------------------------------------------------------
/pkg/src/t-list.h:
--------------------------------------------------------------------------------
1 | //Copyright 2013 Revolution Analytics
2 | //
3 | //Licensed under the Apache License, Version 2.0 (the "License");
4 | //you may not use this file except in compliance with the License.
5 | //You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //Unless required by applicable law or agreed to in writing, software
10 | //distributed under the License is distributed on an "AS IS" BASIS,
11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //See the License for the specific language governing permissions and
13 | //limitations under the License.
14 |
15 | #ifndef _RMR_TLIST_H
16 | #define _RMR_TLIST_H
17 |
18 | #include <Rcpp.h>
19 |
20 |
21 | RcppExport SEXP t_list(SEXP _ll);
22 |
23 |
24 | #endif
--------------------------------------------------------------------------------
/pkg/src/typed-bytes.h:
--------------------------------------------------------------------------------
1 | //Copyright 2011 Revolution Analytics
2 | //
3 | //Licensed under the Apache License, Version 2.0 (the "License");
4 | //you may not use this file except in compliance with the License.
5 | //You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | //Unless required by applicable law or agreed to in writing, software
10 | //distributed under the License is distributed on an "AS IS" BASIS,
11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | //See the License for the specific language governing permissions and
13 | //limitations under the License.
14 |
15 | #ifndef _RMR_TYPEDBYTES_H
16 | #define _RMR_TYPEDBYTES_H
17 |
18 | #include <Rcpp.h>
19 |
20 |
21 | RcppExport SEXP typedbytes_reader(SEXP data);
22 | RcppExport SEXP typedbytes_writer(SEXP data, SEXP native);
23 |
24 | #endif
25 |
--------------------------------------------------------------------------------
/pkg/tests/IO.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | ## test for typed bytes read/write
16 | library(quickcheck)
17 | library(rmr2)
18 |
19 |
20 | # test(
21 | # function(l) {
22 | # l = rapply(l, how = 'replace',
23 | # function(x){
24 | # if(is.null(x)) list()
25 | # else as.list(x)})
26 | # isTRUE(all.equal(l,
27 | # rmr2:::typedbytes.reader(rmr2:::typedbytes.writer(l), length(l) + 5)$objects,
28 | # check.attributes = FALSE))},
29 | # generators = list(rlist))
30 |
--------------------------------------------------------------------------------
/pkg/tests/avro.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | library(rmr2)
16 | library(testthat)
17 | library(ravro)
18 | library(rhdfs)
19 | hdfs.init()
20 |
21 | rmr.options(backend = "hadoop")
22 |
23 | pathname = ravro::AVRO_TOOLS
24 | if(.Platform$OS.type == "windows") {
25 | subfname = strsplit(pathname, ":")
26 | if(length(subfname[[1]]) > 1)
27 | {
28 | pathname = subfname[[1]][2]
29 | }
30 | pathname = gsub("\"","",pathname)
31 | pathname = shortPathName(pathname)
32 | pathname = gsub("\\\\","/",pathname)}
33 | Sys.setenv(AVRO_LIBS = pathname)
34 |
35 | test_avro_rmr <-
36 | function(df, test, write.args = list(),
37 | input.format.args = list(), map = function(k, v) v ) {
38 | if(rmr.options("backend") == "local") TRUE
39 | else {
40 | tf1 = tempfile(fileext = ".avro")
41 | expect_true(do.call(ravro:::write.avro, c(list(df, tf1), write.args)))
42 | tf2 = "/tmp/rmr2.test"
43 | tf3 = file.path(tf2, "data.avro")
44 | hdfs.mkdir(tf2)
45 | hdfs.put(tf1, tf3)
46 | on.exit(hdfs.rmr(tf2))
47 | df.input.format <- do.call(make.input.format,
48 | c(list(
49 | format = "avro",
50 | schema.file = tf1),
51 | input.format.args))
52 | retdf <- values(
53 | from.dfs(
54 | mapreduce(
55 | tf2,
56 | map = map,
57 | input.format = df.input.format)))
58 | retdf <- retdf[row.names(df), ]
59 | attributes(retdf) <- attributes(retdf)[names(attributes(df))]
60 | test(retdf)
61 | }}
62 |
63 | expect_equal_avro_rmr <- function(df, ...){
64 | row.names(df) <- row.names(df) # rmr2 uses row.names function which coerces to character
65 | # We need to make sure row.names for x is character or else this will always fail
66 | test_avro_rmr(df, function(x)expect_equal(x, df), ...)
67 | }
68 |
69 | expect_equivalent_avro_rmr <- function(df, ...)
70 | test_avro_rmr(df, function(x)expect_equivalent(x, df), ...)
71 |
72 | d <- data.frame(x = 1,
73 | y = as.factor(1:10),
74 | fac = as.factor(sample(letters[1:3], 10, replace = TRUE)))
75 | expect_equivalent_avro_rmr(d)
76 |
77 |
78 | ##########################################################################################
79 |
80 | context("Basic Avro Read/Write")
81 |
82 | ### Handling Factors
83 | # Warnings: Factor levels converted to valid Avro names
84 |
85 | test_that("Handling factors", {
86 | # Factors with non-"name" levels should still work
87 | d <- data.frame(x = 1,
88 | y = as.factor(1:10),
89 | fac = as.factor(sample(letters[1:3], 10, replace = TRUE)))
90 | expect_equivalent_avro_rmr(d) #order of levels can change
91 | })
92 |
93 |
94 | ### Type Translation
95 |
96 | test_that("type translation", {
97 | # All types should translate successfully
98 | L3 <- LETTERS[1:3]
99 | fac <- sample(L3, 10, replace = TRUE)
100 | d <- data.frame(x = 1, y = 1:10, fac = fac, b = rep(c(TRUE, FALSE), 5), c = rep(NA, 10),
101 | stringsAsFactors = FALSE)
102 | expect_equal_avro_rmr(d)
103 |
104 | d <- data.frame(x = 1, y = 1:10, fac = factor(fac, levels = L3),
105 | b = rep(c(TRUE, FALSE), 5), c = rep(NA, 10),
106 | stringsAsFactors = FALSE)
107 | expect_equivalent_avro_rmr(d)
108 | })
109 |
110 | ### write can handle missing values
111 |
112 | test_that("write can handle missing values", {
113 | # NA column (entirely "null" in Avro)
114 | d <- data.frame(x = 1,
115 | y = 1:10,
116 | b = rep(c(TRUE, FALSE), 5),
117 | c = rep(NA, 10),
118 | stringsAsFactors = FALSE)
119 | expect_equal_avro_rmr(d)
120 |
121 | # NA row (entirely "null" in Avro)
122 | d <- rbind(data.frame(x = 1,
123 | y = 1:10,
124 | b = rep(c(TRUE, FALSE), 5)),
125 | rep(NA, 3))
126 | expect_equal_avro_rmr(d)
127 | })
128 |
129 | ### NaNs throw warning
130 |
131 | test_that("NaNs throw warning", {
132 | # NaN row (entirely "null" in Avro)
133 | d <- rbind(data.frame(x = 1,
134 | y = 1:10,
135 | b = rep(c(TRUE, FALSE), 5)),
136 | rep(NaN, 3))
137 | d[nrow(d), ] <- NA
138 | expect_equal_avro_rmr(d)
139 |
140 | # NaN row (entirely "null" in Avro)
141 | d <- cbind(data.frame(x = 1,
142 | y = 1:10,
143 | b = rep(c(TRUE, FALSE), 5)),
144 | c = rep(NaN, 10))
145 | d[, ncol(d)] <- as.numeric(NA) # coerce this type
146 | expect_equal_avro_rmr(d)
147 | })
148 |
149 | ### write.avro throws error on infinite values
150 | ## Infinite values cannot be serialized to Avro (which is good, and what this test verifies)
151 |
152 | test_that("write.avro throws error on infinite values", {
153 | d <- rbind(data.frame(x = 1, y = 1:10, b = rep(c(TRUE, FALSE), 5)), rep(NA, 3),
154 | c(Inf, 11, TRUE, NA))
155 | expect_that(expect_equal_avro_rmr(d), throws_error())
156 |
157 | d <- rbind(data.frame(x = 1, y = 1:10, b = rep(c(TRUE, FALSE), 5)), rep(NA, 3),
158 | c(-Inf, 11, TRUE, NA))
159 | expect_that(expect_equal_avro_rmr(d), throws_error())
160 | })
161 |
162 | ############################ Read/Write mtcars and iris ###############################
163 |
164 | context("Read/Write mtcars and iris")
165 |
166 | ### mtcars round trip
167 |
168 | test_that("mtcars round trip", {
169 | expect_equal_avro_rmr(mtcars)
170 | })
171 |
172 |
173 | ### factor levels that are not Avro names read/write
174 | ## mttmp equivalent despite refactorization (good, warnings)
175 | # 1: In (function (x, name = NULL, namespace = NULL, is.union = F, row.names = T, :
176 | # Factor levels converted to valid Avro names: _3_ravro, _4_ravro, _5_ravro
177 |
178 | test_that("factors level that are not Avro names read/write", {
179 | mttmp <- mtcars
180 | mttmp$gear_factor <- as.factor(mttmp$gear)
181 | expect_equal_avro_rmr(mttmp)
182 | })
183 |
184 |
185 | ### iris round trip
186 | ## iris_avro not equivalent
187 | # Length mismatch: comparison on first 3 components
188 |
189 | test_that("iris round trip", {
190 | # This doesn't work, because rmr2::from.dfs uses rbind to combine the values together
191 | #expect_equal_avro_rmr(iris, write.args = list(unflatten = T), input.format.args = list(flatten = F))
192 |
193 | expect_equal_avro_rmr(iris, write.args = list(unflatten = T), input.format.args = list(flatten = T))
194 | })
195 |
196 |
--------------------------------------------------------------------------------
/pkg/tests/basic-examples.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # lapply like job, first intro
16 |
17 | library(rmr2)
18 |
19 | for (be in c("local", "hadoop")) {
20 | rmr.options(backend = be)
21 | ## @knitr lapply
22 | small.ints = 1:1000
23 | sapply(small.ints, function(x) x^2)
24 | ## @knitr lapply-mapreduce
25 | small.ints = to.dfs(1:1000)
26 | mapreduce(
27 | input = small.ints,
28 | map = function(k, v) cbind(v, v^2))
29 | ## @knitr end
30 | from.dfs(
31 | mapreduce(
32 | input = small.ints,
33 | map = function(k, v) cbind(v, v^2)))
34 |
35 | # tapply like job
36 | ## @knitr tapply
37 | groups = rbinom(32, n = 50, prob = 0.4)
38 | tapply(groups, groups, length)
39 | ## @knitr tapply-mapreduce
40 | groups = to.dfs(groups)
41 | from.dfs(
42 | mapreduce(
43 | input = groups,
44 | map = function(., v) keyval(v, 1),
45 | reduce =
46 | function(k, vv)
47 | keyval(k, length(vv))))
48 | ## @knitr end
49 |
50 | ## input can be any rmr-native format file
51 | ## pred can be function(x) x > 0
52 | ## it will be evaluated on the value only, not on the key
53 |
54 | ## @knitr basic.examples-filter
55 | filter.map =
56 | function(pred)
57 | function(., v) {v[pred(v)]}
58 |
59 | mrfilter =
60 | function (input,
61 | output = NULL,
62 | pred) {
63 | mapreduce(
64 | input = input,
65 | output = output,
66 | map = filter.map(pred))}
67 |
68 | filtertest = to.dfs(rnorm(10))
69 | from.dfs(
70 | mrfilter(
71 | input = filtertest,
72 | pred = function(x) x > 0))
73 | }
74 | ## @knitr end
75 |
76 | ## pipeline of two filters, sweet
77 | # from.dfs(mrfilter(input = mrfilter(
78 | # input = "/tmp/filtertest/",
79 | # pred = function(x) x > 0),
80 | # pred = function(x) x < 0.5))
81 |
--------------------------------------------------------------------------------
/pkg/tests/basic.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # the problem with the tests here is that they are more complex than the function they are meant to test
16 | # or just a duplication. That's not very useful.
17 |
18 | library(rmr2)
19 | library(quickcheck)
20 |
21 | #qw
22 | test(
23 | function(ss = rcharacter()) {
24 | ss = paste("v", ss, sep = "")
25 | all(ss == eval(parse(text = paste("rmr2:::qw(", paste(ss, collapse = ","), ")"))))})
26 |
27 | # Make.single.arg
28 | test(
29 | function(l = rlist()) {
30 | f = function(...) list(...)
31 | g = rmr2:::Make.single.arg(f)
32 | identical(do.call(f, l), g(l))})
33 |
34 | # Make.multi.arg
35 | test(
36 | function(l = rlist()) {
37 | f = function(x) x
38 | g = rmr2:::Make.multi.arg(f)
39 | identical(do.call(g, l), f(l))})
40 |
41 | # Make.single.or.multi.arg
42 | test(
43 | function(
44 | l = rlist(size = c(min = 2)),
45 | arity = sample(c("single", "multi"), size = 1)) {
46 | f = if(arity == "single") unlist else c
47 | g = rmr2:::Make.single.or.multi.arg(f, from = arity)
48 | identical(g(l), do.call(g, l))})
49 |
50 | #%:% TODO
51 | # all.predicate TODO
52 |
53 | # make.fast.list TODO
54 | # actually the function has been working forever; the test doesn't
55 |
56 | # test(
57 | # function(l){
58 | # fl = rmr2:::make.fast.list()
59 | # lapply(l, fl)
60 | # print(x=as.list(do.call(c, l)))
61 | # print(x=fl())
62 | # identical(as.list(do.call(c, l)), fl())},
63 | # list(Curry(rlist,lambda=1, max.level=8)))
64 | #
65 |
66 | #named.slice TODO
67 | #interleave TODO
68 |
69 |
--------------------------------------------------------------------------------
/pkg/tests/benchmarks.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | library(rmr2)
16 |
17 | report = list()
18 | for (be in c("local", "hadoop")) {
19 | rmr.options(backend = be)
20 | ## @knitr input
21 | input.size = {
22 | if(rmr.options('backend') == "local")
23 | 10^4
24 | else
25 | 10^6}
26 | ## @knitr end
27 | report[[be]] =
28 | rbind(
29 | report[[be]],
30 | write =
31 | system.time({
32 | ## @knitr write
33 | input = to.dfs(1:input.size)
34 | ## @knitr end
35 | }))
36 |
37 | report[[be]] =
38 | rbind(
39 | report[[be]],
40 | read =
41 | system.time({
42 | out =
43 | ## @knitr read
44 | from.dfs(input)
45 | ## @knitr end
46 | }))
47 | stopifnot(
48 | all(
49 | 1:input.size == sort(values(out))))
50 |
51 | report[[be]] =
52 | rbind(
53 | report[[be]],
54 | pass.through = system.time({
55 | out =
56 | ## @knitr pass-through
57 | mapreduce(
58 | input,
59 | map = function(k, v) keyval(k, v))
60 | ## @knitr end
61 | }))
62 | stopifnot(
63 | all(
64 | 1:input.size ==
65 | sort(values(from.dfs(out)))))
66 |
67 | ## @knitr predicate
68 | predicate =
69 | function(., v) v%%2 == 0
70 | ## @knitr end
71 | report[[be]] =
72 | rbind(
73 | report[[be]],
74 | filter = system.time({
75 | out =
76 | ## @knitr filter
77 | mapreduce(
78 | input,
79 | map =
80 | function(k, v) {
81 | filter = predicate(k, v)
82 | keyval(k[filter], v[filter])})
83 | ## @knitr end
84 | }))
85 | stopifnot(
86 | all(
87 | 2*(1:(input.size/2)) ==
88 | sort(values(from.dfs(out)))))
89 |
90 | ## @knitr select-input
91 | input.select =
92 | to.dfs(
93 | data.frame(
94 | a = rnorm(input.size),
95 | b = 1:input.size,
96 | c = sample(as.character(1:10),
97 | input.size,
98 | replace=TRUE)))
99 | ## @knitr end
100 | report[[be]] =
101 | rbind(
102 | report[[be]],
103 | select = system.time({
104 | out =
105 | ## @knitr select
106 | mapreduce(input.select,
107 | map = function(., v) v$b)
108 | ## @knitr end
109 | }))
110 | stopifnot(
111 | all(
112 | 1:input.size ==
113 | sort(values(from.dfs(out)))))
114 |
115 | ## @knitr bigsum-input
116 | set.seed(0)
117 | big.sample = rnorm(input.size)
118 | input.bigsum = to.dfs(big.sample)
119 | ## @knitr end
120 | report[[be]] =
121 | rbind(
122 | report[[be]],
123 | bigsum = system.time({
124 | out =
125 | ## @knitr bigsum
126 | mapreduce(
127 | input.bigsum,
128 | map =
129 | function(., v) keyval(1, sum(v)),
130 | reduce =
131 | function(., v) keyval(1, sum(v)),
132 | combine = TRUE)
133 | ## @knitr end
134 | }))
135 | stopifnot(
136 | isTRUE(
137 | all.equal(
138 | sum(values(from.dfs(out))),
139 | sum(big.sample),
140 | tolerance=.000001)))
141 | ## @knitr group-aggregate-input
142 | input.ga =
143 | to.dfs(
144 | cbind(
145 | 1:input.size,
146 | rnorm(input.size)))
147 | ## @knitr group-aggregate-functions
148 | group = function(x) x%%10
149 | aggregate = function(x) sum(x)
150 | ## @knitr end
151 | report[[be]] =
152 | rbind(
153 | report[[be]],
154 | group.aggregate = system.time({
155 | out =
156 | ## @knitr group-aggregate
157 | mapreduce(
158 | input.ga,
159 | map =
160 | function(k, v)
161 | keyval(group(v[,1]), v[,2]),
162 | reduce =
163 | function(k, vv)
164 | keyval(k, aggregate(vv)),
165 | combine = TRUE)
166 | ## @knitr end
167 | }))
168 | log.input.size = log10(input.size)
169 | z = plyr::splat(rbind)(
170 | c(
171 | lapply(0:log.input.size, function(i) system.time(to.dfs(keyval(data.frame(1:10^i), data.frame(1:10^log.input.size))))),
172 | lapply(0:log.input.size, function(i) {z = to.dfs(keyval(data.frame(1:10^i), data.frame(1:10^log.input.size))); system.time(from.dfs(z))}),
173 | lapply(0:log.input.size, function(i) {z = to.dfs(keyval(data.frame(1:10^i), data.frame(1:10^log.input.size))); system.time(mapreduce(z))}),
174 | lapply(0:(log.input.size-2), function(i) {z = to.dfs(keyval(data.frame(1:10^i), data.frame(1:10^log.input.size)));
175 | system.time(mapreduce(z, reduce = function(k,v) as.data.frame(t(colSums(v)))))})))
176 | row.names(z) = make.names(t(outer(c("to.dfs","from.dfs", "map only", "map reduce"), c(0:log.input.size), paste)))[1:(4*(1 + log.input.size) - 2)]
177 | report[[be]] = rbind(report[[be]], z)
178 | }
179 |
180 |
181 | print(report)
182 |
183 |
--------------------------------------------------------------------------------
/pkg/tests/getting-data-in-and-out.R:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # partly from community-contributed examples (with code)
16 | # additional copyrights may apply
17 |
18 | library(rmr2)
19 | ## @knitr getting-data.IO.formats
20 | rmr2:::IO.formats
21 | ## @knitr getting-data.make.input.format.csv
22 | make.input.format("csv")
23 | ## @knitr getting-data.make.output.format.csv
24 | make.output.format("csv")
25 | ## @knitr getting-data.generic.list
26 | my.data = list(TRUE, list("nested list", 7.2), seq(1:3), letters[1:4], matrix(1:25, nrow = 5,ncol = 5))
27 | ## @knitr getting-data.to.dfs
28 | hdfs.data = to.dfs(my.data)
29 | ## @knitr getting-data.object.length.frequency
30 | result = mapreduce(
31 | input = hdfs.data,
32 | map = function(k, v) keyval(lapply(v, length), 1),
33 | reduce = function(k, vv) keyval(k, sum(vv)))
34 |
35 | from.dfs(result)
36 | ## @knitr end
37 | ## @knitr getting-data.tsv.reader
38 | tsv.reader = function(con){
39 | lines = readLines(con, 1000)
40 | if(length(lines) == 0)
41 | NULL
42 | else {
43 | delim = strsplit(lines, split = "\t")
44 | keyval(
45 | sapply(delim,
46 | function(x) x[1]),
47 | sapply(delim,
48 | function(x) x[-1]))}}
49 | ## the first column is the key; note that the remaining column indexes are shifted by 1
50 | ## @knitr getting-data.tsv.input.format
51 | tsv.format =
52 | make.input.format(
53 | format = tsv.reader,
54 | mode = "text")
55 | ## @knitr getting-data.generate.tsv.data
56 |
57 | tsv.data =
58 | to.dfs(
59 | data.frame(
60 | x = 1:100,
61 | y = rnorm(100),
62 | z = runif(100),
63 | w = 1:100),
64 | format =
65 | make.output.format("csv", sep = "\t"))
66 | ## @knitr getting-data.frequency.count
67 | freq.counts =
68 | mapreduce(
69 | input = tsv.data,
70 | input.format = tsv.format,
71 | map = function(k, v) keyval(v[1,], 1),
72 | reduce = function(k, vv) keyval(k, sum(vv)))
73 | ## @knitr getting-data.named.columns
74 | tsv.reader =
75 | function(con){
76 | lines = readLines(con, 1000)
77 | if(length(lines) == 0)
78 | NULL
79 | else {
80 | delim = strsplit(lines, split = "\t")
81 | keyval(
82 | sapply(delim, function(x) x[1]),
83 | data.frame(
84 | location = sapply(delim, function(x) x[2]),
85 | name = sapply(delim, function(x) x[3]),
86 | value = sapply(delim, function(x) x[4])))}}
87 |
88 | ## @knitr getting-data.tsv.input.format.1
89 | tsv.format =
90 | make.input.format(
91 | format = tsv.reader,
92 | mode = "text")
93 | ## @knitr getting-data.named.column.access
94 | freq.counts =
95 | mapreduce(
96 | input = tsv.data,
97 | input.format = tsv.format,
98 | map =
99 | function(k, v) {
100 | filter = (v$name == "blarg")
101 | keyval(k[filter], log(as.numeric(v$value[filter])))},
102 | reduce = function(k, vv) keyval(k, mean(vv)))
103 | ## @knitr getting-data.csv.output
104 | csv.writer = function(kv, con){
105 | cat(
106 | paste(
107 | apply(cbind(1:32, mtcars),
108 | 1,
109 | paste, collapse = ","),
110 | collapse = "\n"),
111 | file = con)}
112 | ## @knitr getting-data.csv.output.simpler
113 | csv.format = make.output.format("csv", sep = ",")
114 | ## @knitr getting-data.explicit.output.arg
115 | mapreduce(
116 | input = hdfs.data,
117 | output = tempfile(),
118 | output.format = csv.format,
119 | map = function(k, v){
120 | # complicated function here
121 | keyval(1, v)},
122 | reduce = function(k, vv) {
123 | #complicated function here
124 | keyval(k, vv[[1]])})
125 | ## @knitr getting-data.create.fields.list
126 | fields <- rmr2:::qw(mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb)
127 | field.size = 8
128 | ## @knitr getting-data.fwf.reader
129 | fwf.reader <- function(con) {
130 | lines <- readLines(con, 1000)
131 | if (length(lines) == 0) {
132 | NULL}
133 | else {
134 | split.lines = unlist(strsplit(lines, ""))
135 | df =
136 | as.data.frame(
137 | matrix(
138 | sapply(
139 | split(
140 | split.lines,
141 | ceiling(1:length(split.lines)/field.size)),
142 | paste, collapse = ""),
143 | ncol = length(fields), byrow = TRUE))
144 | names(df) = fields
145 | keyval(NULL, df)}}
146 | fwf.input.format = make.input.format(mode = "text", format = fwf.reader)
147 | ## @knitr getting-data.fwf.writer
148 | fwf.writer <- function(kv, con) {
149 | ser =
150 | function(df)
151 | paste(
152 | apply(
153 | df,
154 | 1,
155 | function(x)
156 | paste(
157 | format(
158 | x,
159 | width = field.size),
160 | collapse = "")),
161 | collapse = "\n")
162 | out = ser(values(kv))
163 | writeLines(out, con = con)}
164 | fwf.output.format = make.output.format(mode = "text", format = fwf.writer)
165 | ## @knitr getting-data.generate.fwf.data
166 | fwf.data <- to.dfs(mtcars, format = fwf.output.format)
167 | ## @knitr getting-data.from.dfs.one.line
168 | out <- from.dfs(mapreduce(input = fwf.data,
169 | input.format = fwf.input.format))
170 | out$val
171 | ## @knitr getting-data.cyl.frequency.count
172 | out <- from.dfs(mapreduce(input = fwf.data,
173 | input.format = fwf.input.format,
174 | map = function(key, value) keyval(value[,"cyl"], 1),
175 | reduce = function(key, value) keyval(key, sum(unlist(value))),
176 | combine = TRUE))
177 | df <- data.frame(out$key, out$val)
178 | names(df) <- c("cyl","count")
179 | df
180 | ## @knitr end
181 |
182 |
--------------------------------------------------------------------------------
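To make the "column indexes shifted by 1" note above concrete, here is a small self-contained sketch (an addition, not part of the original file) of the first tsv.reader applied to a textConnection standing in for the stream rmr2 would pass it:

library(rmr2)
tsv.reader = function(con){
  lines = readLines(con, 1000)
  if(length(lines) == 0)
    NULL
  else {
    delim = strsplit(lines, split = "\t")
    keyval(
      sapply(delim, function(x) x[1]),
      sapply(delim, function(x) x[-1]))}}
con = textConnection(c("k1\t1.5\t2.5", "k2\t3.5\t4.5"))
kv = tsv.reader(con)
close(con)
keys(kv)    # "k1" "k2": the first field of each line becomes the key
values(kv)  # the remaining fields; the file's second column is now the first value column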
/pkg/tests/keyval.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | library(quickcheck)
16 | library(rmr2)
17 |
18 | #has.rows
19 | test(
20 | function(x = rmr2:::rrmr.data()) {
21 | is.null(nrow(x)) == !rmr2:::has.rows(x)})
22 |
23 | #all.have rows TODO
24 | #rmr.length TODO
25 |
26 | #keyval, keys.values
27 | test(
28 | function(k = rmr2:::rrmr.data(size = c(min = 1)), v = rmr2:::rrmr.data(size = ~rmr2:::rmr.length(k))){
29 | kv = keyval(k, v)
30 | identical(keys(kv), k) &&
31 | identical(values(kv), v)})
32 |
33 | #NULL key case
34 | test(
35 | function(v = rmr2:::rrmr.data(size = c(min = 1))){
36 | k = NULL
37 | kv = keyval(k, v)
38 | identical(keys(kv), k) &&
39 | identical(values(kv), v)})
--------------------------------------------------------------------------------
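The property the keyval tests above assert can be seen on a concrete pair; a small usage sketch, assuming only that rmr2 is attached:

library(rmr2)
kv = keyval(c("a", "b", "c"), 1:3)
keys(kv)    # "a" "b" "c"
values(kv)  # 1 2 3
identical(keys(kv), c("a", "b", "c")) && identical(values(kv), 1:3)  # TRUE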
/pkg/tests/kmeans.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | library(rmr2)
16 |
17 | ## @knitr kmeans-signature
18 | kmeans.mr =
19 | function(
20 | P,
21 | num.clusters,
22 | num.iter,
23 | combine,
24 | in.memory.combine) {
25 | ## @knitr kmeans-dist.fun
26 | dist.fun =
27 | function(C, P) {
28 | apply(
29 | C,
30 | 1,
31 | function(x)
32 | colSums((t(P) - x)^2))}
33 | ## @knitr kmeans.map
34 | kmeans.map =
35 | function(., P) {
36 | nearest = {
37 | if(is.null(C))
38 | sample(
39 | 1:num.clusters,
40 | nrow(P),
41 | replace = TRUE)
42 | else {
43 | D = dist.fun(C, P)
44 | nearest = max.col(-D)}}
45 | if(!(combine || in.memory.combine))
46 | keyval(nearest, P)
47 | else
48 | keyval(nearest, cbind(1, P))}
49 | ## @knitr kmeans.reduce
50 | kmeans.reduce = {
51 | if (!(combine || in.memory.combine) )
52 | function(., P)
53 | t(as.matrix(apply(P, 2, mean)))
54 | else
55 | function(k, P)
56 | keyval(
57 | k,
58 | t(as.matrix(apply(P, 2, sum))))}
59 | ## @knitr kmeans-main-1
60 | C = NULL
61 | for(i in 1:num.iter ) {
62 | C =
63 | values(
64 | from.dfs(
65 | mapreduce(
66 | P,
67 | map = kmeans.map,
68 | reduce = kmeans.reduce)))
69 | if(combine || in.memory.combine)
70 | C = C[, -1]/C[, 1]
71 | ## @knitr end
72 | # points(C, col = i + 1, pch = 19)
73 | ## @knitr kmeans-main-2
74 | if(nrow(C) < num.clusters) {
75 | C =
76 | rbind(
77 | C,
78 | matrix(
79 | rnorm(
80 | (num.clusters -
81 | nrow(C)) * nrow(C)),
82 | ncol = nrow(C)) %*% C) }}
83 | C}
84 | ## @knitr end
85 |
86 | ## sample runs
87 | ##
88 |
89 | out = list()
90 |
91 | for(be in c("local", "hadoop")) {
92 | rmr.options(backend = be)
93 | set.seed(0)
94 | ## @knitr kmeans-data
95 | P =
96 | do.call(
97 | rbind,
98 | rep(
99 | list(
100 | matrix(
101 | rnorm(10, sd = 10),
102 | ncol=2)),
103 | 20)) +
104 | matrix(rnorm(200), ncol =2)
105 | ## @knitr end
106 | # x11()
107 | # plot(P)
108 | # points(P)
109 | out[[be]] =
110 | ## @knitr kmeans-run
111 | kmeans.mr(
112 | to.dfs(P),
113 | num.clusters = 12,
114 | num.iter = 5,
115 | combine = FALSE,
116 | in.memory.combine = FALSE)
117 | ## @knitr end
118 | }
119 |
120 | # would love to take this step but kmeans is randomized in a way that makes it hard to be completely reproducible
121 | # stopifnot(rmr2:::cmp(out[['hadoop']], out[['local']]))
122 |
--------------------------------------------------------------------------------
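As a rough, non-deterministic sanity check (an addition, not part of the original test), the same synthetic P can be clustered in memory with stats::kmeans and its 12 centers compared by eye with the output of kmeans.mr:

set.seed(0)
P =
  do.call(
    rbind,
    rep(list(matrix(rnorm(10, sd = 10), ncol = 2)), 20)) +
  matrix(rnorm(200), ncol = 2)
stats::kmeans(P, centers = 12, nstart = 10)$centers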
/pkg/tests/linear-least-squares.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | library(rmr2)
16 |
17 | ## @knitr LLS-data
18 | X = matrix(rnorm(2000), ncol = 10)
19 | X.index = to.dfs(cbind(1:nrow(X), X))
20 | y = as.matrix(rnorm(200))
21 | ## @knitr LLS-sum
22 | Sum =
23 | function(., YY)
24 | keyval(1, list(Reduce('+', YY)))
25 | ## @knitr LLS-XtX
26 | XtX =
27 | values(
28 | from.dfs(
29 | mapreduce(
30 | input = X.index,
31 | map =
32 | function(., Xi) {
33 | Xi = Xi[,-1]
34 | keyval(1, list(t(Xi) %*% Xi))},
35 | reduce = Sum,
36 | combine = TRUE)))[[1]]
37 | ## @knitr LLS-Xty
38 | Xty =
39 | values(
40 | from.dfs(
41 | mapreduce(
42 | input = X.index,
43 | map = function(., Xi) {
44 | yi = y[Xi[,1],]
45 | Xi = Xi[,-1]
46 | keyval(1, list(t(Xi) %*% yi))},
47 | reduce = Sum,
48 | combine = TRUE)))[[1]]
49 | ## @knitr LLS-solve
50 | solve(XtX, Xty)
51 | ## @knitr end
--------------------------------------------------------------------------------
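The job above assembles the normal equations piecewise: the maps compute per-chunk cross-products, the Sum reducer adds them into XtX and Xty, and solve() then yields the least-squares coefficients. A quick in-memory check of the same identity (an addition, not part of the original test):

set.seed(1)
X = matrix(rnorm(2000), ncol = 10)
y = rnorm(200)
beta.normal.eq = solve(t(X) %*% X, t(X) %*% y)
beta.lm = coef(lm(y ~ X + 0))
all.equal(as.vector(beta.normal.eq), as.vector(beta.lm))  # TRUE up to numerical tolerance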
/pkg/tests/logistic-regression.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | ## see spark implementation http://www.spark-project.org/examples.html
17 | ## see nice derivation here http://people.csail.mit.edu/jrennie/writing/lr.pdf
18 |
19 | library(rmr2)
20 |
21 | ## @knitr logistic.regression-signature
22 | logistic.regression =
23 | function(input, iterations, dims, alpha){
24 |
25 | ## @knitr logistic.regression-map
26 | lr.map =
27 | function(., M) {
28 | Y = M[,1]
29 | X = M[,-1]
30 | keyval(
31 | 1,
32 | Y * X *
33 | g(-Y * as.numeric(X %*% t(plane))))}
34 | ## @knitr logistic.regression-reduce
35 | lr.reduce =
36 | function(k, Z)
37 | keyval(k, t(as.matrix(apply(Z,2,sum))))
38 | ## @knitr logistic.regression-main
39 | plane = t(rep(0, dims))
40 | g = function(z) 1/(1 + exp(-z))
41 | for (i in 1:iterations) {
42 | gradient =
43 | values(
44 | from.dfs(
45 | mapreduce(
46 | input,
47 | map = lr.map,
48 | reduce = lr.reduce,
49 | combine = TRUE)))
50 | plane = plane + alpha * gradient }
51 | plane }
52 | ## @knitr end
53 |
54 | out = list()
55 | test.size = 10^5
56 | for (be in c("local", "hadoop")) {
57 | rmr.options(backend = be)
58 | ## create test set
59 | set.seed(0)
60 | ## @knitr logistic.regression-data
61 | eps = rnorm(test.size)
62 | testdata =
63 | to.dfs(
64 | as.matrix(
65 | data.frame(
66 | y = 2 * (eps > 0) - 1,
67 | x1 = 1:test.size,
68 | x2 = 1:test.size + eps)))
69 | ## @knitr end
70 | out[[be]] =
71 | ## @knitr logistic.regression-run
72 | logistic.regression(
73 | testdata, 3, 2, 0.05)
74 | ## @knitr end
75 | ## the max likelihood solution diverges to (-inf, inf) for a separable dataset such as the above
76 | }
77 | stopifnot(
78 | isTRUE(all.equal(out[['local']], out[['hadoop']], tolerance = 1E-7)))
79 |
--------------------------------------------------------------------------------
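For comparison, a compact in-memory version of the same batch gradient ascent (a sketch added here, not part of the original test): each pass computes the summed per-row gradient that lr.map and lr.reduce produce in parallel, then takes a step of size alpha.

g = function(z) 1/(1 + exp(-z))
logistic.regression.local =
  function(M, iterations, dims, alpha) {
    plane = t(rep(0, dims))
    Y = M[, 1]
    X = M[, -1]
    for (i in 1:iterations) {
      # summed row-wise gradient, as produced by lr.map followed by lr.reduce
      gradient = colSums(Y * X * as.vector(g(-Y * (X %*% t(plane)))))
      plane = plane + alpha * gradient}
    plane}

eps = rnorm(100)
M = as.matrix(data.frame(y = 2 * (eps > 0) - 1, x1 = 1:100, x2 = 1:100 + eps))
logistic.regression.local(M, 3, 2, 0.05)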
/pkg/tests/mapreduce.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | library(quickcheck)
16 | library(rmr2)
17 | library(rhdfs)
18 | hdfs.init()
19 |
20 | kv.cmp = rmr2:::kv.cmp
21 |
22 |
23 | for (be in c("local", "hadoop")) {
24 | rmr.options(backend = be)
25 |
26 | ##from.dfs to.dfs
27 |
28 | ##native
29 | test(
30 | function(kv = rmr2:::rkeyval())
31 | kv.cmp(
32 | kv,
33 | from.dfs(to.dfs(kv))))
34 |
35 | ## csv
36 | ## no support for raw in csv
37 |
38 | test(
39 | function(df = rmr2:::rdata.frame.simple())
40 | kv.cmp(
41 | keyval(NULL, df),
42 | from.dfs(
43 | to.dfs(
44 | keyval(NULL, df),
45 | format = "csv"),
46 | format = "csv")))
47 |
48 | #json
49 | fmt = "json"
50 | test(
51 | function(df = rmr2:::rdata.frame.simple())
52 | kv.cmp(
53 | keyval(1, df),
54 | from.dfs(
55 | to.dfs(
56 | keyval(1, df),
57 | format = fmt),
58 | format = make.input.format("json", key.class = "list", value.class = "data.frame"))))
59 |
60 | #sequence.typedbytes
61 | seq.tb.data.loss =
62 | function(l)
63 | rapply(
64 | l,
65 | function(x){
66 | if(class(x) == "Date") x = unclass(x)
67 | if(is.factor(x)) x = as.character(x)
68 | if(class(x) == "raw" || length(x) == 1) x else as.list(x)},
69 | how = "replace")
70 |
71 | fmt = "sequence.typedbytes"
72 | test(
73 | function(l = rlist()) {
74 | l = c(0, l)
75 | kv = keyval(seq.tb.data.loss(list(1)), seq.tb.data.loss(l))
76 | kv.cmp(
77 | kv,
78 | from.dfs(
79 | to.dfs(
80 | kv,
81 | format = fmt),
82 | format = fmt))})
83 |
84 | ##mapreduce
85 |
86 | ##simplest mapreduce, all default
87 | test(
88 | function(kv = rmr2:::rkeyval()) {
89 | if(rmr2:::length.keyval(kv) == 0) TRUE
90 | else {
91 | kv1 = from.dfs(mapreduce(input = to.dfs(kv)))
92 | kv.cmp(kv, kv1)}})
93 |
94 | ##put in a reduce for good measure
95 | test(
96 | function(kv = rmr2:::rkeyval()) {
97 | if(rmr2:::length.keyval(kv) == 0) TRUE
98 | else {
99 | kv1 =
100 | from.dfs(
101 | mapreduce(
102 | input = to.dfs(kv),
103 | reduce = to.reduce(identity)))
104 | kv.cmp(kv, kv1)}})
105 |
106 | ## csv
107 | test(
108 | function(df = rmr2:::rdata.frame.simple())
109 | kv.cmp(
110 | keyval(NULL, df),
111 | from.dfs(
112 | mapreduce(
113 | to.dfs(
114 | keyval(NULL, df),
115 | format = "csv"),
116 | input.format = "csv",
117 | output.format = "csv"),
118 | format = "csv")))
119 |
120 | #json
121 | # a more general test would be better for json but the subtleties of mapping R to JSON are many
122 | fmt = "json"
123 | test(
124 | function(df = rmr2:::rdata.frame.simple())
125 | kv.cmp(
126 | keyval(1, df),
127 | from.dfs(
128 | mapreduce(
129 | to.dfs(
130 | keyval(1, df),
131 | format = fmt),
132 | input.format = make.input.format("json", key.class = "list", value.class = "data.frame"),
133 | output.format = fmt),
134 | format = make.input.format("json", key.class = "list", value.class = "data.frame"))))
135 |
136 | #sequence.typedbytes
137 | fmt = "sequence.typedbytes"
138 | test(
139 | function(l = rlist()) {
140 | l = c(0, l)
141 | kv = keyval(seq.tb.data.loss(list(1)), seq.tb.data.loss(l))
142 | l = c(0, l)
143 | kv.cmp(
144 | kv,
145 | from.dfs(
146 | mapreduce(
147 | to.dfs(
148 | kv,
149 | format = fmt),
150 | input.format = fmt,
151 | output.format = fmt),
152 | format = fmt))})
153 |
154 | #avro
155 | pathname = ravro::AVRO_TOOLS
156 | if(.Platform$OS.type == "windows") {
157 | subfname = strsplit(pathname, ":")
158 | if(length(subfname[[1]]) > 1)
159 | {
160 | pathname = subfname[[1]][2]
161 | }
162 | pathname = gsub("\"","",pathname)
163 | pathname = shortPathName(pathname)
164 | pathname = gsub("\\\\","/",pathname)}
165 | Sys.setenv(AVRO_LIBS = pathname)
166 |
167 | test(
168 | function(df = rmr2:::rdata.frame.simple(nrow = c(min = 2))) {
169 | if(rmr.options("backend") == "local") TRUE
170 | else {
171 | names(df) = sub("\\.", "_", names(df))
172 | tf1 = tempfile()
173 | ravro:::write.avro(df, tf1)
174 | tf2 = "/tmp/rmr2.test.avro"
175 | on.exit(hdfs.rm(tf2))
176 | hdfs.put(tf1, tf2)
177 | kv.cmp(
178 | keyval(NULL, df),
179 | from.dfs(
180 | mapreduce(
181 | tf2,
182 | input.format =
183 | make.input.format(
184 | format = "avro",
185 | schema.file = tf1))))}})
186 |
187 | #equijoin
188 | stopifnot(
189 | all(
190 | apply(
191 | values(
192 | from.dfs(
193 | equijoin(
194 | left.input = to.dfs(keyval(1:10, (1:10)^2)),
195 | right.input = to.dfs(keyval(1:10, (1:10)^3))))),
196 | 1,
197 | function(x) x[[1]]^(3/2) == x[[2]])))
198 | }
--------------------------------------------------------------------------------
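The equijoin check closing mapreduce.R relies on the joined values lining up per key; a small illustration of what that join returns (an added sketch, run here on the local backend):

library(rmr2)
rmr.options(backend = "local")
jn =
  from.dfs(
    equijoin(
      left.input = to.dfs(keyval(1:10, (1:10)^2)),
      right.input = to.dfs(keyval(1:10, (1:10)^3))))
head(values(jn))  # two columns per joined record: the matching square and cube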
/pkg/tests/naive-bayes.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # this is just an example, not part of a math library
16 | # matrix A_{ij} representation is a list of keyval(c(i,j), A_{ij})
17 | # vectors are column matrices
18 |
19 | ## input format is keyval(NULL, list(x = c(x1, ..., xn), y = y))
20 |
21 | ##library(rmr2)
22 |
23 | ##naive.bayes = function(input, output = NULL) {
24 | ## mapreduce(input = input, output = output,
25 | ## map = function(k, v) c(lapply(1:length(v$x) function(i) keyval(c(i, v$x[i], v$y),1)),
26 | ## lapply),
27 | ## reduce = function(k, vv) keyval(k, sum(unlist(vv))),
28 | ## combiner = TRUE)
29 | ##}
--------------------------------------------------------------------------------
/pkg/tests/wordcount.R:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | ## classic wordcount
17 | ## input can be any text file
18 | ## inspect output with from.dfs(output) -- this will produce an R list, so watch out with big datasets
19 |
20 | library(rmr2)
21 |
22 | ## @knitr wordcount-signature
23 | wordcount =
24 | function(
25 | input,
26 | output = NULL,
27 | pattern = " "){
28 | ## @knitr wordcount-map
29 | wc.map =
30 | function(., lines) {
31 | keyval(
32 | unlist(
33 | strsplit(
34 | x = lines,
35 | split = pattern)),
36 | 1)}
37 | ## @knitr wordcount-reduce
38 | wc.reduce =
39 | function(word, counts ) {
40 | keyval(word, sum(counts))}
41 | ## @knitr wordcount-mapreduce
42 | mapreduce(
43 | input = input,
44 | output = output,
45 | map = wc.map,
46 | reduce = wc.reduce,
47 | combine = TRUE)}
48 | ## @knitr end
49 |
50 | text = capture.output(license())
51 | out = list()
52 | for(be in c("local", "hadoop")) {
53 | rmr.options(backend = be)
54 | out[[be]] = from.dfs(wordcount(to.dfs(keyval(NULL, text)), pattern = " +"))}
55 | stopifnot(rmr2:::kv.cmp(out$hadoop, out$local))
56 |
--------------------------------------------------------------------------------
/pkg/tools/whirr/README:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Newly-released Whirr 0.7.1 fixes problems with the Java installation which
16 | # were caused by changes in licensing of Java by Oracle
17 |
18 | # To set up a hadoop/rmr cluster first launch the cluster
19 |
20 | $WHIRR_HOME/bin/whirr launch-cluster --config hadoop-ec2.properties
21 | # this config is slightly tweaked from the whirr distro; it starts 5 large nodes
22 |
23 | # then install R and rmr
24 | $WHIRR_HOME/bin/whirr run-script --script rmr.sh --config hadoop-ec2.properties
25 |
26 | # remember to destroy when done. You are responsible for any AWS charges
27 |
28 | $WHIRR_HOME/bin/whirr destroy-cluster --config hadoop-ec2.properties
29 |
30 | # 'hadoop-ec2-centos.properties' and 'rmr-master-centos.sh' can be used with
31 | # the above steps to create a CentOS 4.6-based cluster using a RightScale AMI
32 |
--------------------------------------------------------------------------------
/pkg/tools/whirr/hadoop-ec2-centos.properties:
--------------------------------------------------------------------------------
1 | # hadoop-ec2-centos.properties by Jeffrey Breen, based on hadoop-ec2.properties
2 | #
3 | # Licensed to the Apache Software Foundation (ASF) under one or more
4 | # contributor license agreements. See the NOTICE file distributed with
5 | # this work for additional information regarding copyright ownership.
6 | # The ASF licenses this file to You under the Apache License, Version 2.0
7 | # (the "License"); you may not use this file except in compliance with
8 | # the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | #
18 |
19 | #
20 | # Hadoop Cluster on AWS EC2
21 | #
22 | # With changes to make installing RHadoop/rmr easier
23 |
24 | # Read the Configuration Guide for more info:
25 | # http://incubator.apache.org/whirr/configuration-guide.html
26 |
27 | # Change the cluster name here
28 | whirr.cluster-name=hadoop-ec2-centos-${env:USER}
29 |
30 | # Change the number of machines in the cluster here
31 | whirr.instance-templates=1 hadoop-namenode+hadoop-jobtracker,5 hadoop-datanode+hadoop-tasktracker
32 |
33 | # Uncomment these lines to run CDH
34 | # You need cdh3 because of the streaming combiner backport
35 | whirr.hadoop.install-function=install_cdh_hadoop
36 | whirr.hadoop.configure-function=configure_cdh_hadoop
37 | # just-released Whirr 0.7.1 fixes java:
38 | whirr.java.install-function=install_oab_java
39 |
40 | # For EC2 set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables.
41 | whirr.provider=aws-ec2
42 | whirr.identity=${env:AWS_ACCESS_KEY_ID}
43 | whirr.credential=${env:AWS_SECRET_ACCESS_KEY}
44 |
45 | # The size of the instance to use. See http://aws.amazon.com/ec2/instance-types/
46 | whirr.hardware-id=m1.large
47 | # whirr.hardware-id=c1.xlarge
48 |
49 | # select recent, 64-bit CentOS 4.6 AMI from RightScale
50 | whirr.image-id=us-east-1/ami-49e32320
51 |
52 | # If you choose a different location, make sure whirr.image-id is updated too
53 | whirr.location-id=us-east-1
54 |
55 | # You can also specify the spot instance price
56 | # http://aws.amazon.com/ec2/spot-instances/
57 | # whirr.aws-ec2-spot-price=0.15
58 |
59 | # By default use the user system SSH keys. Override them here.
60 | # whirr.private-key-file=${sys:user.home}/.ssh/id_rsa
61 | # whirr.public-key-file=${whirr.private-key-file}.pub
62 |
63 | # Expert: override Hadoop properties by setting properties with the prefix
64 | # hadoop-common, hadoop-hdfs, hadoop-mapreduce to set Common, HDFS, MapReduce
65 | # site properties, respectively. The prefix is removed by Whirr, so that for
66 | # example, setting
67 | # hadoop-common.fs.trash.interval=1440
68 | # will result in fs.trash.interval being set to 1440 in core-site.xml.
69 |
70 | # Expert: specify the version of Hadoop to install.
71 | #whirr.hadoop.version=0.20.2
72 | #whirr.hadoop.tarball.url=http://archive.apache.org/dist/hadoop/core/hadoop-${whirr.hadoop.version}/hadoop-${whirr.hadoop.version}.tar.gz
73 |
--------------------------------------------------------------------------------
/pkg/tools/whirr/hadoop-ec2.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 |
18 | #
19 | # Hadoop Cluster on AWS EC2
20 | #
21 | # With changes to make installing RHadoop/rmr easier
22 |
23 | # Read the Configuration Guide for more info:
24 | # http://incubator.apache.org/whirr/configuration-guide.html
25 |
26 | # Change the cluster name here
27 | whirr.cluster-name=hadoop-ec2-${env:USER}
28 |
29 | # Change the number of machines in the cluster here
30 | whirr.instance-templates=1 hadoop-namenode+hadoop-jobtracker,5 hadoop-datanode+hadoop-tasktracker
31 |
32 |
33 | # Uncomment these lines to run CDH
34 | # You need cdh3 because of several backported patches from 0.21
35 | # Apache Hadoop 1.0.2 is also expected to work
36 | whirr.env.repo=cdh4
37 | whirr.hadoop.install-function=install_cdh_hadoop
38 | whirr.hadoop.configure-function=configure_cdh_hadoop
39 |
40 | # just-released Whirr 0.7.1 fixes java:
41 | # whirr.java.install-function=install_oab_java
42 |
43 | # For EC2 set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables.
44 | whirr.provider=aws-ec2
45 | whirr.identity=${env:AWS_ACCESS_KEY_ID}
46 | whirr.credential=${env:AWS_SECRET_ACCESS_KEY}
47 |
48 | # The size of the instance to use. See http://aws.amazon.com/ec2/instance-types/
49 | whirr.hardware-id=m1.xlarge
50 | # Ubuntu 12.04 LTS Precise. See http://alestic.com/
51 | whirr.image-id=us-east-1/ami-eafa5883
52 |
53 | # If you choose a different location, make sure whirr.image-id is updated too
54 | whirr.location-id=us-east-1
55 |
56 | # You can also specify the spot instance price
57 | # http://aws.amazon.com/ec2/spot-instances/
58 | # whirr.aws-ec2-spot-price=0.15
59 |
60 | # By default use the user system SSH keys. Override them here.
61 | # whirr.private-key-file=${sys:user.home}/.ssh/id_rsa
62 | # whirr.public-key-file=${whirr.private-key-file}.pub
63 |
64 | # Expert: override Hadoop properties by setting properties with the prefix
65 | # hadoop-common, hadoop-hdfs, hadoop-mapreduce to set Common, HDFS, MapReduce
66 | # site properties, respectively. The prefix is removed by Whirr, so that for
67 | # example, setting
68 | # hadoop-common.fs.trash.interval=1440
69 | # will result in fs.trash.interval being set to 1440 in core-site.xml.
70 | hadoop-common.io.compression.codec.lzo.class=com.hadoop.compression.lzo.LzoCodec
71 |
72 | # Expert: specify the version of Hadoop to install.
73 | #whirr.hadoop.version=0.20.2
74 | #whirr.hadoop.tarball.url=http://archive.apache.org/dist/hadoop/core/hadoop-${whirr.hadoop.version}/hadoop-${whirr.hadoop.version}.tar.gz
75 |
76 | hadoop-env.JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk-amd64
77 |
78 |
--------------------------------------------------------------------------------
/pkg/tools/whirr/lzo-centos.sh:
--------------------------------------------------------------------------------
1 | #get a fresher ant than yum would
2 | wget --no-check-certificate http://apache.cs.utah.edu//ant/binaries/apache-ant-1.8.4-bin.tar.gz
3 | tar xvzf apache-ant-1.8.4-bin.tar.gz
4 | export ANT_HOME=/home/users/antonio/apache-ant-1.8.4
5 | sudo yum install -y xml-commons-apis
6 | sudo yum install -y gcc
7 | sudo yum install -y lzo-devel
8 | sudo yum install -y make
9 | wget --no-check-certificate https://github.com/kambatla/hadoop-lzo/tarball/master
10 | tar xvzf master
11 | cd kambatla*
12 | export CFLAGS=-m64
13 | export CXXFLAGS=-m64
14 | ant package
15 | sudo cp build/hadoop-lzo-*.jar /usr/lib/hadoop/lib/
16 | sudo mkdir -p /usr/lib/hadoop/lib/native/
17 | sudo cp build/native/Linux-*-*/lib/libgplcompression.* /usr/lib/hadoop/lib/native/
18 |
19 | sudo /etc/init.d//hadoop-0.20-tasktracker restart
20 |
--------------------------------------------------------------------------------
/pkg/tools/whirr/lzo-ubuntu.sh:
--------------------------------------------------------------------------------
1 | wget https://github.com/kambatla/hadoop-lzo/tarball/master
2 | tar xvzf master
3 | cd kambatla*
4 | export CFLAGS=-m64
5 | export CXXFLAGS=-m64
6 | sudo apt-get install -y ant
7 | sudo apt-get install -y gcc
8 | sudo apt-get install -y liblzo2-dev
9 | sudo apt-get install -y make
10 | export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk-amd64
11 | ant package
12 | sudo cp build/hadoop-lzo-*.jar /usr/lib/hadoop/lib/
13 | sudo mkdir -p /usr/lib/hadoop/lib/native/
14 | sudo cp build/native/Linux-*-*/lib/libgplcompression.* /usr/lib/hadoop/lib/native/
15 |
16 | sudo /etc/init.d//hadoop-*-tasktracker restart
17 |
--------------------------------------------------------------------------------
/pkg/tools/whirr/rmr-dev.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2011 Revolution Analytics
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | branch=dev
16 | sudo apt-get install -y r-base-core
17 | sudo apt-get install -y r-cran-rcpp
18 | sudo R --no-save << EOF
19 | install.packages(c('RJSONIO', 'digest', 'functional', 'stringr', 'plyr'), repos = "http://cran.us.r-project.org", INSTALL_opts=c('--byte-compile') )
20 | EOF
21 |
22 | rm -rf $branch RHadoop
23 | curl -L https://github.com/RevolutionAnalytics/RHadoop/tarball/$branch | tar zx
24 | mv RevolutionAnalytics-RHadoop* RHadoop
25 | sudo R CMD INSTALL --byte-compile RHadoop/rmr2/pkg/
26 |
27 | sudo su << EOF1
28 | echo '
29 | export HADOOP_CMD=/usr/bin/hadoop
30 | export HADOOP_STREAMING=/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar
31 | ' >> /etc/profile
32 |
33 | EOF1
34 |
--------------------------------------------------------------------------------
/pkg/tools/whirr/rmr-master-centos.sh:
--------------------------------------------------------------------------------
1 | # rmr-master-centos.sh by Jeffrey Breen, based on rmr-master.sh
2 | # original copyright attached:
3 | #
4 | # Copyright 2011 Revolution Analytics
5 | #
6 | # Licensed under the Apache License, Version 2.0 (the "License");
7 | # you may not use this file except in compliance with the License.
8 | # You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 | sudo yum -y --enablerepo=epel install R R-devel
19 |
20 | sudo R --no-save << EOF
21 | install.packages(c('Rcpp', 'RJSONIO', 'itertools', 'digest'), repos="http://cran.revolutionanalytics.com", INSTALL_opts=c('--byte-compile') )
22 | EOF
23 |
24 | # install the rmr package from RHadoop:
25 |
26 | branch=master
27 |
28 | wget --no-check-certificate https://github.com/RevolutionAnalytics/RHadoop/tarball/$branch -O - | tar zx
29 | mv RevolutionAnalytics-RHadoop* RHadoop
30 | sudo R CMD INSTALL --byte-compile RHadoop/rmr/pkg/
31 |
32 | sudo su << EOF1
33 | cat >> /etc/profile << EOF2
34 | # Hadoop environment for rmr, as in rmr-dev.sh
35 | export HADOOP_CMD=/usr/bin/hadoop
36 | EOF2
37 |
38 | EOF1
39 |
--------------------------------------------------------------------------------