├── .gitattributes ├── .gitignore ├── Makefile ├── README.md ├── build ├── rmr2_2.3.0.tar.gz ├── rmr2_2.3.0.zip ├── rmr2_3.0.0.tar.gz ├── rmr2_3.0.0.zip ├── rmr2_3.1.0.tar.gz ├── rmr2_3.1.0.zip ├── rmr2_3.1.1.tar.gz ├── rmr2_3.1.2.tar.gz ├── rmr2_3.1.2.zip ├── rmr2_3.2.0.tar.gz ├── rmr2_3.2.0.zip ├── rmr2_3.3.0.tar.gz └── rmr2_3.3.0.zip ├── docs ├── IO-speed-tests.Rmd ├── IO-speed-tests.md ├── Makefile ├── OCHUG-presentation │ ├── presentation.Rmd │ └── presentation.md ├── benchmark-slides │ ├── benchmark-slides.Rmd │ └── benchmark-slides.md ├── getting-data-in-and-out.Rmd ├── getting-data-in-and-out.md ├── kmeans.gif ├── new-in-this-release.Rmd ├── new-in-this-release.md ├── readme.Rmd ├── readme.md ├── resources │ ├── Mapreduce.png │ ├── R.png │ ├── hadoop-logo.gif │ ├── hadoop-logo.jpg │ ├── revo-home.png │ ├── revolution.jpeg │ ├── rhadoop.png │ └── rhadoop.svg ├── testing.Rmd ├── testing.md ├── trulia-presentation │ ├── presentation.Rmd │ ├── presentation.md │ ├── summary.Rmd │ └── summary.md ├── tutorial-slides │ ├── tutorial-slides.Rmd │ └── tutorial-slides.md ├── tutorial.Rmd └── tutorial.md ├── hadoopy_hbase ├── README ├── hadoopy_hbase │ ├── Hbase.thrift │ ├── __init__.py │ └── hbase │ │ ├── Hbase-remote │ │ ├── Hbase.py │ │ ├── __init__.py │ │ ├── constants.py │ │ └── ttypes.py ├── java │ ├── build.sh │ ├── build.xml │ ├── build_cdh4.sh │ ├── build_linux.sh │ ├── common.xml │ └── src │ │ └── java │ │ └── com │ │ └── dappervision │ │ └── hbase │ │ └── mapred │ │ ├── TypedBytesTableInputFormat.java │ │ ├── TypedBytesTableInputFormatBase.java │ │ ├── TypedBytesTableRecordReader.java │ │ └── TypedBytesTableRecordReaderSingleValue.java ├── setup.py └── tests │ ├── auth.py │ ├── flickr_count.py │ ├── flickr_count_hadoop.py │ ├── flickr_count_job.py │ ├── flickr_crawl.py │ ├── hbase_test.py │ ├── hbase_test_job.py │ ├── hbase_test_job2.py │ ├── server.py │ ├── thrift_bench.py │ └── thrift_example.py └── pkg ├── DESCRIPTION ├── NAMESPACE ├── R ├── IO.R ├── basic.R ├── extras.R ├── hdfs.R ├── keyval.R ├── local.R ├── mapreduce.R ├── parse-url.R ├── quickcheck-rmr.R └── streaming.R ├── examples ├── airline.R ├── avro.R ├── cluster.mr.R ├── collocations.R ├── counts.R ├── hbase.R ├── large-kmeans-test.R ├── mclust.mr.R └── ngram.R ├── man ├── bigdataobject.Rd ├── dfs.empty.Rd ├── equijoin.Rd ├── fromdfstodfs.Rd ├── hadoop-setting.Rd ├── keyval.Rd ├── make.io.format.Rd ├── mapreduce.Rd ├── rmr-package.Rd ├── rmr.options.Rd ├── rmr.sample.Rd ├── rmr.str.Rd ├── scatter.Rd ├── status.Rd ├── tomaptoreduce.Rd └── vsum.Rd ├── src ├── Makevars ├── Makevars.win ├── catwin │ ├── Makefile │ └── catwin.c ├── extras.cpp ├── extras.h ├── hbase-io ├── hbase-to-df.cpp ├── hbase-to-df.h ├── keyval.cpp ├── keyval.h ├── t-list.cpp ├── t-list.h ├── typed-bytes.cpp └── typed-bytes.h ├── tests ├── IO.R ├── avro.R ├── basic-examples.R ├── basic.R ├── benchmarks.R ├── getting-data-in-and-out.R ├── keyval.R ├── kmeans.R ├── linear-least-squares.R ├── logistic-regression.R ├── mapreduce.R ├── naive-bayes.R └── wordcount.R └── tools └── whirr ├── README ├── hadoop-ec2-centos.properties ├── hadoop-ec2.properties ├── lzo-centos.sh ├── lzo-ubuntu.sh ├── rmr-dev.sh ├── rmr-master-centos.sh └── rmr-master.sh /.gitattributes: -------------------------------------------------------------------------------- 1 | # Set default behaviour, in case users don't have core.autocrlf set. 
2 | * text=auto
3 | 
4 | *.cpp text eol=lf
5 | *.h text eol=lf
6 | *.html text eol=lf
7 | *.java text eol=lf
8 | *.jpeg binary
9 | *.md text eol=lf
10 | *.o binary
11 | *.png binary
12 | *.properties text eol=lf
13 | *.R text eol=lf
14 | *.Rd text eol=lf
15 | *.Rmd text eol=lf
16 | *.sh text eol=lf
17 | *.so binary
18 | *.xml text eol=lf
19 | .gitignore text eol=lf
20 | Makevars* text eol=lf
21 | 
-------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | #for Mac users
2 | .DS_Store
3 | #for R users
4 | .RData
5 | .Rhistory
6 | *.Rcheck
7 | Rprof.out
8 | #for RStudio users
9 | .Rproj.user
10 | *.Rproj
11 | #for rmr users
12 | rmr-*-env*
13 | rmr-streaming-*
14 | #for emacs users
15 | *~
16 | #for whirr users
17 | whirr.log*
18 | #for emerge users
19 | *.orig
20 | rhbase/pkg/config.log
21 | #Compilation artifacts
22 | src-i386
23 | src-x86_64
24 | *.o
25 | *.so
26 | *.rds
27 | Rprof.out
28 | *.aup
29 | *.au
30 | *.swp
31 | *.gz
32 | !build/*.gz
33 | out/*
34 | *.Rout
35 | file*
36 | quickcheck*
37 | *.html
38 | 
-------------------------------------------------------------------------------- /Makefile: --------------------------------------------------------------------------------
1 | TESTS := $(wildcard pkg/tests/*.R)
2 | OUTPUT := $(addprefix out/,$(notdir $(TESTS:.R=.out)))
3 | 
4 | check: $(OUTPUT)
5 | 
6 | out/%.out:pkg/tests/%.R
7 | 	R CMD BATCH --vanilla --slave $< $@
8 | 
9 | clean:
10 | 	rm -f $(OUTPUT)
11 | 	rm -rf rmr-* job_local*
12 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | rmr2
2 | ====
3 | 
4 | A package that allows R developers to use Hadoop MapReduce, developed as part of the RHadoop project. Please see the [RHadoop wiki](https://github.com/RevolutionAnalytics/RHadoop/wiki) for information.
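To give a flavor of the programming model, here is a minimal sketch (not a substitute for the tutorial under `docs`; it assumes a working `rmr2` installation and uses the in-process local backend, so no cluster is needed):

```r
library(rmr2)
rmr.options(backend = "local")   # run map and reduce in-process; use the Hadoop backend on a cluster

# write a small key-value dataset to the (local) dfs
small.input = to.dfs(keyval(1:1000, rnorm(1000)))

# a map-only job that squares every value, then read the result back
result =
  from.dfs(
    mapreduce(
      input = small.input,
      map = function(k, v) keyval(k, v^2)))

head(values(result))
```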
5 | -------------------------------------------------------------------------------- /build/rmr2_2.3.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_2.3.0.tar.gz -------------------------------------------------------------------------------- /build/rmr2_2.3.0.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_2.3.0.zip -------------------------------------------------------------------------------- /build/rmr2_3.0.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.0.0.tar.gz -------------------------------------------------------------------------------- /build/rmr2_3.0.0.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.0.0.zip -------------------------------------------------------------------------------- /build/rmr2_3.1.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.1.0.tar.gz -------------------------------------------------------------------------------- /build/rmr2_3.1.0.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.1.0.zip -------------------------------------------------------------------------------- /build/rmr2_3.1.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.1.1.tar.gz -------------------------------------------------------------------------------- /build/rmr2_3.1.2.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.1.2.tar.gz -------------------------------------------------------------------------------- /build/rmr2_3.1.2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.1.2.zip -------------------------------------------------------------------------------- /build/rmr2_3.2.0.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.2.0.tar.gz -------------------------------------------------------------------------------- /build/rmr2_3.2.0.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.2.0.zip -------------------------------------------------------------------------------- /build/rmr2_3.3.0.tar.gz: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.3.0.tar.gz -------------------------------------------------------------------------------- /build/rmr2_3.3.0.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/build/rmr2_3.3.0.zip -------------------------------------------------------------------------------- /docs/IO-speed-tests.Rmd: -------------------------------------------------------------------------------- 1 | Knit document for some timing results: 2 | 3 | ```{r, echo=TRUE} 4 | zz = rmr2:::interleave(1:10^6, 1:10^6) 5 | con = file("/tmp/n-test", "wb") 6 | system.time({rmr2:::typedbytes.writer(zz, con, TRUE)}) 7 | close(con) 8 | con = file("/tmp/tb-test", "wb") 9 | system.time({rmr2:::typedbytes.writer(zz, con, FALSE)}) 10 | close(con) 11 | system.time({save(zz, file= "/tmp/save-test")}) 12 | system.time({rmr2:::make.typedbytes.input.format()(file("/tmp/n-test", "rb"), 10^6)}) 13 | system.time({rmr2:::make.typedbytes.input.format()(file("/tmp/tb-test", "rb"), 10^6)}) 14 | system.time({load(file="/tmp/save-test")}) 15 | ``` -------------------------------------------------------------------------------- /docs/IO-speed-tests.md: -------------------------------------------------------------------------------- 1 | Knit document for some timing results: 2 | 3 | 4 | ```r 5 | zz = rmr2:::interleave(1:10^6, 1:10^6) 6 | con = file("/tmp/n-test", "wb") 7 | system.time({ 8 | rmr2:::typedbytes.writer(zz, con, TRUE) 9 | }) 10 | ``` 11 | 12 | ``` 13 | ## user system elapsed 14 | ## 0.582 0.033 0.615 15 | ``` 16 | 17 | ```r 18 | close(con) 19 | con = file("/tmp/tb-test", "wb") 20 | system.time({ 21 | rmr2:::typedbytes.writer(zz, con, FALSE) 22 | }) 23 | ``` 24 | 25 | ``` 26 | ## user system elapsed 27 | ## 0.295 0.023 0.317 28 | ``` 29 | 30 | ```r 31 | close(con) 32 | system.time({ 33 | save(zz, file = "/tmp/save-test") 34 | }) 35 | ``` 36 | 37 | ``` 38 | ## user system elapsed 39 | ## 2.365 0.022 2.390 40 | ``` 41 | 42 | ```r 43 | system.time({ 44 | rmr2:::make.typedbytes.input.format()(file("/tmp/n-test", "rb"), 10^6) 45 | }) 46 | ``` 47 | 48 | ``` 49 | ## user system elapsed 50 | ## 9.229 0.374 9.603 51 | ``` 52 | 53 | ```r 54 | system.time({ 55 | rmr2:::make.typedbytes.input.format()(file("/tmp/tb-test", "rb"), 10^6) 56 | }) 57 | ``` 58 | 59 | ``` 60 | ## Warning: closing unused connection 4 (/tmp/n-test) 61 | ``` 62 | 63 | ``` 64 | ## user system elapsed 65 | ## 7.387 0.328 7.716 66 | ``` 67 | 68 | ```r 69 | system.time({ 70 | load(file = "/tmp/save-test") 71 | }) 72 | ``` 73 | 74 | ``` 75 | ## Warning: closing unused connection 4 (/tmp/tb-test) 76 | ``` 77 | 78 | ``` 79 | ## user system elapsed 80 | ## 0.652 0.001 0.653 81 | ``` 82 | 83 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | htmls = $(wildcard *.html) 2 | pdfs = $(htmls:.html=.pdf) 3 | pdf: $(pdfs) 4 | 5 | %.pdf: %.html 6 | /Applications/wkhtmltopdf.app/Contents/MacOS/wkhtmltopdf $< $@ 7 | -------------------------------------------------------------------------------- /docs/benchmark-slides/benchmark-slides.Rmd: -------------------------------------------------------------------------------- 1 | `r 
read_chunk('../../pkg/tests/benchmarks.R')` 2 | `r opts_chunk$set(echo=TRUE, eval=FALSE, cache=FALSE, tidy=FALSE)` 3 | 4 | ## 5 | 6 | 14 | 15 | ## 16 | ```{r pass-through} 17 | ``` 18 | ## 19 | 27 | ## 28 | 36 | ## 37 | 45 | ## 46 | ```{r group-aggregate-input} 47 | ``` 48 | ## 49 | ```{r group-aggregate-functions} 50 | ``` 51 | ## 52 | ```{r group-aggregate} 53 | ``` 54 | -------------------------------------------------------------------------------- /docs/benchmark-slides/benchmark-slides.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ## 5 | 6 | 20 | 21 | ## 22 | 23 | ```r 24 | mapreduce( 25 | input, 26 | map = function(k,v) keyval(k,v)) 27 | ``` 28 | 29 | ## 30 | 50 | ## 51 | 73 | ## 74 | 96 | ## 97 | 98 | ```r 99 | input.ga = 100 | to.dfs( 101 | keyval( 102 | 1:input.size, 103 | rnorm(input.size))) 104 | ``` 105 | 106 | ## 107 | 108 | ```r 109 | group = function(k,v) k%%100 110 | aggregate = function(x) sum(x) 111 | ``` 112 | 113 | ## 114 | 115 | ```r 116 | mapreduce( 117 | input.ga, 118 | map = 119 | function(k,v) 120 | keyval(group(k,v), v), 121 | reduce = 122 | function(k, vv) 123 | keyval(k, aggregate(vv)), 124 | combine = TRUE) 125 | ``` 126 | 127 | -------------------------------------------------------------------------------- /docs/kmeans.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/kmeans.gif -------------------------------------------------------------------------------- /docs/new-in-this-release.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "What's new in `rmr2` x.y.z" 3 | output: 4 | html_document: 5 | keep_md: true 6 | --- 7 | 8 | We switched to using the github [release page](https://github.com/RevolutionAnalytics/rmr2/releases). Please head there and update your links or bookmarks. -------------------------------------------------------------------------------- /docs/new-in-this-release.md: -------------------------------------------------------------------------------- 1 | # What's new in `rmr2` x.y.z 2 | 3 | We switched to using the github [release page](https://github.com/RevolutionAnalytics/rmr2/releases). Please head there and update your links or bookmarks. 4 | -------------------------------------------------------------------------------- /docs/readme.Rmd: -------------------------------------------------------------------------------- 1 | Each document is present in three formats. Markdown, extension **.md, is the one you want to click on**. R Markdown, extension Rmd, is the original format, see the package `knitr` for details, and is the only one that should be edited. html is not used at this time. 2 | -------------------------------------------------------------------------------- /docs/readme.md: -------------------------------------------------------------------------------- 1 | Each document is present in three formats. Markdown, extension **.md, is the one you want to click on**. R Markdown, extension Rmd, is the original format, see the package `knitr` for details, and is the only one that should be edited. html is not used at this time. 
2 | 
-------------------------------------------------------------------------------- /docs/resources/Mapreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/Mapreduce.png
-------------------------------------------------------------------------------- /docs/resources/R.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/R.png
-------------------------------------------------------------------------------- /docs/resources/hadoop-logo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/hadoop-logo.gif
-------------------------------------------------------------------------------- /docs/resources/hadoop-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/hadoop-logo.jpg
-------------------------------------------------------------------------------- /docs/resources/revo-home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/revo-home.png
-------------------------------------------------------------------------------- /docs/resources/revolution.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/revolution.jpeg
-------------------------------------------------------------------------------- /docs/resources/rhadoop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RevolutionAnalytics/rmr2/c9414ef91a0f0ae1f372d28a4efddb2560dc0d9b/docs/resources/rhadoop.png
-------------------------------------------------------------------------------- /docs/resources/rhadoop.svg: -------------------------------------------------------------------------------- (SVG source, image/svg+xml; the XML markup was not preserved in this dump)
-------------------------------------------------------------------------------- /docs/testing.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Testing for rmr2 3.3.0"
3 | output:
4 |   html_document:
5 |     keep_md: true
6 | ---
7 | 
8 | In the table at the bottom we collect results concerning testing of rmr on a given combination of R/OS and Hadoop releases. We collect both positive and negative results if available. If a combination is not present in this table, it doesn't imply lack of compatibility. Negative results will be recorded, but there is no guarantee that they will be fixed, although that is likely for current and common setups. In the early days `rmr` required a specific list of patches to be present in Hadoop to work. Currently, we expect it to work on any current or recent distribution by the Apache Foundation, Hortonworks, Cloudera and MapR.
9 | 
10 | Testing is conducted by running `R CMD check path-to-rmr` and requires an additional dependency, quickcheck, also downloadable from our wiki. Failures in producing documentation in legacy formats are not important and are ignored. Notes and warnings do not determine success or failure, but it may be helpful to report them in the issue tracker. Please contribute additional testing reports.
11 | 
12 | If you are interested in the testing conducted on other releases, choose one from the drop-down menu on the top left, under tags, and find this document again (under docs).
13 | 
14 | | Hadoop       | R                                    | OS         | Notes | Reporter   |
15 | |--------------|--------------------------------------|------------|-------|------------|
16 | | Hadoop 2.4.0 | R 3.1.1 (Revolution R Open 8.0 beta) | CentOS 6.4 |       | Revolution |
17 | 
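For a quick smoke test that does not need a cluster, one way (a sketch, not part of the official procedure; the path is relative to a checkout of this repository and some of these scripts also need the `quickcheck` package mentioned above) is to point `rmr2` at its local backend and source one of the bundled test scripts:

```r
library(rmr2)
rmr.options(backend = "local")     # in-process backend, no Hadoop required
source("pkg/tests/wordcount.R")    # one of the test scripts shipped with the package
```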
33 | 
-------------------------------------------------------------------------------- /docs/testing.md: --------------------------------------------------------------------------------
1 | # Testing for rmr2 3.3.0
2 | 
3 | In the table at the bottom we collect results concerning testing of rmr on a given combination of R/OS and Hadoop releases. We collect both positive and negative results if available. If a combination is not present in this table, it doesn't imply lack of compatibility. Negative results will be recorded, but there is no guarantee that they will be fixed, although that is likely for current and common setups. In the early days `rmr` required a specific list of patches to be present in Hadoop to work. Currently, we expect it to work on any current or recent distribution by the Apache Foundation, Hortonworks, Cloudera and MapR.
4 | 
5 | Testing is conducted by running `R CMD check path-to-rmr` and requires an additional dependency, quickcheck, also downloadable from our wiki. Failures in producing documentation in legacy formats are not important and are ignored. Notes and warnings do not determine success or failure, but it may be helpful to report them in the issue tracker. Please contribute additional testing reports.
6 | 
7 | If you are interested in the testing conducted on other releases, choose one from the drop-down menu on the top left, under tags, and find this document again (under docs).
8 | 
9 | | Hadoop       | R                                    | OS         | Notes | Reporter   |
10 | |--------------|--------------------------------------|------------|-------|------------|
11 | | Hadoop 2.4.0 | R 3.1.1 (Revolution R Open 8.0 beta) | CentOS 6.4 |       | Revolution |
12 | 
28 | 
-------------------------------------------------------------------------------- /docs/trulia-presentation/summary.Rmd: --------------------------------------------------------------------------------
1 | # Scalable Analytics in R with rmr
2 | 
3 | *RHadoop* is an open source project started by Revolution Analytics to provide data scientists using R access to Hadoop’s scalability without giving up their favorite language’s flexibility and convenience.
4 | 
5 | So far it has three main packages:
6 | 
7 | * rhdfs provides file-level manipulation for HDFS, the Hadoop file system
8 | * rhbase provides access to HBase, the Hadoop database
9 | * rmr allows writing mapreduce programs in R. This will be the focus of this presentation.
10 | 
11 | rmr allows R developers to program in the mapreduce framework, and to all developers it provides an alternative way to implement mapreduce programs that strikes a delicate compromise between power and usability. It lets you write general mapreduce programs, offering the full power and ecosystem of an existing, established programming language. It doesn’t force you to replace the R interpreter with a special run-time&mdash;it is just a library. You can write logistic regression in half a page and even understand it. It feels and behaves almost like the usual R iteration and aggregation primitives. It consists of a handful of functions with a modest number of arguments and sensible defaults that combine in many useful ways. But there is no way to prove that an API works: one can only show examples of what it allows you to do, and we will do that, covering a few from machine learning and statistics. Finally, we will discuss how to get involved.
-------------------------------------------------------------------------------- /docs/trulia-presentation/summary.md: --------------------------------------------------------------------------------
1 | # Scalable Analytics in R with rmr
2 | 
3 | *RHadoop* is an open source project started by Revolution Analytics to provide data scientists using R access to Hadoop’s scalability without giving up their favorite language’s flexibility and convenience.
4 | 
5 | So far it has three main packages:
6 | 
7 | * rhdfs provides file-level manipulation for HDFS, the Hadoop file system
8 | * rhbase provides access to HBase, the Hadoop database
9 | * rmr allows writing mapreduce programs in R. This will be the focus of this presentation.
10 | 
11 | rmr allows R developers to program in the mapreduce framework, and to all developers it provides an alternative way to implement mapreduce programs that strikes a delicate compromise between power and usability. It lets you write general mapreduce programs, offering the full power and ecosystem of an existing, established programming language. It doesn’t force you to replace the R interpreter with a special run-time&mdash;it is just a library. You can write logistic regression in half a page and even understand it. It feels and behaves almost like the usual R iteration and aggregation primitives. It consists of a handful of functions with a modest number of arguments and sensible defaults that combine in many useful ways. But there is no way to prove that an API works: one can only show examples of what it allows you to do, and we will do that, covering a few from machine learning and statistics. Finally, we will discuss how to get involved.
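As a concrete illustration of that handful of functions, here is a hedged sketch of the group-and-aggregate pattern described above (it mirrors the example used in the benchmark slides elsewhere in this repository; the input size and the grouping rule are arbitrary choices for illustration):

```r
library(rmr2)

input.size = 10000
input.ga = to.dfs(keyval(1:input.size, rnorm(input.size)))

out =
  mapreduce(
    input.ga,
    map     = function(k, v) keyval(k %% 100, v),   # assign each record to one of 100 groups
    reduce  = function(k, vv) keyval(k, sum(vv)),   # aggregate each group with a plain sum
    combine = TRUE)                                 # reuse the reducer as a combiner

head(values(from.dfs(out)))
```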
12 | -------------------------------------------------------------------------------- /docs/tutorial-slides/tutorial-slides.Rmd: -------------------------------------------------------------------------------- 1 | ```{r} 2 | library(knitr) 3 | read_chunk('../../pkg/tests/benchmarks.R') 4 | read_chunk('../../pkg/tests/basic-examples.R') 5 | read_chunk('../../pkg/tests/wordcount.R') 6 | read_chunk('../../pkg/tests/logistic-regression.R') 7 | read_chunk('../../pkg/tests/linear-least-squares.R') 8 | read_chunk('../../pkg/tests/kmeans.R') 9 | opts_chunk$set(echo=TRUE, eval=FALSE, cache=FALSE, tidy=FALSE) 10 | ``` 11 | 12 | 13 | ## RHadoop Tutorial 14 | ### Revolution Analytics 15 | #### Antonio Piccolboni 16 | #### rhadoop@revolutionanalytics.com 17 | #### antonio@piccolboni.info 18 | 19 | #RHadoop 20 | 21 | ## 22 | 23 | - R + Hadoop 24 | - OSS 25 | - 26 | - 27 | - rhdfs 28 | - rhbase 29 | - rmr2 30 | 31 | # Mapreduce 32 | 33 | ## 34 | 35 | 43 | 44 | ## 45 | 46 | 54 | 55 | # rmr-ABC 56 | 57 | ## 58 | 59 | 67 | 68 | ## 69 | ```{r pass-through} 70 | ``` 71 | ## 72 | 80 | ## 81 | 89 | ## 90 | 98 | ## 99 | ```{r group-aggregate-input} 100 | ``` 101 | ## 102 | ```{r group-aggregate-functions} 103 | ``` 104 | ## 105 | ```{r group-aggregate} 106 | ``` 107 | 108 | # Wordcount 109 | ## 110 | 111 | 119 | 120 | ## 121 | 122 | 130 | 131 | 132 | 133 | # Logistic Regression 134 | 135 | ## 136 | 145 | 146 | ## 147 | 156 | 157 | ## 158 | 159 | 168 | 169 | 170 | # K-means 171 | 172 | ## 173 | 174 | ```{r kmeans-dist.fun} 175 | ``` 176 | 177 | ## 178 | 179 | ```{r kmeans.map} 180 | ``` 181 | 182 | ## 183 | 184 | ```{r kmeans.reduce} 185 | ``` 186 | 187 | ## 188 | 189 | ```{r kmeans-signature} 190 | ``` 191 | 192 | ## 193 | 194 | ```{r kmeans-main-1} 195 | ``` 196 | 197 | ## 198 | 199 | ```{r kmeans-main-2} 200 | ``` 201 | 202 | ## 203 | 204 | 212 | 213 | # Linear Least Squares 214 | 215 | ## 216 | 217 | $$ \mathbf{X b = y}$$ 218 | 219 | ``` 220 | solve(t(X)%*%X, t(X)%*%y) 221 | ``` 222 | 223 | ## 224 | 225 | ```{r LLS-sum} 226 | ``` 227 | 228 | ## 229 | ```{r LLS-XtX} 230 | ``` 231 | 232 | ## 233 | ```{r LLS-Xty} 234 | ``` 235 | 236 | ## 237 | -------------------------------------------------------------------------------- /hadoopy_hbase/README: -------------------------------------------------------------------------------- 1 | Hadoopy HBase 2 | License: Apache V2 (http://www.apache.org/licenses/LICENSE-2.0) 3 | 4 | 5 | How to Run Tests 6 | sudo /usr/lib/hbase/bin/start-hbase.sh 7 | sudo /etc/init.d/hadoop-hbase-thrift start 8 | cd java 9 | sudo bash build.sh 10 | cd ../tests 11 | python thrift_example.py 12 | python hbase_test.py 13 | 14 | 15 | 16 | 17 | Acknowledgements: Lasthbase (https://github.com/tims/lasthbase) inspired this module but is incompatible with new HBase versions, necessitating starting from scratch. The build scripts and general layout were used as a starting point. 
18 | 19 | 20 | -------------------------------------------------------------------------------- /hadoopy_hbase/hadoopy_hbase/__init__.py: -------------------------------------------------------------------------------- 1 | import hadoopy 2 | from thrift.transport.TSocket import TSocket 3 | from thrift.transport.TTransport import TBufferedTransport 4 | from thrift.protocol import TBinaryProtocol 5 | from hbase import Hbase 6 | from hbase.ttypes import ColumnDescriptor, Mutation, BatchMutation, TScan 7 | import hadoopy_hbase 8 | import hashlib 9 | import base64 10 | 11 | 12 | def connect(server='localhost', port=9090): 13 | transport = TBufferedTransport(TSocket(server, int(port))) 14 | transport.open() 15 | protocol = TBinaryProtocol.TBinaryProtocol(transport) 16 | client = Hbase.Client(protocol) 17 | return client 18 | 19 | 20 | def scanner_create_id(client, table, columns=None, start_row=None, stop_row=None, filter=None, caching=None): 21 | return client.scannerOpenWithScan(table, TScan(startRow=start_row, stopRow=stop_row, columns=columns if columns else [], caching=caching, filterString=filter)) 22 | 23 | 24 | def scanner_from_id(client, table, sc, per_call=1, close=True): 25 | try: 26 | if per_call == 1: 27 | scanner = lambda : client.scannerGet(sc) 28 | else: 29 | scanner = lambda : client.scannerGetList(sc, per_call) 30 | while True: 31 | outs = scanner() 32 | if outs: 33 | for out in outs: 34 | yield (out.row, dict((x, y.value) for x, y in out.columns.items())) 35 | else: 36 | break 37 | finally: 38 | if sc is not None and close: 39 | client.scannerClose(sc) 40 | 41 | 42 | def scanner(client, table, per_call=1, close=True, **kw): 43 | sc = scanner_create_id(client, table, **kw) 44 | return scanner_from_id(client, table, sc, per_call, close) 45 | 46 | 47 | def scanner_row_column(client, table, column, **kw): 48 | scanner = hadoopy_hbase.scanner(client, table, columns=[column], **kw) 49 | for row, cols in scanner: 50 | yield row, cols[column] 51 | 52 | 53 | def scanner_column(*args, **kw): 54 | return (y for x, y in scanner_row_column(*args, **kw)) 55 | 56 | 57 | def _launch_args(hbase_in, hbase_out, columns, start_row, stop_row, single_value, kw): 58 | if hbase_in: 59 | kw['input_format'] = 'com.dappervision.hbase.mapred.TypedBytesTableInputFormat' 60 | if hbase_out: 61 | kw['output_format'] = 'com.dappervision.hbase.mapred.TypedBytesTableOutputFormat' 62 | jobconfs = hadoopy._runner._listeq_to_dict(kw.get('jobconfs', [])) 63 | jobconfs['hbase.mapred.tablecolumnsb64'] = ' '.join(map(base64.b64encode, columns)) 64 | if start_row is not None: 65 | jobconfs['hbase.mapred.startrowb64'] = base64.b64encode(start_row) 66 | if stop_row is not None: 67 | jobconfs['hbase.mapred.stoprowb64'] = base64.b64encode(stop_row) 68 | if single_value: 69 | jobconfs['hbase.mapred.valueformat'] = 'singlevalue' 70 | kw['jobconfs'] = jobconfs 71 | 72 | 73 | def launch_frozen(in_name, out_name, script_path, hbase_in=True, hbase_out=False, columns=(), start_row=None, stop_row=None, single_value=None, **kw): 74 | _launch_args(hbase_in, hbase_out, columns, start_row, stop_row, single_value, kw) 75 | hadoopy.launch_frozen(in_name, out_name, script_path, **kw) 76 | 77 | 78 | def launch(in_name, out_name, script_path, hbase_in=True, hbase_out=False, columns=(), start_row=None, stop_row=None, single_value=None, **kw): 79 | _launch_args(hbase_in, hbase_out, columns, start_row, stop_row, single_value, kw) 80 | hadoopy.launch(in_name, out_name, script_path, **kw) 81 | 82 | 83 | class HBaseColumnDict(object): 84 | 85 | 
def __init__(self, table, row, cf, db=None, **kw): 86 | if db is None: 87 | self._db = hadoopy_hbase.connect(**kw) 88 | else: 89 | self._db = db 90 | self._table = table 91 | self._row = row 92 | self._cf = cf + ':' 93 | 94 | def __setitem__(self, key, value): 95 | assert isinstance(key, str) 96 | assert isinstance(value, str) 97 | self._db.mutateRow(self._table, self._row, [hadoopy_hbase.Mutation(column=self._cf + key, value=value)]) 98 | 99 | def __getitem__(self, key): 100 | assert isinstance(key, str) 101 | result = self._db.get(self._table, self._row, self._cf + key) 102 | if not result: 103 | raise KeyError 104 | return result[0].value 105 | 106 | def __delitem__(self, key): 107 | assert isinstance(key, str) 108 | self._db.mutateRow(self._table, self._row, [hadoopy_hbase.Mutation(column=self._cf + key, isDelete=True)]) 109 | 110 | def items(self): 111 | result = self._db.getRow(self._table, self._row) 112 | if not result: 113 | return [] 114 | return [(x, y.value) for x, y in result[0].columns.items()] 115 | 116 | 117 | class HBaseRowDict(object): 118 | 119 | def __init__(self, table, col, db=None, **kw): 120 | if db is None: 121 | self._db = hadoopy_hbase.connect(**kw) 122 | else: 123 | self._db = db 124 | self._table = table 125 | self._col = col 126 | 127 | def __setitem__(self, key, value): 128 | assert isinstance(key, str) 129 | assert isinstance(value, str) 130 | self._db.mutateRow(self._table, key, [hadoopy_hbase.Mutation(column=self._col, value=value)]) 131 | 132 | def __getitem__(self, key): 133 | assert isinstance(key, str) 134 | result = self._db.get(self._table, key, self._col) 135 | if not result: 136 | raise KeyError 137 | return result[0].value 138 | 139 | def __delitem__(self, key): 140 | assert isinstance(key, str) 141 | self._db.mutateRow(self._table, key, [hadoopy_hbase.Mutation(column=self._col, isDelete=True)]) 142 | 143 | 144 | def hash_key(*args, **kw): 145 | """Convenient key engineering function 146 | 147 | Allows for raw prefix/suffix, with other arguments md5 hashed and truncated. 148 | The key is only guaranteed to be unique if its prefix+suffix is unique. If 149 | being used to create a start key, you can leave off args/suffix but they must 150 | be done in order (e.g., if you leave off an arg you must also leave off suffix). 
151 | 152 | Args: 153 | *args: List of arguments to hash in order using hash_bytes of md5 154 | prefix: Raw prefix of the string (default '') 155 | suffix: Raw suffix of the string (default '') 156 | delimiter: Raw delimiter of each field (default '') 157 | hash_bytes: Number of md5 bytes (binary not hex) for each of *args 158 | 159 | Returns: 160 | Combined key (binary) 161 | """ 162 | prefix = kw.get('prefix', '') 163 | suffix = kw.get('suffix', '') 164 | delimiter = kw.get('delimiter', '') 165 | if args: 166 | try: 167 | hash_bytes = kw['hash_bytes'] 168 | except KeyError: 169 | raise ValueError('hash_bytes keyword argument must be specified') 170 | return delimiter.join([prefix] + [hashlib.md5(x).digest()[:hash_bytes] for x in args] + [suffix]) 171 | else: 172 | return delimiter.join([prefix, suffix]) 173 | 174 | -------------------------------------------------------------------------------- /hadoopy_hbase/hadoopy_hbase/hbase/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ['ttypes', 'constants', 'Hbase'] 2 | -------------------------------------------------------------------------------- /hadoopy_hbase/hadoopy_hbase/hbase/constants.py: -------------------------------------------------------------------------------- 1 | # 2 | # Autogenerated by Thrift Compiler (0.8.0) 3 | # 4 | # DO NOT EDIT UNLESS YOU ARE SURE THAT YOU KNOW WHAT YOU ARE DOING 5 | # 6 | # options string: py 7 | # 8 | 9 | from thrift.Thrift import TType, TMessageType, TException 10 | from ttypes import * 11 | 12 | -------------------------------------------------------------------------------- /hadoopy_hbase/java/build.sh: -------------------------------------------------------------------------------- 1 | echo "Note this assumes that the paths/versions are correct, make changes as necesssary" 2 | HADOOP_PATH="/usr/lib/hadoop" 3 | HBASE_PATH="/usr/lib/hbase" 4 | 5 | echo "Copying HBASE libs to Hadoop library path (simple way so that it can find them)" 6 | sudo cp -R ${HBASE_PATH}/lib/* ${HADOOP_PATH}/lib/ 7 | sudo cp -R ${HBASE_PATH}/*.jar ${HADOOP_PATH}/lib/ 8 | 9 | echo "Copying libs into local build directory" 10 | mkdir -p ./lib/ 11 | cp ${HBASE_PATH}/lib/commons-logging* ./lib/ 12 | cp ${HBASE_PATH}/hbase-* ./lib/ 13 | cp ${HADOOP_PATH}/hadoop-*-core.jar ./lib/ 14 | cp ${HADOOP_PATH}/contrib/streaming/hadoop-streaming-*.jar ./lib/ 15 | 16 | 17 | echo "Building hadoopy_hbase.jar" 18 | ant 19 | 20 | echo "Copying hadoopy_hbase.jar into Hadoop library path" 21 | cp build/dist/hadoopy_hbase.jar ${HADOOP_PATH}/lib/hadoopy_hbase.jar 22 | 23 | echo "Restarting jobtracker and tasktracker" 24 | /etc/init.d/hadoop-0.20-jobtracker restart 25 | /etc/init.d/hadoop-0.20-tasktracker restart -------------------------------------------------------------------------------- /hadoopy_hbase/java/build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /hadoopy_hbase/java/build_cdh4.sh: -------------------------------------------------------------------------------- 1 | 2 | #echo "Copying HBASE libs to Hadoop library path (simple way so that it can find them)" 3 | #sudo cp -R ${HBASE_PATH}/lib/* ${HADOOP_HOME}/lib/ 4 | #sudo cp -R ${HBASE_PATH}/*.jar ${HADOOP_HOME}/lib/ 5 | 6 | echo "Copying libs into local build directory" 7 | mkdir -p ./lib/ 8 | echo $HBASE_HOME 9 | echo $HADOOP_HOME 10 | cp ${HBASE_HOME}/lib/commons-logging* ./lib/ 11 | 
cp ${HBASE_HOME}/hbase-* ./lib/ 12 | cp ${HADOOP_COMMONS_HOME}/*.jar ./lib/ 13 | cp ${HADOOP_HOME}/hadoop-*-core.jar ./lib/ 14 | cp ${HADOOP_HOME}/contrib/streaming/hadoop-streaming-*.jar ./lib/ 15 | cp /usr/share/java/commons-codec.jar ./lib/ 16 | 17 | 18 | echo "Building hadoopy_hbase.jar" 19 | ant 20 | 21 | echo "Copying hadoopy_hbase.jar into Hadoop library path" 22 | #cp build/dist/hadoopy_hbase.jar ${HADOOP_HOME}/lib/hadoopy_hbase.jar 23 | 24 | #echo "Restarting jobtracker and tasktracker" 25 | #/etc/init.d/hadoop-0.20-jobtracker restart 26 | #/etc/init.d/hadoop-0.20-tasktracker restart -------------------------------------------------------------------------------- /hadoopy_hbase/java/common.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /hadoopy_hbase/java/src/java/com/dappervision/hbase/mapred/TypedBytesTableInputFormat.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2010 The Apache Software Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package com.dappervision.hbase.mapred; 21 | 22 | import java.io.IOException; 23 | 24 | import org.apache.commons.logging.Log; 25 | import org.apache.commons.logging.LogFactory; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.hbase.HBaseConfiguration; 28 | import org.apache.hadoop.hbase.client.HTable; 29 | import org.apache.hadoop.hbase.util.Bytes; 30 | import org.apache.hadoop.mapred.FileInputFormat; 31 | import org.apache.hadoop.mapred.JobConf; 32 | import org.apache.hadoop.mapred.JobConfigurable; 33 | import org.apache.hadoop.hbase.filter.Filter; 34 | import org.apache.hadoop.util.StringUtils; 35 | import com.dappervision.hbase.mapred.TypedBytesTableInputFormatBase; 36 | import org.apache.hadoop.hbase.filter.RowFilter; 37 | import org.apache.hadoop.hbase.filter.RegexStringComparator; 38 | import org.apache.hadoop.hbase.filter.CompareFilter; 39 | import java.io.UnsupportedEncodingException; 40 | import org.apache.commons.codec.binary.Base64; 41 | 42 | import java.io.IOException; 43 | import java.util.HashMap; 44 | import java.util.HashSet; 45 | import java.util.Map; 46 | import java.util.Set; 47 | 48 | import org.apache.commons.logging.Log; 49 | import org.apache.commons.logging.LogFactory; 50 | import org.apache.hadoop.fs.Path; 51 | import org.apache.hadoop.hbase.HBaseConfiguration; 52 | import org.apache.hadoop.hbase.HConstants; 53 | import org.apache.hadoop.hbase.KeyValue; 54 | import org.apache.hadoop.hbase.UnknownScannerException; 55 | import org.apache.hadoop.hbase.client.HTable; 56 | import org.apache.hadoop.hbase.client.Result; 57 | import org.apache.hadoop.hbase.client.ResultScanner; 58 | import org.apache.hadoop.hbase.client.Scan; 59 | import org.apache.hadoop.hbase.mapred.TableSplit; 60 | import org.apache.hadoop.hbase.regionserver.HRegion; 61 | import org.apache.hadoop.hbase.util.Bytes; 62 | import org.apache.hadoop.mapred.FileInputFormat; 63 | import org.apache.hadoop.mapred.InputFormat; 64 | import org.apache.hadoop.mapred.InputSplit; 65 | import org.apache.hadoop.mapred.JobConf; 66 | import org.apache.hadoop.mapred.JobConfigurable; 67 | import org.apache.hadoop.mapred.RecordReader; 68 | import org.apache.hadoop.mapred.Reporter; 69 | import org.apache.hadoop.record.Buffer; 70 | import org.apache.hadoop.typedbytes.TypedBytesWritable; 71 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 72 | import org.apache.hadoop.util.StringUtils; 73 | import com.dappervision.hbase.mapred.TypedBytesTableRecordReader; 74 | import com.dappervision.hbase.mapred.TypedBytesTableRecordReaderSingleValue; 75 | 76 | 77 | 78 | 79 | /** 80 | * Convert HBase tabular data into a format that is consumable by Map/Reduce. 81 | */ 82 | @Deprecated 83 | public class TypedBytesTableInputFormat extends TypedBytesTableInputFormatBase implements 84 | JobConfigurable { 85 | private final Log LOG = LogFactory.getLog(TypedBytesTableInputFormat.class); 86 | 87 | 88 | /** 89 | * space delimited list of columns 90 | */ 91 | public static final String COLUMN_LIST = "hbase.mapred.tablecolumnsb64"; 92 | public static final String ROW_FILTER_REGEX = "hbase.mapred.rowfilter"; 93 | public static final String START_ROW = "hbase.mapred.startrowb64"; 94 | public static final String STOP_ROW = "hbase.mapred.stoprowb64"; 95 | public static final String VALUE_FORMAT = "hbase.mapred.valueformat"; 96 | 97 | private byte [][] inputColumns; 98 | private HTable table; 99 | private TypedBytesTableRecordReader tableRecordReader; 100 | 101 | 102 | /** 103 | * Builds a TableRecordReader. 
If no TableRecordReader was provided, uses 104 | * the default. 105 | * 106 | * @see org.apache.hadoop.mapred.InputFormat#getRecordReader(InputSplit, 107 | * JobConf, Reporter) 108 | */ 109 | 110 | public void configure(JobConf job) { 111 | Path[] tableNames = FileInputFormat.getInputPaths(job); 112 | String colArg = job.get(COLUMN_LIST); 113 | String[] colNames = colArg.split(" "); 114 | byte [][] m_cols = new byte[colNames.length][]; 115 | for (int i = 0; i < m_cols.length; i++) { 116 | m_cols[i] = Base64.decodeBase64(Bytes.toBytes(colNames[i])); 117 | } 118 | setInputColumns(m_cols); 119 | if (job.get(ROW_FILTER_REGEX) != null) { 120 | LOG.info("Row Regex Filter[" + job.get(ROW_FILTER_REGEX) + "]"); 121 | setRowFilter(new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator(job.get(ROW_FILTER_REGEX)))); 122 | } 123 | if (job.get(START_ROW) != null) { 124 | LOG.info("Start Row[" + job.get(START_ROW) + "]"); 125 | try { 126 | setStartRow(Base64.decodeBase64(job.get(START_ROW).getBytes("US-ASCII"))); 127 | } catch( UnsupportedEncodingException e){ 128 | LOG.error("Start Row[" + job.get(START_ROW) + "] - Error"); 129 | } 130 | } 131 | if (job.get(STOP_ROW) != null) { 132 | LOG.info("Stop Row[" + job.get(STOP_ROW) + "]"); 133 | try { 134 | setStopRow(Base64.decodeBase64(job.get(STOP_ROW).getBytes("US-ASCII"))); 135 | } catch( UnsupportedEncodingException e){ 136 | LOG.error("Stop Row[" + job.get(STOP_ROW) + "] - Error"); 137 | } 138 | } 139 | try { 140 | setHTable(new HTable(HBaseConfiguration.create(job), tableNames[0].getName())); 141 | } catch (Exception e) { 142 | LOG.error(StringUtils.stringifyException(e)); 143 | } 144 | if (job.get(VALUE_FORMAT) != null && job.get(VALUE_FORMAT).equalsIgnoreCase("singlevalue")) { 145 | LOG.info("Value Format[" + job.get(VALUE_FORMAT) + "]"); 146 | super.setTableRecordReader(new TypedBytesTableRecordReaderSingleValue()); 147 | } else { 148 | LOG.info("Value Format[familiescolumns]"); 149 | super.setTableRecordReader(new TypedBytesTableRecordReader()); 150 | } 151 | } 152 | 153 | public void validateInput(JobConf job) throws IOException { 154 | // expecting exactly one path 155 | Path [] tableNames = FileInputFormat.getInputPaths(job); 156 | if (tableNames == null || tableNames.length > 1) { 157 | throw new IOException("expecting one table name"); 158 | } 159 | 160 | // connected to table? 161 | if (getHTable() == null) { 162 | throw new IOException("could not connect to table '" + 163 | tableNames[0].getName() + "'"); 164 | } 165 | 166 | // expecting at least one column 167 | String colArg = job.get(COLUMN_LIST); 168 | if (colArg == null || colArg.length() == 0) { 169 | throw new IOException("expecting at least one column"); 170 | } 171 | } 172 | } -------------------------------------------------------------------------------- /hadoopy_hbase/java/src/java/com/dappervision/hbase/mapred/TypedBytesTableRecordReader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2010 The Apache Software Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | package com.dappervision.hbase.mapred; 21 | 22 | import java.io.IOException; 23 | 24 | import org.apache.hadoop.hbase.client.HTable; 25 | import org.apache.hadoop.hbase.client.Result; 26 | import org.apache.hadoop.hbase.filter.Filter; 27 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 28 | import org.apache.hadoop.io.BytesWritable; 29 | import org.apache.hadoop.mapred.RecordReader; 30 | import org.apache.hadoop.typedbytes.TypedBytesOutput; 31 | import org.apache.hadoop.typedbytes.TypedBytesWritable; 32 | import org.apache.hadoop.hbase.mapred.TableRecordReaderImpl; 33 | import java.io.ByteArrayOutputStream; 34 | import java.io.DataOutputStream; 35 | import org.apache.hadoop.record.Buffer; 36 | import java.util.TreeMap; 37 | import java.util.Map; 38 | import java.util.NavigableMap; 39 | 40 | /** 41 | * Iterate over an HBase table data, return (Text, RowResult) pairs 42 | */ 43 | public class TypedBytesTableRecordReader 44 | implements RecordReader { 45 | 46 | protected TableRecordReaderImpl recordReaderImpl = new TableRecordReaderImpl(); 47 | 48 | /** 49 | * Restart from survivable exceptions by creating a new scanner. 50 | * 51 | * @param firstRow 52 | * @throws IOException 53 | */ 54 | public void restart(byte[] firstRow) throws IOException { 55 | this.recordReaderImpl.restart(firstRow); 56 | } 57 | 58 | /** 59 | * Build the scanner. Not done in constructor to allow for extension. 60 | * 61 | * @throws IOException 62 | */ 63 | public void init() throws IOException { 64 | this.recordReaderImpl.init(); 65 | } 66 | 67 | /** 68 | * @param htable the {@link HTable} to scan. 69 | */ 70 | public void setHTable(HTable htable) { 71 | this.recordReaderImpl.setHTable(htable); 72 | } 73 | 74 | /** 75 | * @param inputColumns the columns to be placed in {@link TypedBytesWritable}. 76 | */ 77 | public void setInputColumns(final byte [][] inputColumns) { 78 | this.recordReaderImpl.setInputColumns(inputColumns); 79 | } 80 | 81 | /** 82 | * @param startRow the first row in the split 83 | */ 84 | public void setStartRow(final byte [] startRow) { 85 | this.recordReaderImpl.setStartRow(startRow); 86 | } 87 | 88 | /** 89 | * 90 | * @param endRow the last row in the split 91 | */ 92 | public void setEndRow(final byte [] endRow) { 93 | this.recordReaderImpl.setEndRow(endRow); 94 | } 95 | 96 | /** 97 | * @param rowFilter the {@link Filter} to be used. 
98 | */ 99 | public void setRowFilter(Filter rowFilter) { 100 | this.recordReaderImpl.setRowFilter(rowFilter); 101 | } 102 | 103 | public void close() { 104 | this.recordReaderImpl.close(); 105 | } 106 | 107 | /** 108 | * @return TypedBytesWritable 109 | * 110 | * @see org.apache.hadoop.mapred.RecordReader#createKey() 111 | */ 112 | public TypedBytesWritable createKey() { 113 | //return this.recordReaderImpl.createKey(); 114 | return new TypedBytesWritable(); 115 | } 116 | 117 | /** 118 | * @return RowTypedBytesWritable 119 | * 120 | * @see org.apache.hadoop.mapred.RecordReader#createValue() 121 | */ 122 | public TypedBytesWritable createValue() { 123 | //return this.recordReaderImpl.createValue(); 124 | return new TypedBytesWritable(); 125 | } 126 | 127 | public long getPos() { 128 | 129 | // This should be the ordinal tuple in the range; 130 | // not clear how to calculate... 131 | return this.recordReaderImpl.getPos(); 132 | } 133 | 134 | public float getProgress() { 135 | // Depends on the total number of tuples and getPos 136 | return this.recordReaderImpl.getPos(); 137 | } 138 | 139 | /** 140 | * @param key HStoreKey as input key. 141 | * @param value MapWritable as input value 142 | * @return true if there was more data 143 | * @throws IOException 144 | */ 145 | public boolean next(TypedBytesWritable key, TypedBytesWritable value) 146 | throws IOException { 147 | ImmutableBytesWritable key0 = new ImmutableBytesWritable(); 148 | Result value0 = new Result(); 149 | boolean out = this.recordReaderImpl.next(key0, value0); 150 | if (out) { 151 | TreeMap tm = new TreeMap(); 152 | for (Map.Entry> entry : value0.getNoVersionMap().entrySet()) { 153 | TreeMap tm_inner = new TreeMap(); 154 | for (Map.Entry entry0 : entry.getValue().entrySet()) { 155 | tm_inner.put(new Buffer(entry0.getKey()), new Buffer(entry0.getValue())); 156 | } 157 | tm.put(new Buffer(entry.getKey()), tm_inner); 158 | } 159 | key.setValue(new Buffer(key0.get())); 160 | value.setValue(tm); 161 | } 162 | return out; 163 | 164 | } 165 | } -------------------------------------------------------------------------------- /hadoopy_hbase/java/src/java/com/dappervision/hbase/mapred/TypedBytesTableRecordReaderSingleValue.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2010 The Apache Software Foundation 3 | * 4 | * Licensed to the Apache Software Foundation (ASF) under one 5 | * or more contributor license agreements. See the NOTICE file 6 | * distributed with this work for additional information 7 | * regarding copyright ownership. The ASF licenses this file 8 | * to you under the Apache License, Version 2.0 (the 9 | * "License"); you may not use this file except in compliance 10 | * with the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | */ 20 | package com.dappervision.hbase.mapred; 21 | 22 | import java.io.IOException; 23 | 24 | import org.apache.hadoop.hbase.client.HTable; 25 | import org.apache.hadoop.hbase.client.Result; 26 | import org.apache.hadoop.hbase.filter.Filter; 27 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable; 28 | import org.apache.hadoop.io.BytesWritable; 29 | import org.apache.hadoop.mapred.RecordReader; 30 | import org.apache.hadoop.typedbytes.TypedBytesOutput; 31 | import org.apache.hadoop.typedbytes.TypedBytesWritable; 32 | import org.apache.hadoop.hbase.mapred.TableRecordReaderImpl; 33 | import java.io.ByteArrayOutputStream; 34 | import java.io.DataOutputStream; 35 | import org.apache.hadoop.record.Buffer; 36 | 37 | public class TypedBytesTableRecordReaderSingleValue extends TypedBytesTableRecordReader { 38 | /** 39 | * @param key HStoreKey as input key. 40 | * @param value MapWritable as input value 41 | * @return true if there was more data 42 | * @throws IOException 43 | */ 44 | public boolean next(TypedBytesWritable key, TypedBytesWritable value) 45 | throws IOException { 46 | ImmutableBytesWritable key0 = new ImmutableBytesWritable(); 47 | Result value0 = new Result(); 48 | boolean out = this.recordReaderImpl.next(key0, value0); 49 | if (out) { 50 | byte [] value_byte = value0.value(); 51 | if (value_byte == null) { 52 | throw new IOException("SingleValue requires at least one column to be present for each row, this should not be possible!"); 53 | } 54 | key.setValue(new Buffer(key0.get())); 55 | value.setValue(new Buffer(value_byte)); 56 | } 57 | return out; 58 | 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /hadoopy_hbase/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup(name='hadoopy_hbase', 4 | version='.01', 5 | packages=['hadoopy_hbase', 'hadoopy_hbase.hbase']) 6 | -------------------------------------------------------------------------------- /hadoopy_hbase/tests/auth.py: -------------------------------------------------------------------------------- 1 | import bottle 2 | import base64 3 | import random 4 | import os 5 | 6 | 7 | AUTH_KEY = os.environ.get('AUTH_KEY') 8 | 9 | 10 | def _make_key(l=16): 11 | global AUTH_KEY 12 | s = hex(random.getrandbits(8 * l))[2:] 13 | if s[-1] == 'L': 14 | s = s[:-1] 15 | # Pad with zeros 16 | if len(s) != l * 2: 17 | s = '0' * (2 * l - len(s)) + s 18 | AUTH_KEY = base64.urlsafe_b64encode(s.decode('hex')).rstrip('=') 19 | 20 | 21 | def verify(func): 22 | 23 | def inner(*args, **kw): 24 | if not bottle.request.path.startswith('/%s/' % AUTH_KEY): 25 | bottle.abort(401) 26 | return func(*args, **kw) 27 | if AUTH_KEY is None: 28 | _make_key() 29 | print('AUTH_KEY: /%s/' % AUTH_KEY) 30 | return inner 31 | -------------------------------------------------------------------------------- /hadoopy_hbase/tests/flickr_count.py: -------------------------------------------------------------------------------- 1 | import hadoopy_hbase 2 | import time 3 | 4 | c = hadoopy_hbase.connect('localhost') 5 | cnt = 0 6 | st = time.time() 7 | N = 5000 8 | for x in hadoopy_hbase.scanner(c, 'flickr', per_call=N, columns=['metadata:license']): 9 | cnt += 1 10 | if cnt % N == 0: 11 | print(((time.time() - st) / N, cnt)) 12 | st = time.time() 13 | -------------------------------------------------------------------------------- /hadoopy_hbase/tests/flickr_count_hadoop.py: 
-------------------------------------------------------------------------------- 1 | import hadoopy 2 | import hadoopy_hbase 3 | import time 4 | import logging 5 | logging.basicConfig(level=logging.DEBUG) 6 | 7 | st = time.time() 8 | 9 | # NOTE(brandyn): If launch fails, you may need to use launch_frozen see hadoopy.com for details 10 | 11 | out = 'out-%f/0' % st 12 | jobconfs = ['mapred.map.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec', 13 | 'mapred.compress.map.output=true', 14 | 'mapred.output.compression.type=BLOCK'] 15 | hadoopy_hbase.launch('flickr', out, 'identity_hbase_job.py', libjars=['hadoopy_hbase.jar'], 16 | num_mappers=8, columns=['metadata:'], jobconfs=jobconfs) 17 | #results = dict(hadoopy.readtb(out)) 18 | #print(results) 19 | -------------------------------------------------------------------------------- /hadoopy_hbase/tests/flickr_count_job.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import hadoopy 3 | 4 | 5 | def mapper(row, column_families): 6 | yield 'num_rows', 1 7 | 8 | def reducer(key, values): 9 | yield key, sum(values) 10 | 11 | if __name__ == '__main__': 12 | hadoopy.run(mapper, reducer) 13 | -------------------------------------------------------------------------------- /hadoopy_hbase/tests/flickr_crawl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import gevent.monkey 3 | gevent.monkey.patch_all() 4 | import hadoopy_hbase 5 | from hadoopy_hbase import BatchMutation, Mutation, ColumnDescriptor 6 | from thrift_bench import random_string, remove_table 7 | import vision_data 8 | import random 9 | #import multiprocessing 10 | import time 11 | 12 | def main(): 13 | #tags = ' animals architecture art asia australia autumn baby band barcelona beach berlin bike bird birds birthday black blackandwhite blue bw california canada canon car cat chicago china christmas church city clouds color concert dance day de dog england europe fall family fashion festival film florida flower flowers food football france friends fun garden geotagged germany girl graffiti green halloween hawaii holiday house india instagramapp iphone iphoneography island italia italy japan kids la lake landscape light live london love macro me mexico model museum music nature new newyork newyorkcity night nikon nyc ocean old paris park party people photo photography photos portrait raw red river rock san sanfrancisco scotland sea seattle show sky snow spain spring square squareformat street summer sun sunset taiwan texas thailand tokyo travel tree trees trip uk unitedstates urban usa vacation vintage washington water wedding white winter woman yellow zoo '.strip().split() 14 | tags = ['Pyramids Of Giza', 'Great Wall Of China', 'Terracotta Warriors', 'Statue Of Liberty', 'Edinburgh Castle', 'Stirling Castle', 'Empire State Building', 'Stonehenge', 'Blackpool Tower', 'London Bridge', 'Tower Bridge', 'Buckinghampalace', 'Sphinx', 'Eiffle Tower', 'Arc Du Triomph', 'Louvre', 'Cristo Redentor', 'CN Tower', 'Norte Dame', 'River Nile', 'Mount Rushmore', 'Pentagon', 'White House', 'Lincoln Memorial', 'Grand Canyon', 'Leaning Tower Of Piza', 'Easter Island Heads', 'Niagara Falls', 'Abbey Road', 'Ayers Rock', 'Evangeline Oak', 'Lone Cyprus', 'Golden Gate Bridge', 'Colosseum', 'Taj Mahal', 'Santorini'] 15 | client = hadoopy_hbase.connect('localhost') 16 | random.shuffle(tags) 17 | flickr = vision_data.Flickr(max_iters=1) 18 | #remove_table(client, 'flickr') 
19 | #client.createTable('flickr', [ColumnDescriptor('metadata:'), ColumnDescriptor('images:')]) 20 | while True: 21 | for tag in tags: 22 | mutations = [] 23 | try: 24 | for url_m, metadata in flickr.image_class_meta_url(tag): 25 | mutations.append(BatchMutation(row=url_m, mutations=[Mutation(column='metadata:%s' % x, value=y.encode('utf-8')) 26 | for x, y in metadata.items()])) 27 | except Exception, e: 28 | print(e) 29 | continue 30 | st = time.time() 31 | client.mutateRows('flickr', mutations) 32 | if mutations: 33 | print((tag, (time.time() - st) / len(mutations), len(mutations))) 34 | else: 35 | print((tag, 0., len(mutations))) 36 | 37 | 38 | def display(): 39 | client = hadoopy_hbase.connect('localhost') 40 | for x in hadoopy_hbase.scanner(client, 'flickr', ['metadata:title']): 41 | print(x) 42 | 43 | if __name__ == '__main__': 44 | gevent.joinall([gevent.spawn(main) for x in range(30)]) 45 | -------------------------------------------------------------------------------- /hadoopy_hbase/tests/hbase_test.py: -------------------------------------------------------------------------------- 1 | import hadoopy 2 | import hadoopy_hbase 3 | import time 4 | import logging 5 | logging.basicConfig(level=logging.DEBUG) 6 | 7 | st = time.time() 8 | 9 | # NOTE(brandyn): If launch fails, you may need to use launch_frozen see hadoopy.com for details 10 | #, 11 | # 12 | out = 'out-%f/3' % st 13 | hadoopy_hbase.launch('testtable', out, 'hbase_test_job.py', columns=['colfam1:'], libjars=['hadoopy_hbase.jar'], start_row='5', stop_row='52') 14 | results = hadoopy.readtb(out) 15 | print list(results)[:10] 16 | 17 | out = 'out-%f/1' % st 18 | hadoopy_hbase.launch('testtable', out, 'hbase_test_job.py', columns=['colfam1:'], libjars=['hadoopy_hbase.jar'], jobconfs={'hbase.mapred.rowfilter': '.*3'}) 19 | results = hadoopy.readtb(out) 20 | print list(results)[:10] 21 | 22 | out = 'out-%f/0' % st 23 | hadoopy_hbase.launch('testtable', out, 'hbase_test_job.py', columns=['colfam1:'], libjars=['hadoopy_hbase.jar']) 24 | results = hadoopy.readtb(out) 25 | print list(results)[:10] 26 | 27 | out = 'out-%f/2' % st 28 | hadoopy_hbase.launch('testtable', out, 'hbase_test_job2.py', columns=['colfam1:'], libjars=['hadoopy_hbase.jar']) 29 | results = hadoopy.readtb(out) 30 | print list(results)[:10] 31 | -------------------------------------------------------------------------------- /hadoopy_hbase/tests/hbase_test_job.py: -------------------------------------------------------------------------------- 1 | import hadoopy 2 | 3 | def mapper(k, v): 4 | #yield 'KEY[%s]' % k, 'VALUE[%s]' % v 5 | yield k, v 6 | 7 | 8 | if __name__ == '__main__': 9 | hadoopy.run(mapper) 10 | -------------------------------------------------------------------------------- /hadoopy_hbase/tests/hbase_test_job2.py: -------------------------------------------------------------------------------- 1 | import hadoopy 2 | 3 | 4 | def mapper(row, column_families): 5 | for column_fam, columns in column_families.items(): 6 | for column, data in columns.items(): 7 | yield row, (column_fam, column, data) 8 | 9 | if __name__ == '__main__': 10 | hadoopy.run(mapper) 11 | -------------------------------------------------------------------------------- /hadoopy_hbase/tests/server.py: -------------------------------------------------------------------------------- 1 | from gevent import monkey 2 | monkey.patch_all() 3 | import bottle 4 | import os 5 | import argparse 6 | import random 7 | import base64 8 | from auth import verify 9 | from flickr_crawl import 
setup, scanner 10 | import itertools 11 | import time 12 | 13 | START_ROW = '' 14 | 15 | @bottle.route('/:auth_key#[a-zA-Z0-9\_\-]+#/') 16 | @verify 17 | def main(auth_key): 18 | global START_ROW 19 | st = time.time() 20 | x = '' 21 | images = ['' % y['metadata:url_s'] for x, y in itertools.islice(scanner(client, 'flickr', ['metadata:url_s'], per_call=100, start_row=START_ROW), 100)] 22 | START_ROW = x 23 | run_time = time.time() - st 24 | return ('%d-%f
<br>' % (len(images), run_time)) + '<br>
'.join(images) 25 | 26 | 27 | 28 | if __name__ == "__main__": 29 | parser = argparse.ArgumentParser(description="Serve a directory") 30 | 31 | # Server port 32 | parser.add_argument('--port', type=str, help='bottle.run webpy on this port', 33 | default='8080') 34 | ARGS = parser.parse_args() 35 | client = setup() 36 | bottle.run(host='0.0.0.0', port=ARGS.port, server='gevent') 37 | -------------------------------------------------------------------------------- /hadoopy_hbase/tests/thrift_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from thrift.transport.TSocket import TSocket 3 | from thrift.transport.TTransport import TBufferedTransport 4 | from thrift.protocol import TBinaryProtocol 5 | from thrift_bench import random_string, remove_table 6 | import hadoopy_hbase 7 | 8 | client = hadoopy_hbase.connect('localhost') 9 | remove_table(client, 'testtable') 10 | client.createTable('testtable', [hadoopy_hbase.ColumnDescriptor('colfam1:')]) 11 | 12 | for x in xrange(100): 13 | client.mutateRow('testtable', str(x), [hadoopy_hbase.Mutation(column='colfam1:col%d' % y, value=random_string(5)) for y in range(10)]) 14 | print(client.getRow('testtable', '0')) 15 | -------------------------------------------------------------------------------- /pkg/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: rmr2 2 | Type: Package 3 | Title: R and Hadoop Streaming Connector 4 | Version: 3.3.1 5 | Date: 2015-2-10 6 | Author: Revolution Analytics 7 | Depends: R (>= 2.6.0), methods 8 | Imports: Rcpp, RJSONIO (>= 0.8-2), digest, functional, reshape2, stringr, plyr, caTools (>= 1.16) 9 | Suggests: quickcheck (>= 3.1.0), ravro, rhdfs, testthat 10 | Collate: basic.R extras.R hdfs.R keyval.R IO.R local.R mapreduce.R parse-url.R quickcheck-rmr.R streaming.R 11 | Maintainer: Revolution Analytics 12 | Description: Supports the map reduce programming model on top of hadoop streaming 13 | License: Apache License (== 2.0) 14 | ByteCompile: TRUE 15 | BuildVignettes: FALSE 16 | -------------------------------------------------------------------------------- /pkg/NAMESPACE: -------------------------------------------------------------------------------- 1 | useDynLib(rmr2) 2 | export(mapreduce) 3 | export(from.dfs, to.dfs) 4 | export(equijoin) 5 | export(scatter, gather, rmr.sample) 6 | export(dfs.empty, dfs.size, dfs.exists, dfs.rmr, dfs.mv, dfs.ls) 7 | export(rmr.options) 8 | export(keyval, keys, values, c.keyval) 9 | export(make.input.format, make.output.format) 10 | export(to.map, to.reduce) 11 | export(rmr.str) 12 | export(status, increment.counter) 13 | export(vsum) 14 | 15 | S3method(gorder, default) 16 | S3method(gorder, factor) 17 | S3method(gorder, data.frame) 18 | S3method(gorder, matrix) 19 | S3method(gorder, raw) 20 | 21 | S3method(deraw, data.frame) 22 | S3method(deraw, matrix) 23 | S3method(deraw, raw) 24 | S3method(deraw, default) 25 | 26 | importFrom(functional, Curry) 27 | importFrom(plyr, splat) 28 | importFrom(plyr, quickdf) 29 | import(Rcpp) 30 | importFrom(stringr, str_detect) 31 | importFrom(stringr, str_match) 32 | importFrom(stringr, str_replace) 33 | importFrom(stringr, str_split) 34 | importFrom(digest, digest) 35 | importFrom(reshape2, dcast) 36 | importFrom(caTools, base64encode) 37 | importFrom(RJSONIO, fromJSON) 38 | importFrom(RJSONIO, toJSON) 39 | import(methods) 40 | -------------------------------------------------------------------------------- /pkg/R/basic.R: 
-------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | #string 16 | 17 | qw = function(...) as.character(match.call())[-1] 18 | 19 | #assignment 20 | 21 | default = 22 | function(x, value, bad.value = is.null) { 23 | test = if(is.function(bad.value)) bad.value(x) else identical(bad.value, x) 24 | if(test) value else x} 25 | 26 | #functional 27 | 28 | Make.single.arg = 29 | function(f) 30 | function(x) do.call(f, x) 31 | 32 | Make.multi.arg = 33 | function(f) 34 | function(...) f(list(...)) 35 | 36 | Make.single.or.multi.arg = function(f, from = c("single", "multi")) { 37 | from = match.arg(from) 38 | if (from == "single") { 39 | f.single = f 40 | f.multi = Make.multi.arg(f)} 41 | else { 42 | f.single = Make.single.arg(f) 43 | f.multi = f} 44 | function(...) { 45 | args = list(...) 46 | if(length(args) == 1) 47 | f.single(args[[1]]) 48 | else 49 | f.multi(...)}} 50 | 51 | all.predicate = function(x, P) all(sapply(x, P)) 52 | 53 | #data structures 54 | 55 | make.fast.list = function(l = list()) { 56 | l1 = l 57 | l2 = list(NULL) 58 | i = 1 59 | function(els = NULL){ 60 | if(missing(els)) c(l1, l2[!sapply(l2, is.null)]) 61 | else{ 62 | if(i + length(els) - 1 > length(l2)) { 63 | l1 <<- c(l1, l2[!sapply(l2, is.null)]) 64 | i <<- 1 65 | l2 <<- rep(list(NULL), length(l1) + length(els))} 66 | l2[i:(i + length(els) - 1)] <<- els 67 | i <<- i + length(els)}}} 68 | 69 | named.slice = function(x, n) x[which(names(x) == n)] 70 | 71 | mapply.list = 72 | function(...) mapply(FUN = list, ..., SIMPLIFY = FALSE) 73 | 74 | t.list = 75 | function(l) { 76 | if(length(l) == 0) l 77 | else 78 | .Call( 79 | "t_list", 80 | if(!all(sapply.is.list(l))) 81 | lapply(l, as.list) 82 | else l, 83 | PACKAGE = "rmr2")} 84 | 85 | #data frame manip 86 | 87 | sane.c = 88 | function(...) { 89 | if(all(are.factor(list(...)))) 90 | unlist(list(...)) 91 | else 92 | c(...)} 93 | 94 | rbind.fill.fast = 95 | function(...) { 96 | xx = list(...) 
97 | cols = unique(unlist(lapply(xx, names))) 98 | ll = 99 | lapply( 100 | cols, 101 | function(n) 102 | do.call( 103 | sane.c, 104 | lapply( 105 | xx, 106 | function(x){ 107 | if(is.null(x[[n]])) 108 | rep(NA, nrow(x)) 109 | else 110 | x[[n]]}))) 111 | names(ll) = cols 112 | do.call( 113 | data.frame, 114 | c( 115 | lapply( 116 | ll, 117 | function(x) 118 | if (is.atomic(x)) x 119 | else I(x)), 120 | stringsAsFactors = FALSE))} 121 | 122 | 123 | 124 | every.second = 125 | function(pattern) 126 | function(x) { 127 | opt = options("warn")[[1]] 128 | options(warn = -1) 129 | y = x[pattern] 130 | options(warn = opt) 131 | y} 132 | 133 | odd = every.second(c(TRUE, FALSE)) 134 | even = every.second(c(FALSE, TRUE)) 135 | 136 | interleave = 137 | function(l1, l2) { 138 | l = list() 139 | l[2*(1:length(l1)) - 1] = l1 140 | l[2*(1:length(l1))] = l2 141 | l} 142 | 143 | #con 144 | -------------------------------------------------------------------------------- /pkg/R/extras.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ## push a file through this to get as many partitions as possible (depending on system settings) 16 | ## data is unchanged 17 | 18 | scatter = function(input, output = NULL, ...) 19 | mapreduce(input, 20 | output, 21 | map = function(k, v) keyval(runif(1), v), 22 | reduce = function(k, vv) vv, 23 | ...) 24 | 25 | gather = function(input, output = NULL, ...) { 26 | backend.parameters = list(...)['backend.prameters'] 27 | backend.parameters$hadoop = append(backend.parameters$hadoop, list(D='mapred.reduce.tasks=1')) 28 | mapreduce(input, 29 | output, 30 | backend.parameters = backend.parameters, 31 | ...)} 32 | 33 | #sampling 34 | 35 | rmr.sample = function(input, output = NULL, method = c("any", "Bernoulli"), ...) 
{ 36 | method = match.arg(method) 37 | if (method == "any") { 38 | n = list(...)[['n']] 39 | some = function(k, v) 40 | keyval( 41 | if(is.null(k)) 42 | list(NULL) 43 | else 44 | rmr.slice(k, 1:min(n, rmr.length(k))), 45 | rmr.slice(v, 1:min(n, rmr.length(v)))) 46 | mapreduce(input, 47 | output, 48 | map = some, 49 | combine = TRUE, 50 | reduce = some)} 51 | else 52 | if(method == "Bernoulli"){ 53 | p = list(...)[['p']] 54 | mapreduce(input, 55 | output, 56 | map = function(k, v) { 57 | filter = rbinom(rmr.length(v), 1, p) == 1 58 | keyval(rmr.slice(k, filter), 59 | rmr.slice(v, filter))})}} 60 | 61 | ## map and reduce generators 62 | 63 | partitioned.map = 64 | function(map, n) 65 | function(k,v) { 66 | kv = map(k,v) 67 | keyval( 68 | data.frame( 69 | sample( 70 | 1:n, size=length(k), 71 | replace = TRUE), k), 72 | v)} 73 | 74 | partitioned.combine = 75 | function(reduce) 76 | function(k,vv) { 77 | kv = reduce(k,vv) 78 | keyval(k[,-1], vv)} 79 | 80 | ## fast aggregate functions 81 | 82 | vsum = 83 | function(x) { 84 | if(is.list(x)) 85 | .Call("vsum", x, PACKAGE = "rmr2") 86 | else 87 | stop(paste("can't vsum a ", class(x)))} 88 | 89 | ## dev support 90 | 91 | reload = 92 | function() { 93 | detach("package:rmr2", unload = TRUE) 94 | library.dynam.unload("rmr2",system.file(package="rmr2")) 95 | library(rmr2)} 96 | 97 | rmr.str = 98 | function(x, ...) { 99 | sc = sys.calls() 100 | message( 101 | paste( 102 | c( 103 | capture.output( 104 | str(sc)), 105 | match.call() [[2]], 106 | capture.output(str(x, ...))), 107 | collapse="\n")) 108 | x} 109 | -------------------------------------------------------------------------------- /pkg/R/hdfs.R: -------------------------------------------------------------------------------- 1 | hdfs.ls = 2 | function(fname) 3 | read.table( 4 | textConnection(hdfs("ls", fname, intern = TRUE)), 5 | skip=1, 6 | col.names=c("permissions", "links", "owner", "group", "size", "date", "time", "path"), 7 | stringsAsFactors = FALSE) 8 | hdfs.exists = 9 | function(fname) 10 | hdfs("test -e", fname, test = TRUE) 11 | test.rmr = 12 | function() { 13 | length( 14 | suppressWarnings( 15 | hdfs("- 2>&1 | grep rmr", intern = TRUE))) > 0} 16 | 17 | hdfs.rmr = 18 | (function() { 19 | rmr = NULL 20 | function(fname) { 21 | if(is.null(rmr)) 22 | rmr <<- test.rmr() 23 | if(rmr) 24 | hdfs("rmr", fname) 25 | else 26 | hdfs("rm -r", fname)}})() 27 | hdfs.isdir = 28 | function(fname) { 29 | if(.Platform$OS.type == "windows") 30 | length(grep(pattern = "^Found", hdfs("ls", fname, intern = TRUE))) == 1 31 | else 32 | hdfs("test -d", fname, test = TRUE)} 33 | hdfs.mv = 34 | function(src, dst) 35 | hdfs("mv", src, dst) 36 | hdfs.mkdir = 37 | function(fname) 38 | hdfs("mkdir", fname) 39 | hdfs.put = 40 | function(src, dst) 41 | hdfs("put", src, dst) 42 | hdfs.get = 43 | function(src, dst) 44 | hdfs("get", src, dst) 45 | 46 | hdfs = 47 | function(cmd, ..., intern = FALSE, test = FALSE) { 48 | retval = 49 | system( 50 | paste( 51 | hdfs.cmd(), 52 | "dfs", 53 | paste("-", cmd, sep = ""), 54 | paste( 55 | sapply( 56 | list(...), 57 | rmr.normalize.path), 58 | collapse=" ")), 59 | intern = intern) 60 | if(intern) 61 | retval 62 | else{ 63 | if(test) 64 | retval == 0 65 | else { 66 | stopifnot(retval == 0) 67 | NULL }}} -------------------------------------------------------------------------------- /pkg/R/local.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 
(the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | mr.local = function( 16 | in.folder, 17 | out.folder, 18 | map, 19 | reduce, 20 | vectorized.reduce, 21 | combine, 22 | in.memory.combine, 23 | input.format, 24 | output.format, 25 | backend.parameters, 26 | verbose) { 27 | 28 | profile.nodes = rmr.options("profile.nodes") 29 | get.data = 30 | function(fname) { 31 | environment(input.format$format) = 32 | list2env(as.list(environment(input.format$format))) 33 | kv = from.dfs(fname, format = input.format) 34 | kv} 35 | map.out = 36 | c.keyval( 37 | do.call( 38 | c, 39 | lapply( 40 | in.folder, 41 | function(fname) { 42 | kv = get.data(fname) 43 | Sys.setenv(mapreduce_map_input_file = fname) 44 | lkv = length.keyval(kv) 45 | unname( 46 | tapply( 47 | 1:lkv, 48 | ceiling((1:lkv)/(lkv/(object.size(kv)/10^6))), #make this constant configurable? 49 | function(r) { 50 | kvr = slice.keyval(kv, r) 51 | as.keyval(map(keys(kvr), values(kvr)))}, 52 | simplify = FALSE))}))) 53 | map.out = from.dfs(to.dfs(map.out)) 54 | reduce.helper = 55 | function(kk, vv) as.keyval(reduce(rmr.slice(kk, 1), vv)) 56 | reduce.out = { 57 | if(!is.null(reduce)){ 58 | if(!vectorized.reduce){ 59 | c.keyval( 60 | reduce.keyval( 61 | map.out, 62 | reduce.helper))} 63 | else{ 64 | as.keyval( 65 | reduce( 66 | keys(map.out), 67 | values(map.out)))}} 68 | else 69 | map.out} 70 | to.dfs(reduce.out, out.folder, format = output.format) 71 | NULL} 72 | -------------------------------------------------------------------------------- /pkg/R/parse-url.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # Factored out from the httr package https://github.com/hadley/httr 3 | # Originally under the MIT license 4 | # Original author Hadley Wickham 5 | # No Copyright information found in the original. 6 | 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | 19 | 20 | parse_url <- function(url) { 21 | 22 | url <- as.character(url) 23 | stopifnot(length(url) == 1) 24 | 25 | pull_off <- function(pattern) { 26 | if (!str_detect(url, pattern)) return(NULL) 27 | 28 | piece <- str_match(url, pattern)[, 2] 29 | url <<- str_replace(url, pattern, "") 30 | 31 | piece 32 | } 33 | 34 | fragment <- pull_off("#(.*)$") 35 | scheme <- pull_off("^([[:alpha:]+.-]+):") 36 | netloc <- pull_off("^//([^/]*)/?") 37 | 38 | if (!is.null(netloc)) { 39 | 40 | pieces <- str_split(netloc, "@")[[1]] 41 | if (length(pieces) == 1) { 42 | username <- NULL 43 | password <- NULL 44 | 45 | host <- pieces 46 | } else { 47 | user_pass <- str_split(pieces[[1]], ":")[[1]] 48 | username <- user_pass[1] 49 | password <- user_pass[2] 50 | 51 | host <- pieces[2] 52 | } 53 | 54 | host_pieces <- str_split(host, ":")[[1]] 55 | hostname <- host_pieces[1] 56 | port <- if (length(host_pieces) > 1) host_pieces[2] 57 | } else { 58 | port <- username <- password <- hostname <- NULL 59 | } 60 | 61 | structure(list( 62 | scheme = scheme, hostname = hostname, port = port, path = url, 63 | username = username, password = password), 64 | class = "url") 65 | } 66 | -------------------------------------------------------------------------------- /pkg/R/quickcheck-rmr.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | ##app-specific generators 16 | if(require(quickcheck)){ 17 | 18 | curry.size = 19 | function(gen, size) { 20 | force(gen) 21 | Curry(gen, size = size)} 22 | 23 | curry.nrow = 24 | function(gen, nrow) { 25 | force(gen) 26 | Curry(gen, nrow = nrow, ncol = c(min = 1))} 27 | 28 | rrmr.data = 29 | function(size = c(min = 0, max = quickcheck::default(vector.size %||% 5 * severity))) 30 | quickcheck::mixture( 31 | generators = 32 | c( 33 | lapply( 34 | list( 35 | quickcheck::rlogical, 36 | quickcheck::rinteger, 37 | quickcheck::rdouble, 38 | quickcheck::rcharacter, 39 | quickcheck::rraw, 40 | quickcheck::rfactor, 41 | quickcheck::rlist), 42 | curry.size, size = size), 43 | lapply( 44 | list( 45 | quickcheck::rmatrix, 46 | quickcheck::rdata.frame), 47 | curry.nrow, nrow = size)))() 48 | 49 | rdata.frame.simple = 50 | function( 51 | nrow = c(min = 1, max = quickcheck::default(data.frame.nrow %||% 5 * severity)), 52 | ncol = c(min = 1, max = quickcheck::default(data.frame.ncol %||% severity))) 53 | rdata.frame( 54 | generator = 55 | mixture( 56 | generators = 57 | list( 58 | quickcheck::rlogical, 59 | quickcheck::rinteger, 60 | quickcheck::rdouble, 61 | quickcheck::rcharacter)), 62 | nrow = nrow, 63 | ncol = ncol) 64 | 65 | rkeyval = 66 | function(k = rrmr.data(size = c(min = 1)), v = rrmr.data(size = c(min = 1))) 67 | keyval(k, v) 68 | 69 | rkeyvalsimple = function() keyval(runif(1), runif(1)) #we can do better than this 70 | 71 | ## generic sorting for normalized comparisons 72 | gorder = function(...) 
UseMethod("gorder") 73 | gorder.default = order 74 | gorder.factor = function(x) order(as.character(x)) 75 | gorder.data.frame = 76 | function(x) splat(gorder)(lapply(x, function(x) if(is.factor(x)) as.character(x) else if(is.list(x) || is.raw(x)) sapply(x, digest) else x)) 77 | gorder.matrix = function(x) gorder(as.data.frame(x)) 78 | gorder.raw = gorder.list = function(x) gorder(sapply(x, digest)) 79 | 80 | reorder = function(x, o) if(has.rows(x)) x[o, , drop = FALSE] else x[o] 81 | 82 | gsort = function(x) reorder(x, gorder(x)) 83 | 84 | gsort.keyval = 85 | function(kv) { 86 | k = keys(kv) 87 | v = values(kv) 88 | o = { 89 | if(is.null(k)) gorder(v) 90 | else 91 | gorder( 92 | data.frame( 93 | if(is.list(k) && !is.data.frame(k)) sapply(k, digest) else k, 94 | if(is.list(v) && !is.data.frame(v)) sapply(v, digest) else v))} 95 | keyval(reorder(k, o), reorder(v, o))} 96 | 97 | ## keyval compare 98 | kv.cmp = function(kv1, kv2) 99 | isTRUE(all.equal(gsort.keyval(kv1), gsort.keyval(kv2), tolerance=1e-4, check.attributes=FALSE)) 100 | 101 | } -------------------------------------------------------------------------------- /pkg/examples/airline.R: -------------------------------------------------------------------------------- 1 | library(rmr2) 2 | from.dfs( 3 | mapreduce( 4 | input = '../RHadoop.data/airline.1000', 5 | input.format = make.input.format("csv", sep = ","), 6 | map = function(., data) { 7 | # filter out non-numeric values (header and NA) 8 | filter = !is.na(data[,16]) 9 | data = data[filter,] 10 | # emit composite key (airline|year|month) and delay 11 | keyval( 12 | data[,c(9,1,2)], 13 | data[,16, drop = FALSE])}, 14 | reduce = function(k,delays) { 15 | keyval(k, mean(delays[,1]))})) -------------------------------------------------------------------------------- /pkg/examples/avro.R: -------------------------------------------------------------------------------- 1 | # known limitations: these formats work only with mapreduce, not with from.dfs or to.dfs, nor they work in on the local backend 2 | # as a workaround, use a simple conversion job 3 | # from.dfs(mapreduce(some.input, input.format = avroIF)) or mapreduce(to.dfs(some.data), output.format = avroOF) 4 | # avroOF uses a fixed schema "bytes" containing the JSON representation of the data. 5 | 6 | avro.jar = "/Users/antonio/Downloads/avro-mapred-1.7.4-hadoop1.jar" 7 | 8 | paste.fromJSON = 9 | function(...) 10 | fromJSON(paste("[", paste(..., sep = ","), "]")) 11 | 12 | mapply.fromJSON = 13 | function(...) 
14 | mapply(paste.fromJSON, ..., SIMPLIFY = FALSE) 15 | 16 | avro.input.format = 17 | function(con) { 18 | lines = readLines(con = con, n = 1000) 19 | if (length(lines) == 0) NULL 20 | else 21 | do.call( 22 | keyval, 23 | unname( 24 | do.call( 25 | mapply.fromJSON, 26 | strsplit( 27 | lines, 28 | "\t"))))} 29 | 30 | avroIF = 31 | make.input.format( 32 | format = avro.input.format, 33 | mode = "text", 34 | streaming.format = "org.apache.avro.mapred.AvroAsTextInputFormat", 35 | backend.parameters = 36 | list( 37 | hadoop = 38 | list( 39 | libjars = avro.jar))) 40 | 41 | 42 | avro.output.format = 43 | function(kv, con) 44 | writeLines( 45 | unlist( 46 | rmr2:::reduce.keyval( 47 | kv, 48 | function(k, v) 49 | paste( 50 | toJSON(k, .escapeEscapes = TRUE), 51 | toJSON(v, .escapeEscapes = TRUE), 52 | sep = "\t"))), 53 | con = con) 54 | 55 | avroOF = 56 | make.output.format( 57 | format = avro.output.format, 58 | mode = "text", 59 | streaming.format = "org.apache.avro.mapred.AvroTextOutputFormat", 60 | backend.parameters = 61 | list( 62 | hadoop = 63 | list( 64 | libjars = avro.jar))) 65 | 66 | 67 | avro.test = 68 | mapreduce( 69 | to.dfs(keyval(1:2, 1:10)), 70 | output.format = avroOF) 71 | 72 | from.dfs( 73 | mapreduce( 74 | avro.test, 75 | input.format = avroIF)) -------------------------------------------------------------------------------- /pkg/examples/cluster.mr.R: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
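# A hypothetical end-to-end sketch of the functions defined below (clara.mr wraps
# cluster.mr, subclara and merge.clara); assumes rmr2, functional and cluster are
# installed and a backend is configured, e.g. the local one:
#
#   library(rmr2)
#   library(functional)            # for Curry, used in clara.mr
#   rmr.options(backend = "local")
#   fake.points = to.dfs(as.data.frame(matrix(rnorm(2000), ncol = 2)))
#   model = clara.mr(fake.points, n.centers = 3)
#   model$medoids                  # the merged medoids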
14 | 15 | 16 | 17 | ## @knitr cluster-napply 18 | library(cluster) 19 | napply = function(ll, a.name) lapply(ll, function(l) l[[a.name]]) 20 | 21 | ## @knitr cluster-mr 22 | cluster.mr = 23 | function(data, subcluster, merge) 24 | mapreduce( 25 | data, 26 | map = 27 | function(., data.chunk) 28 | keyval(1, list(subcluster(data.chunk))), 29 | combine = TRUE, 30 | reduce = 31 | function(., clusterings) 32 | keyval(1, list(merge(clusterings)))) 33 | 34 | ## @knitr cluster-subclara 35 | subclara = 36 | function(data, n.centers) { 37 | clust = 38 | clara( 39 | data, 40 | n.centers, 41 | keep.data = FALSE) 42 | list( 43 | size = nrow(data), 44 | sample = data[clust$sample,], 45 | medoids = clust$medoids)} 46 | 47 | ## @knitr cluster-merge-clara 48 | merge.clara = 49 | function(clusterings, n.centers){ 50 | sizes = unlist(napply(clusterings, 'size')) 51 | total.size = sum(sizes) 52 | size.range = range(sizes) 53 | size.ratio = max(size.range)/min(size.range) 54 | resample = 55 | function(x) 56 | x$sample[ 57 | sample( 58 | 1:nrow(x$sample), 59 | round(nrow(x$sample) * size.ratio), 60 | replace = TRUE)] 61 | clust = 62 | subclara( 63 | do.call( 64 | rbind, 65 | lapply( 66 | clusterings, 67 | resample)), 68 | n.centers) 69 | clust$size = total.size 70 | clust} 71 | 72 | ## @knitr cluster-clara 73 | clara.mr = 74 | function(data, n.centers) 75 | values( 76 | from.dfs( 77 | cluster.mr( 78 | data, 79 | Curry(subclara, n.centers = n.centers), 80 | Curry(merge.clara, n.centers = n.centers))))[[1]] -------------------------------------------------------------------------------- /pkg/examples/collocations.R: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
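# A minimal, hypothetical way to spot-check an input format like ngram.format
# (defined below) on the local backend before running against the full data set;
# the file name here is made up for illustration:
#
#   rmr.options(backend = "local")
#   tiny.format =
#     make.input.format(
#       format = "csv",
#       sep = "\t",
#       col.names = c("ngram", "year", "count", "pages", "books"),
#       stringsAsFactors = FALSE)
#   from.dfs("/tmp/tiny-ngrams.tsv", format = tiny.format)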
14 | 15 | library(rmr2) 16 | 17 | ngram.format = 18 | make.input.format( 19 | format="csv", 20 | quote = NULL, 21 | sep = "\t", 22 | comment.char = "", 23 | col.names = c("ngram", "year", "count", "pages", "books"), 24 | stringsAsFactors = FALSE) 25 | 26 | ngram.parse = 27 | function(ngram.data) { 28 | ngram.split = 29 | suppressWarnings( 30 | do.call( 31 | rbind, 32 | strsplit( 33 | paste(ngram.data$ngram, " "), 34 | " ")) 35 | [,1:5]) 36 | filter = ngram.split[,ncol(ngram.split)] != "" 37 | cbind( 38 | ngram.data[,-1], 39 | ngram.split, 40 | stringsAsFactors = FALSE) 41 | [filter,]} 42 | 43 | map.fun = 44 | function(k, v) { 45 | data = ngram.parse(v) 46 | keyval( 47 | as.matrix(data[, c("year", "1", names(data)[ncol(data)])]), 48 | data$count)} 49 | 50 | reduce.fun = 51 | function(k,vv) { 52 | vv = split(vv, as.data.frame(k), drop = TRUE) 53 | keyval(names(vv), vsum(vv))} 54 | #keyval(names(vv), sapply(vv, sum))} 55 | #this alone changes the runtime from 49' to 1h 27' 56 | #on a 5 node cluster with 10 reducer slots 57 | 58 | system.time({ 59 | zz = 60 | mapreduce( 61 | "/user/ngrams/", 62 | #"../RHadoop.data/ngrams/10000000.csv", 63 | input.format = ngram.format, 64 | map = map.fun, 65 | reduce = reduce.fun, 66 | vectorized.reduce = TRUE, 67 | in.memory.combine = FALSE, 68 | combine = FALSE)}) -------------------------------------------------------------------------------- /pkg/examples/counts.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | ## @knitr counts 17 | count = 18 | function(data, ...) { 19 | map.count = 20 | function(.,data) { 21 | counts = apply(data,2,function(x) aggregate(x,list(x),length)) 22 | keyval(names(counts), counts)} 23 | reduce.count = 24 | function(colname, counts) { 25 | counts = do.call(rbind, counts) 26 | keyval( 27 | colname, 28 | list(aggregate(counts$x, list(as.character(counts$Group.1)), sum)))} 29 | from.dfs( 30 | mapreduce( 31 | data, 32 | map = map.count, 33 | reduce = reduce.count, 34 | combine = TRUE, 35 | ...))} 36 | ## @knitr end -------------------------------------------------------------------------------- /pkg/examples/hbase.R: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
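# A hypothetical quick run of the count() helper from the preceding counts.R
# example, using the local backend and a built-in data set; the result is a
# key-value object with one table of level counts per column:
#
#   rmr.options(backend = "local")
#   count(to.dfs(mtcars[, c("cyl", "gear")]))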
14 | 15 | ## @knitr hbase-blogposts 16 | from.dfs( 17 | mapreduce( 18 | input="blogposts", 19 | input.format = 20 | make.input.format( 21 | "hbase", 22 | family.columns = 23 | list( 24 | image= list("bodyimage"), 25 | post = list("author", "body")), 26 | key.deserialize = "raw", 27 | cell.deserialize = "raw", 28 | dense = TRUE, 29 | atomic = TRUE))) 30 | 31 | ## @knitr hbase-freebase.input.format 32 | freebase.input.format = 33 | make.input.format( 34 | "hbase", 35 | family.columns = 36 | list( 37 | name = "", 38 | freebase = "types"), 39 | key.deserialize = "raw", 40 | cell.deserialize = "raw", 41 | dense = FALSE, 42 | atomic = FALSE) 43 | 44 | ## @knitr hbase-freebase-mapreduce 45 | from.dfs( 46 | mapreduce( 47 | input = "freebase", 48 | input.format = freebase.input.format, 49 | map = function(k,v) keyval(k[1,], v[1,]))) 50 | ## @knitr end -------------------------------------------------------------------------------- /pkg/examples/large-kmeans-test.R: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | input.1000 = mapreduce (input = to.dfs(1:1000), 16 | map = function(k, v) keyval(rnorm(1), v), 17 | reduce = to.reduce(identity)) 18 | 19 | input.10e6 = mapreduce (input = input.1000, 20 | map = function(k, v) lapply(1:1000, function(i) keyval(rnorm(1), v)), 21 | reduce = to.reduce(identity)) 22 | 23 | kmeans.input.10e6 = mapreduce(input.1000, 24 | map = function(k, v) keyval(rnorm(1), cbind(sample(0:2, recsize, replace = TRUE) + 25 | rnorm(recsize, sd = .1), 26 | sample(0:3, recsize, replace = TRUE) + 27 | rnorm(recsize, sd = .1)))) 28 | 29 | kmeans.input.10e9 = mapreduce(input.10e6, 30 | map = function(k, v) keyval(rnorm(1), cbind(sample(0:2, recsize, replace = TRUE) + 31 | rnorm(recsize, sd = .1), 32 | sample(0:3, recsize, replace = TRUE) + 33 | rnorm(recsize, sd = .1)))) 34 | -------------------------------------------------------------------------------- /pkg/examples/mclust.mr.R: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
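# A hypothetical usage sketch of mclust.mr (defined below); assumes rmr2 and
# mclust are installed and a backend is configured, e.g. the local one:
#
#   library(rmr2)
#   rmr.options(backend = "local")
#   fake.points =
#     to.dfs(
#       as.data.frame(
#         matrix(rnorm(2000, mean = rep(c(0, 5), each = 1000)), ncol = 2)))
#   values(from.dfs(mclust.mr(fake.points)))[[1]][c("n", "modelName")]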
14 | 15 | library(mclust) 16 | 17 | fast.mclust = 18 | function(data) 19 | Mclust( 20 | data, 21 | initialization = 22 | list( 23 | subset = 24 | sample( 25 | 1:nrow(data), 26 | size = min(100, nrow(data))))) 27 | 28 | 29 | mclust.mr = 30 | function(data, merge.dataset.size = 10000) 31 | mapreduce( 32 | data, 33 | map = 34 | function(.,data) 35 | keyval(1, list(fast.mclust(data)[c('n', 'modelName', 'parameters')])), 36 | reduce = 37 | function(., models) { 38 | shrink = 39 | merge.dataset.size/ 40 | sum(sapply(models, function(m) m$n)) 41 | model = 42 | fast.mclust( 43 | do.call( 44 | rbind, 45 | lapply( 46 | models, 47 | function(m) 48 | sim( 49 | modelName = m$modelName, 50 | parameters = m$parameters, 51 | n = round(m$n/shrink))[,-1]))) 52 | keyval( 53 | 1, 54 | list( 55 | list( 56 | n = round(model$n*shrink), 57 | modelName = model$modelName, 58 | parameters = model$parameters)))}) -------------------------------------------------------------------------------- /pkg/examples/ngram.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Start cluster 16 | # $WHIRR_HOME/bin/whirr launch-cluster --config ~/Projects/Revolution/RHadoop/rmr/pkg/tools/whirr/hadoop-ec2-lzo.properties 2>&1 17 | # $WHIRR_HOME/bin/whirr run-script --script ~/Projects/Revolution/RHadoop/rmr/pkg/tools/whirr/rmr-1.3.sh --config ~/Projects/Revolution/RHadoop/rmr/pkg/tools/whirr/hadoop-ec2-lzo.properties 18 | # $WHIRR_HOME/bin/whirr run-script --script ~/Projects/Revolution/RHadoop/rmr/pkg/tools/whirr/lzo.sh --config ~/Projects/Revolution/RHadoop/rmr/pkg/tools/whirr/hadoop-ec2-lzo.properties 19 | 20 | 21 | ## @knitr fake-data 22 | fake.size = 2000000 23 | writeLines( 24 | apply( 25 | cbind( 26 | sample(sapply(1:20000, function(x) substr(digest(x),start=1,stop=3)), fake.size, replace = TRUE), 27 | sample(1800:1819, fake.size, replace = TRUE), 28 | sample (1:200, fake.size, replace = TRUE), 29 | sample (1:200, fake.size, replace = TRUE), 30 | sample (1:200, fake.size, replace = TRUE)), 31 | 1, 32 | function(x)paste(x, collapse = "\t")), 33 | file("/tmp/fake-ngram-data", "w")) 34 | 35 | source = "/tmp/fake-ngram-data" 36 | # rmr.options(backend = "local") 37 | 38 | #Timing for 12 + 1 node EC2 cluster m1.large instances 39 | ## @knitr distcp 40 | # hadoop distcp s3n://$AWS_ACCESS_KEY_ID:$AWS_SECRET_ACCESS_KEY@datasets.elasticmapreduce/ngrams/books/20090715/eng-all/1gram/ hdfs:///user/antonio/ 41 | ## @knitr scatter 42 | ## source = scatter("/user/antonio/1gram/data") 43 | # 33 mins 44 | ## @knitr ngram.format 45 | ngram.format = function(lines){ 46 | data = 47 | as.data.frame( 48 | do.call(rbind, strsplit(lines, "\t"))[,1:3], 49 | stringsAsFactors = FALSE) 50 | names(data) = c("ngram", "year", "count") 51 | data$year = as.integer(data$year) 52 | data$count = as.integer(data$count) 53 | data} 54 | 55 | ## @knitr filter.map 56 | filter.map = function(., lines) 
{ 57 | ngram.data = ngram.format(lines) 58 | ngram.data[ 59 | regexpr( 60 | "^[A-Za-z]+$", 61 | ngram.data$ngram) > -1 & 62 | ngram.data$year > 1800,]} 63 | ## @knitr end 64 | 65 | # use 66 | # input.format = "text" 67 | # on fake data 68 | 69 | ## @knitr filtered.data 70 | source = "/user/antonio/1gram/data" 71 | library(rmr2) 72 | filtered.data = 73 | mapreduce(input = source, 74 | map = filter.map) 75 | ## @knitr end 76 | #20 mins, 77 | ## @knitr sample-data 78 | from.dfs(rmr.sample(filtered.data, method="any", n = 50)) 79 | ## @knitr end 80 | #5 mins 81 | 82 | ## @knitr totals.map 83 | totals.map = 84 | function(., ngram.data) { 85 | total = tapply(as.numeric(ngram.data$count), ngram.data$year, sum, na.rm = TRUE) 86 | keyval(names(total), as.vector(total))} 87 | 88 | ## @knitr totals.reduce 89 | totals.reduce = 90 | function(year, count) 91 | keyval(year, sum(count, na.rm = TRUE)) 92 | 93 | ## @knitr year.totals 94 | year.totals.kv = 95 | from.dfs( 96 | mapreduce( 97 | input = filtered.data, 98 | map = totals.map, 99 | reduce = totals.reduce, 100 | combine = TRUE)) 101 | ## @knitr end 102 | #9 mins 103 | 104 | ## @knitr year.totals-finish 105 | year.totals = c() 106 | year.totals[keys(year.totals.kv)] = values(year.totals.kv) 107 | ## @knitr outlier.map 108 | library(bitops) 109 | outlier.map = 110 | function(., ngram.data) { 111 | k = ngram.data$year + cksum(ngram.data$ngram)%%100/100 112 | c.keyval( 113 | keyval(k, ngram.data), 114 | keyval(k + 1, ngram.data))} 115 | 116 | ## @knitr outlier.reduce 117 | library(robustbase) 118 | library(reshape2) 119 | outlier.reduce = 120 | function(., ngram.data) { 121 | years = range(ngram.data$year) 122 | if(years[1] == years[2]) 123 | NULL 124 | else { 125 | ngram.data = dcast(ngram.data, ngram ~ year, fill = 0) 126 | tryCatch({ 127 | filter = 128 | !adjOutlyingness( 129 | log( 130 | t( 131 | t(ngram.data[,2:3] + 1)/ 132 | as.vector( 133 | year.totals[as.character(years)] + 1))), 134 | alpha.cutoff = .95)$nonOut 135 | as.character(ngram.data[filter,'ngram'])}, 136 | error = function(e) NULL)}} 137 | ## @knitr end 138 | 139 | # watch out the next doesn't seem to work beyond 10^5 ngrams 140 | # problem is inefficient assignment, still investigating 141 | ## @knitr outlier.ngram 142 | outlier.ngram = 143 | unique( 144 | values( 145 | from.dfs( 146 | mapreduce( 147 | input = filtered.data, 148 | output = "/user/antonio/1gram/outlier-ngram", 149 | map = outlier.map, 150 | reduce = outlier.reduce)))) 151 | 152 | ## @knitr end 153 | # 8 hours 154 | 155 | ## @knitr plot.data 156 | plot.data = 157 | values( 158 | from.dfs( 159 | mapreduce( 160 | input = filtered.data, 161 | output = "/user/antonio/1gram/plot-data-ngram", 162 | map = 163 | function(., ngram.data) 164 | ngram.data[ 165 | is.element( 166 | as.character(ngram.data$ngram), 167 | outlier.ngram),]))) 168 | ## @knitr end 169 | # 5 mins 170 | 171 | ## @knitr plot.data.frame 172 | plot.data = 173 | melt( 174 | dcast( 175 | plot.data, ngram ~ year, fill = 0), 176 | variable.name="year", 177 | value.name = "count") 178 | plot.data$freq = 179 | (plot.data$count + 0.1)/ 180 | year.totals[as.character(plot.data$year)] 181 | plot.data = 182 | plot.data[order(plot.data$ngram, plot.data$year),] 183 | plot.data = 184 | cbind( 185 | plot.data[-nrow(plot.data),], 186 | plot.data[-1,]) 187 | plot.data = 188 | plot.data[ 189 | plot.data[,1] == plot.data[,5], 190 | c(1,2,4,8)] 191 | names(plot.data) = 192 | c("id","time","freq", "freq.prev") 193 | plot.data$average = 194 | 
sqrt(plot.data$freq*plot.data$freq.prev) 195 | plot.data$ratio = 196 | plot.data$freq/plot.data$freq.prev 197 | plot.data$time = as.integer(as.character(plot.data$time)) 198 | ## @knitr end 199 | 200 | ## save and reload, this is not necessary unless you take a break 201 | ##save(plot.data, file = "../RHadoop.data/ngram.plot.data") 202 | ##load("../RHadoop.data/ngram.plot.data") 203 | ## throw away some data points -- graphics can only use so many 204 | 205 | ## @knitr trim 206 | plot.data = plot.data[log(plot.data$average) > -10, ] 207 | summary(plot.data) 208 | ## @knitr end 209 | 210 | ## @knitr plot 211 | suppressPackageStartupMessages(library(googleVis)) 212 | motion.chart = 213 | gvisMotionChart( 214 | plot.data[,c("id","time","average","ratio")], 215 | options = list(height = 768, width = 1024)) 216 | plot(motion.chart) 217 | ## @knitr end 218 | print(motion.chart, "chart") -------------------------------------------------------------------------------- /pkg/man/bigdataobject.Rd: -------------------------------------------------------------------------------- 1 | \name{big.data.object} 2 | \alias{big.data.object} 3 | 4 | \title{ 5 | The big data object.} 6 | 7 | \description{ 8 | A stub representing data on disk that can be manipulated by other functions in rmr. "Stub" means that the data is not actually "there" or more concretely it is not held in memory in the current process. This is a technique used in different programming languages when remote resources need to be made available. In this case the rationale is that we need to be able to process large data sets whose size is not compatible with them being held in memory at once. Nonetheless it is convenient to be able to refer to the complete data set in the language, albeit the set of operations we can perform on it is limited. Big data objects are returned by \code{\link{to.dfs}}, \code{\link{mapreduce}}, \code{\link{scatter}}, \code{\link{gather}}, \code{\link{equijoin}} and \code{\link{rmr.sample}}, and accepted as input by all of the above with the exception of \code{\link{to.dfs}} and the inclusion of \code{\link{from.dfs}}. Big data objects are NOT persistent, meaning that they are not meant to be saved beyond the limits of a session. They use temporary space and the space is reclaimed as soon as possible when the data can not be referred to any more, or at the end of a session. For data that needs to be accessible outside the current R session, you need to use paths to the file or directory where the data is or should be written to. Valid paths can be used interchangeably wherever big data objects are accepted} 9 | 10 | 11 | \examples{ 12 | some.big.data = to.dfs(1:10) 13 | path = "/tmp/some/big/data" 14 | if(dfs.exists(path)) 15 | dfs.rmr(path) 16 | to.dfs(1:10, path)} 17 | -------------------------------------------------------------------------------- /pkg/man/dfs.empty.Rd: -------------------------------------------------------------------------------- 1 | \name{dfs.empty} 2 | \alias{dfs.empty} 3 | \alias{dfs.exists} 4 | \alias{dfs.size} 5 | \alias{dfs.mv} 6 | \alias{dfs.rmr} 7 | \alias{dfs.ls} 8 | \title{Backend-independent file manipulation} 9 | 10 | \description{Check if an item is empty or return its size. Move an item or remove(recursively). 
Here item is a valid path or \code{\link{big.data.object}}} 11 | 12 | \usage{ 13 | dfs.empty(fname) 14 | dfs.exists(fname) 15 | dfs.size(fname) 16 | dfs.mv(from, to) 17 | dfs.rmr(fname) 18 | dfs.ls(fname) 19 | } 20 | 21 | \arguments{ 22 | \item{fname}{A valid path or \code{\link{big.data.object}}} 23 | \item{from, to}{A valid path} 24 | } 25 | 26 | \value{For \code{dfs.size} a number of bytes; for \code{dfs.empty} and \code{dfs.exists}, a logical; for \code{dfs.ls} a data.frame} 27 | 28 | \details{ 29 | The size of a directory, for the sake of this commands, is the size of the files contained therein with the exception of hidden files starting with "." and "_". This is not well documented in Hadoop but there is a private call that implements this pattern. } 30 | 31 | \examples{ 32 | dfs.empty(mapreduce(to.dfs(1:10))) 33 | dfs.size(mapreduce(to.dfs(1:10))) 34 | } -------------------------------------------------------------------------------- /pkg/man/equijoin.Rd: -------------------------------------------------------------------------------- 1 | \name{equijoin} 2 | 3 | \alias{equijoin} 4 | 5 | \title{ 6 | Equijoins using map reduce 7 | } 8 | 9 | \description{ 10 | A generalized form of equijoin, hybrid between the SQL brethren and mapreduce 11 | } 12 | 13 | \usage{ 14 | equijoin( 15 | left.input = NULL, 16 | right.input = NULL, 17 | input = NULL, 18 | output = NULL, 19 | input.format = "native", 20 | output.format = "native", 21 | outer = c("", "left", "right", "full"), 22 | map.left = to.map(identity), 23 | map.right = to.map(identity), 24 | reduce = reduce.default)} 25 | \arguments{\item{left.input}{The left side input to the join.} 26 | \item{right.input}{The right side input to the join.} 27 | \item{input}{The only input in case of a self join. Mutually exclusive with the previous two.} 28 | \item{output}{Where to write the output.} 29 | \item{input.format}{Input format specification, see \code{\link{make.input.format}}} 30 | \item{output.format}{Output format specification, see \code{\link{make.output.format}}} 31 | \item{outer}{Whether to perform an outer join, one of the usual three types, left, right or full.} 32 | \item{map.left}{Function to apply to each record from the left input, follows same conventions as any map function. The returned keys 33 | will become join keys.} 34 | \item{map.right}{Function to apply to each record from the right input, follows same conventions as any map function. The returned keys 35 | will become join keys.} 36 | \item{reduce}{Function to be applied, key by key, on the values associated with that key. Those values are in the arguments \code{vl} (left side) and \code{vr} (right side) and their type is determined by the type returned by the map functions, separately for the left side and the right side. The allowable return values are like those of any reduce function, see \code{\link{mapreduce}}. The default performs a \code{merge} with \code{by = NULL} which performs a cartesian product, unless lists are involved in which case the arguments are simply returned in a list.}} 37 | 38 | \value{If output is specified, returns output itself. Otherwise, a \code{\link{big.data.object}}} 39 | 40 | 41 | \section{Warning}{Doesn't work with multiple inputs like \code{mapreduce}} 42 | 43 | 44 | \examples{ 45 | ##---- Should be DIRECTLY executable !! ---- 46 | ##-- ==> Define data, use random, 47 | ##-- or do help(data=index) for the standard data sets. 
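##-- A hypothetical additional example: the self-join form, where a single
##-- `input' replaces left.input/right.input (assumes a working backend,
##-- e.g. rmr.options(backend = "local"))
from.dfs(equijoin(input = to.dfs(keyval(1:10, 1:10^2))))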
48 | from.dfs(equijoin(left.input = to.dfs(keyval(1:10, 1:10^2)), right.input = to.dfs(keyval(1:10, 1:10^3)))) 49 | } 50 | 51 | 52 | -------------------------------------------------------------------------------- /pkg/man/fromdfstodfs.Rd: -------------------------------------------------------------------------------- 1 | \name{from.dfs} 2 | \alias{from.dfs} 3 | \alias{to.dfs} 4 | 5 | \title{Read or write R objects from or to the file system} 6 | \description{Functions that read or write R objects from or to the file system} 7 | 8 | \usage{ 9 | to.dfs(kv, output = dfs.tempfile(), format = "native") 10 | from.dfs(input, format = "native") 11 | } 12 | 13 | \arguments{ 14 | \item{kv}{A key-value pair; also, a vector, list, matrix or a data frame (in this case the keys will be set to NULL)} 15 | \item{input}{A valid path or a \code{\link{big.data.object}}} 16 | \item{output}{A valid path} 17 | \item{format}{For \code{from.dfs} either a string naming a format, the same as those allowed by \code{make.input.format}, or the value returned by \code{\link{make.input.format}}. The same is true for \code{to.dfs}, but refer to \code{\link{make.output.format}} instead.}} 18 | 19 | \details{ These functions allow to move data from RAM to the file system and back. Keep in mind that the capacity of these two storage media is 20 | different by two or more orders of magnitude, so the conversion will make sense only in specific situations. These 21 | functions do not perform any size control, so the responsibility is on the user. For the local backend, file system means the local file system. 22 | For the Hadooop backend it means HDFS} 23 | 24 | \value{\code{from.dfs} returns the object whose representation is contained in \code{input}. \code{to.dfs} returns the value of \code{output} or, when this is missing, a \code{\link{big.data.object}} } 25 | 26 | \examples{ 27 | from.dfs(to.dfs(1:10)) 28 | from.dfs(to.dfs(keyval(1, 1:10))) 29 | } 30 | -------------------------------------------------------------------------------- /pkg/man/hadoop-setting.Rd: -------------------------------------------------------------------------------- 1 | \name{hadoop.settings} 2 | \alias{hadoop.settings} 3 | 4 | \title{Important Hadoop settings in relation to rmr2} 5 | \description{There are a few hadoop settings that one should be aware of and know how to modify to allow the successful execution of mapreduce programs} 6 | \details{Since the transition to YARN and MR2, each Mapreduce job needs to secure a container in order to execute. A container is a resource allocation unit and the resource we are concerned with here is memory. At default settings, at least in a non-scientific sampling of deployments, the memory available to a container is used almost entirely by the map and reduce java processes. In rmr2 this is not compatible with the java process successfully executing an instance of the R interpreter, which is necessary for rmr2. Therefore, by default, rmr2 modifies per-job settings to set the java process to use 400MB of memory and leave the rest for use by R. This is assuming that the default container size is larger than 400MB and that R can work successfully in the remaining space. Under certain conditions, it is also possible that 400MB won't be enough for the java process. To solve these problems, the user has access to a number of properties that can be set using configuration files on an per-job basis directly in rmr2 (see \code{\link{rmr.options}}, argument \code{backend.parameters}). 
Four important properties are \code{mapreduce.map.java.opts}, \code{mapreduce.reduce.java.opts}, \code{mapreduce.map.memory.mb} and \code{mapreduce.reduce.memory.mb} 7 | The first two are set by \code{rmr2} to \code{-Xmx400M}, which sets the memory allocated to the map or reduce java task. The other two properties control the size of the container for, resp., the map and reduce phase and rmr2 leaves them at default values, unless the user decides otherwise. There are many other properties the control the execution environment of mapreduce jobs but they are out of scope for this help entry (you are referred to the documentation accompanying your Hadoop distribution). These four, in the experience of the RHadoop team are the ones one needs to acton upon most often.} 8 | 9 | -------------------------------------------------------------------------------- /pkg/man/keyval.Rd: -------------------------------------------------------------------------------- 1 | \name{keyval} 2 | \alias{keyval} 3 | \alias{keys} 4 | \alias{values} 5 | \alias{c.keyval} 6 | 7 | \title{Create, project or concatenate key-value pairs} 8 | \description{Create a key-value object (a collecton of key-value pairs) from two R objects, extract keys or values from a key value object or concatenate multiple key value objects} 9 | 10 | \usage{ 11 | keys(kv) 12 | values(kv) 13 | keyval(key, val) 14 | c.keyval(...) 15 | } 16 | 17 | \arguments{ 18 | \item{kv}{key-value pairs} 19 | \item{key}{the desired key or keys} 20 | \item{val}{the desired value or values} 21 | \item{...}{key-value pairs to concatenate, or a single list thereof}} 22 | 23 | \details{The \code{keyval} function is used to create return values for the map and reduce functions, themselves parameters to 24 | \code{\link{mapreduce}}. Key-value objects are also appropriate arguments for the \code{\link{to.dfs}} function and are returned by 25 | \code{\link{from.dfs}}. \code{keys} and \code{values} extract keys and values resp. from a key value object. \code{c.keyval} concatenates two or more key-value objects by concatenating the keys and values separately after recycling the arguments. When invoked with a single argument, it considers it a list of key value objects to concatenate. A key value object should always be considered vectorized, meaning that it defines a collection of key-value pairs. For the purpose of forming key-value pairs, the length of an object is considered its number of rows whene defined, that is for matrices and data frames, or its R \code{\link{length}} otherwise. Consistently with this definition, the n-th element of a key or value argument is its n-th row or a subrange including only the n-th element otherwise. Data types are preserved, meaning that, for instance, if the \code{key} is a matrix its n-th element is a matrix with only one row, the n-th row of the larger matrix (the behavior of the \code{[]} operator with \code{drop = FALSE}). The same is true for data frames, list and atomic vectors. When \code{key} and \code{val} have different lengths according to this definition, recycling is applied. The pairing between keys and values is positional, meaning that the n-th element of the key argument is associated with the n-th element of the val argument in a single key-value pair. Concatenation happens with \code{rbind} or variants thereof whenever keys or values have rows, \code{c} otherwise. Mixing and matching keys of different type, e.g. 
a matrix with a vector, is not supported, and the same is true for values, but key and value in the same keyval object do not need to be of the same type. When porting programs from rmr < 2, a list of non-vectorized key-value pairs can be converted with \code{c.keyval(keyval(list(k1), list(v1)), keyval(list(k2), list(v2)), ...)}. In many cases wrapping the keys and values in a \code{list} call is not necessary, but it is in the general case.} 26 | 27 | \examples{ 28 | #single key-val 29 | keyval(1,2) 30 | keys(keyval(1,2)) 31 | values(keyval(1,2)) 32 | #10 kv pairs of the form (i,i) 33 | keyval(1:10, 1:10) 34 | #2 kv pairs (1, 2i-1) and (2, 2i) for i in 1:5 35 | keyval(1:2, 1:10) 36 | # mtcars is a data frame, each row is a value with key set to the value of column cyl 37 | keyval(mtcars$cyl, mtcars) 38 | # concatenate two keyval objects 39 | c.keyval(keyval(1,1:2), keyval(1,1:3)) 40 | } 41 | -------------------------------------------------------------------------------- /pkg/man/mapreduce.Rd: -------------------------------------------------------------------------------- 1 | \name{mapreduce} 2 | \alias{mapreduce} 3 | 4 | \title{MapReduce using Hadoop Streaming} 5 | \description{Defines and executes a map reduce job. 6 | } 7 | 8 | \usage{ mapreduce( 9 | input, 10 | output = NULL, 11 | map = to.map(identity), 12 | reduce = NULL, 13 | vectorized.reduce = FALSE, 14 | combine = NULL, 15 | in.memory.combine = FALSE, 16 | input.format = "native", 17 | output.format = "native", 18 | backend.parameters = list(), 19 | verbose = TRUE) } 20 | 21 | \arguments{ 22 | \item{input}{Paths to the input folder(s) (on HDFS) or a vector thereof, 23 | or the return value of another \code{mapreduce} or a \code{\link{to.dfs}} call} 24 | \item{output}{A path to the destination folder (on HDFS); if missing, a \code{\link{big.data.object}} is returned, see "Value" below} 25 | \item{map}{An optional R function of two arguments, returning either NULL or the return value of \code{\link{keyval}}, that specifies the map operation to execute as part of a mapreduce job. The two arguments represent multiple key-value pairs according to the definition of the mapreduce model. They can be any of the following: list, vector, matrix, data frame or NULL (the last one only allowed for keys). Keys are matched to the corresponding values by position, according to the second dimension if it is defined (that is rows in matrices and data frames, position otherwise), analogous to the behavior of \code{cbind}, see \code{\link{keyval}} for details.} 26 | \item{reduce}{An optional R function of two arguments, a key and a data structure representing all the values associated with that key (the same type as returned by the map call, merged with \code{rbind} for matrices and data frames and \code{c} otherwise), returning either NULL or the return value of \code{\link{keyval}}, that specifies the reduce operation to execute as part of a mapreduce job. The default is no reduce phase, that is the output of the map phase is the output of the mapreduce job; see the \code{vectorized.reduce} argument for an alternate interface} 27 | \item{vectorized.reduce}{The argument to the reduce should be construed as a collection of keys and values associated with them by position (by row when 2-dimensional). Identical keys are consecutive and, once a key is present, all the records associated with that key will be passed to the same reduce call (complete group guarantee).
This form of reduce has been introduced mostly for efficiency reasons when processing small reduce groups, because the records are small and few of them are associated with the same key. This option affects the combiner too.} 28 | \item{combine}{A function with the same signature and possible return values as the reduce function, or TRUE, which means use the reduce function as combiner. NULL means no combiner is used.} 29 | \item{in.memory.combine}{Apply the combiner just after calling the map function, before returning the results to hadoop. This is useful to reduce the amount of I/O and (de)serialization work when combining on small sets of records has any effect (you may want to tune the input format to read more data for each map call together with this approach, see arguments \code{read.size} or \code{nrow} for a variety of formats)} 30 | \item{input.format}{Input format specification, see \code{\link{make.input.format}}} 31 | \item{output.format}{Output format specification, see \code{\link{make.output.format}}} 32 | \item{backend.parameters}{This option is for advanced users only and may be removed in the future. Specify additional, backend-specific 33 | options, as in \code{backend.parameters = list(hadoop = list(D = "mapred.reduce.tasks=1"), local = list())}. It is recommended not to use this argument to change the semantics of mapreduce (output should be independent of this argument). Each backend can only see the nested list named after the backend itself. The interpretation is the following: for the hadoop backend, generate an additional hadoop streaming command line argument for each element of the list, "-name value". If the value is TRUE generate "-name" only, if it is FALSE skip. One possible use is to specify the number of mappers and reducers on a per-job basis. It is not guaranteed that the generated streaming command will be a legal command. In particular, remember to put any generic options before any specific ones, as per hadoop streaming manual. For the local backend, the list is currently ignored.} 34 | \item{verbose}{Run hadoop in verbose mode. When \code{FALSE} job and, on YARN, application ids are returned as attributes. No effect on the local backend}} 35 | 36 | \value{The value of \code{output}, or, when missing, a \code{\link{big.data.object}}} 37 | 38 | \details{Defines and executes a mapreduce job. Jobs can be chained together by simply providing the return value of one as input to the 39 | other. The map and reduce functions will run in an environment that is a close approximation of the environment of this 40 | call, even if the actual execution happens in a different interpreter on a different machine. Changes to the outer 41 | environments performed inside the map and reduce functions with the \code{<<-} operator will only affect a per-process copy of the 42 | environment, not the original one, in a departure from established but seldom used R semantics. This is unlikely to change in the future 43 | because of the challenges inherent in adopting reference semantics in a parallel environment. The map function should not read from standard input and write to standard output. Logging and debugging messages should be written to standard error, and will be redirected to the appropriate logs or to console by the backend. If necessary, library functions that can not be prevented from writing into standard output can be surrounded by a pair of \code{sink} calls as in \code{sink(stderr()); library.function(); sink(NULL)}. 
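As a minimal illustrative sketch of this discipline (the chatty \code{noisy.summary} function here is hypothetical, standing in for any library call that insists on writing to standard output):
\preformatted{
mapreduce(
  input = to.dfs(1:10),
  map =
    function(k, v) {
      sink(stderr())      # keep stray console output away from the job's data stream
      noisy.summary(v)    # hypothetical function that prints to stdout
      sink(NULL)          # restore stdout for rmr2's own serialization
      keyval(v, v^2)})
}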
See also the Tutorial 44 | \url{https://github.com/RevolutionAnalytics/RHadoop/wiki/Tutorial}} 45 | 46 | \seealso{\code{\link{to.map}} and \code{\link{to.reduce}} can be used to convert other functions into suitable arguments for the map and 47 | reduce arguments; see the tests directory in the package for more examples} 48 | 49 | -------------------------------------------------------------------------------- /pkg/man/rmr-package.Rd: -------------------------------------------------------------------------------- 1 | \name{rmr} 2 | \alias{rmr} 3 | \docType{package} 4 | \title{A package to perform Map Reduce computations in R} 5 | \description{Running on top of Hadoop, this package allows to define and run mapreduce jobs, including specifying the mapper and the reducer as R functions, and to move data between R and Hadoop in a mostly transparent way. The aim is to make writing map reduce jobs very similar to and just as easy as writing a lapply and a tapply. Additional features provide easy job composition, transparent intermediate result management, support for different data formats and more. 6 | } 7 | -------------------------------------------------------------------------------- /pkg/man/rmr.options.Rd: -------------------------------------------------------------------------------- 1 | \name{rmr.options} 2 | \alias{rmr.options} 3 | \title{Function to set and get package options} 4 | \description{Set and get package options} 5 | \usage{ 6 | rmr.options( 7 | backend = c("hadoop", "local"), 8 | profile.nodes = c("off", "calls", "memory", "both"), 9 | hdfs.tempdir = "/tmp", 10 | exclude.objects = NULL, 11 | backend.parameters = list()) 12 | } 13 | \arguments{ 14 | \item{...}{Names of options to get values of, as length one character vectors} 15 | \item{backend}{One of "hadoop" or "local", the latter being implemented entirely in the current R interpreter, sequentially, for learning and debugging.} 16 | \item{profile.nodes}{Collect profiling and memory information when running additional R interpreters (besides the current one) on the cluster. No effect on the local backend, use Rprof instead. For backward compatibility, \code{"calls"} is equivalent to \code{TRUE} and \code{"off"} to \code{FALSE}} 17 | \item{hdfs.tempdir}{The directory to use for temporary files, including \code{\link{mapreduce}} intermediate results files, on the distributed file system (not used when running on the local backend).} 18 | \item{exclude.objects}{Objects in the Global environment that are not needed by the map or reduce functions, as character vector} 19 | \item{backend.parameters}{Parameters to pass directly to the backend. See equally named argument for the function \code{\link{mapreduce}}. Use this setting for backend parameters that need to be different from default but can be the same from job to job} 20 | } 21 | \details{ 22 | While the main goal for rmr2 is to provide access to hadoop mapreduce, the package has a notion of a backend that can be swapped while preserving most features. One backend is of course hadoop itself, the other is called "local" and is implemented within the current interpreter and using the local file system. rmr2 programs run on the local backend are ordinary (non-distributed, single-threaded) programs which is particularly useful for learning and debugging (debug, recover and trace work). 
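For example, a common workflow (a sketch only, using a deliberately trivial job) is to develop and debug against the local backend and then switch the very same code over to Hadoop:
\preformatted{
rmr.options(backend = "local")    # runs entirely in this interpreter; debug() and trace() work
from.dfs(mapreduce(to.dfs(1:10), map = function(k, v) keyval(v, v^2)))
rmr.options(backend = "hadoop")   # identical code now runs on the cluster
}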
Profiling data is collected in the following files: \code{file.path(rmr.options("dfs.tempdir"), "Rprof", , )} on each node (the details of how job id and attempt id are obtained depend upon the Hadoop distribution). The path is printed to stderr for your convenience and you will find it in the logs, specifically stderr, for each attempt. Then you need to ssh to the machine where that attempt ran to examine or retrieve it. \code{keyval.length} is used as a hint, particularly as a lower bound hint for how many records are actually processed by each map call. 23 | } 24 | \value{A named list with the options and their values, or just a value if only one is requested. NULL when only setting options.} 25 | 26 | \examples{ 27 | old.backend = rmr.options("backend") 28 | rmr.options(backend = "hadoop") 29 | rmr.options(backend = old.backend) 30 | \dontrun{ 31 | rmr.options( 32 | hdfs.tempdir = 33 | file.path( 34 | "/user", 35 | system("whoami", TRUE), 36 | "tmp-rmr2", 37 | basename(tempdir()))) 38 | }} 39 | -------------------------------------------------------------------------------- /pkg/man/rmr.sample.Rd: -------------------------------------------------------------------------------- 1 | \name{rmr.sample} 2 | \alias{rmr.sample} 3 | 4 | \title{Sample large data sets} 5 | 6 | \description{Sample large data sets} 7 | 8 | \usage{rmr.sample(input, output = NULL, method = c("any", "Bernoulli"), ...)} 9 | 10 | \arguments{ 11 | \item{input}{The data set to be sampled, as a file path or \code{\link{mapreduce}} return value} 12 | \item{output}{Where to store the result. See \code{\link{mapreduce}}, output argument, for details} 13 | \item{method}{One of "any" or "Bernoulli". "any" returns some of the records, optimized for speed, but with no statistical guarantees. "Bernoulli" implements independent sampling according to the Bernoulli distribution} 14 | \item{\dots}{Additional arguments to fully specify the sample; they depend on the method selected. If it is "any", then the size of the desired sample should be provided as the argument \code{n}. If it is "Bernoulli", the argument \code{p} specifies the probability of picking each record}} 15 | 16 | \value{ 17 | The sampled data. See \code{\link{mapreduce}} for details.} 18 | 19 | -------------------------------------------------------------------------------- /pkg/man/rmr.str.Rd: -------------------------------------------------------------------------------- 1 | \name{rmr.str} 2 | \alias{rmr.str} 3 | 4 | \title{ 5 | Print a variable's content} 6 | \description{ 7 | One way to debug mappers and reducers on the "hadoop" backend is to print diagnostic messages. This function helps you do so safely by printing a summary of the stack, the name of the variable and its value onto standard error. This means that if Hadoop is running in standalone mode the message will appear in the console, otherwise in the logs. 8 | } 9 | \usage{ 10 | rmr.str(x, ...) 11 | } 12 | \arguments{ 13 | \item{x}{The variable to print} 14 | \item{...}{Additional arguments to \code{str} (called by \code{rmr.str})} 15 | } 16 | 17 | \value{x} 18 | 19 | \examples{ 20 | mapreduce(to.dfs(1), map = function(k, v) rmr.str(v)) 21 | } 22 | -------------------------------------------------------------------------------- /pkg/man/scatter.Rd: -------------------------------------------------------------------------------- 1 | \name{scatter} 2 | \alias{scatter} 3 | \alias{gather} 4 | %- Also NEED an '\alias' for EACH other topic documented here.
5 | \title{ 6 | Functions to split a file over several parts or to merge multiple parts into one} 7 | \description{ 8 | \code{scatter} takes a file as input and pushes it through a mapreduce job that writes it out over a number of parts (how many is system dependent, specifically it depends on the number of reducers). This helps with parallelization of the next map phase. \code{gather} does the opposite.} 9 | \usage{ 10 | scatter(input, output = NULL, ...) 11 | gather(input, output = NULL, ...) 12 | } 13 | \arguments{ 14 | \item{input}{ 15 | The input file} 16 | \item{output}{ 17 | Output, defaults to the same as \code{\link{mapreduce}} output} 18 | \item{\dots}{Other options passed directly to mapreduce} 19 | } 20 | 21 | \value{ 22 | Same as for \code{\link{mapreduce}}. 23 | } 24 | 25 | \section{Known Limitations}{Scatter discards keys. This is a limitation that should be addressed in the future.} -------------------------------------------------------------------------------- /pkg/man/status.Rd: -------------------------------------------------------------------------------- 1 | \name{status} 2 | \alias{status} 3 | \alias{increment.counter} 4 | \title{ 5 | Set the status and define and increment counters for a Hadoop job 6 | } 7 | \description{ 8 | These are Hadoop features used to monitor and debug jobs. They should be used with some caution, as they may not scale well. 9 | } 10 | \usage{ 11 | status(value) 12 | increment.counter(group, counter, increment = 1) 13 | } 14 | \arguments{ 15 | \item{value}{The new value for the status of the job} 16 | \item{group}{The group for the counter} 17 | \item{counter}{The name for the counter} 18 | \item{increment}{By how much to increment the counter} 19 | } 20 | \details{ 21 | \code{status} sets the status for the current job. \code{increment.counter} increments the counter named \code{counter} in group \code{group} by \code{increment}. If the counter doesn't exist yet, it is initialized to 0. Both calls work only within the map or reduce functions; under the local backend they just write some messages to stderr. Unfortunately there is no API to query the value of either status or counters at this time, but you can examine them via the jobtracker web interface. 22 | } 23 | \value{ 24 | NULL for both. 25 | } 26 | 27 | \examples{ 28 | mapreduce(to.dfs(1:1000), map = function(k,v){status("mapping"); increment.counter("Calls", "Map", 1)}) 29 | } 30 | -------------------------------------------------------------------------------- /pkg/man/tomaptoreduce.Rd: -------------------------------------------------------------------------------- 1 | \name{to.map} 2 | \alias{to.map} 3 | \alias{to.reduce} 4 | \title{Create map and reduce functions from other functions} 5 | 6 | \description{These utility functions are meant to avoid the little boilerplate code necessary to convert ordinary functions into map and 7 | reduce functions.} 8 | 9 | \usage{ 10 | to.map(fun1, fun2 = identity) 11 | to.reduce(fun1, fun2 = identity) 12 | } 13 | 14 | \arguments{ 15 | \item{fun1}{A function to apply to the key, or to the key-value pair if the second argument is missing} 16 | \item{fun2}{A function to apply to the value} 17 | } 18 | 19 | \details{Sometimes there are functions that we could use almost directly as map or reduce functions but for a bit of boilerplate code, and 20 | we hate boilerplate code. That's where the functions documented herein can help. They take one or two functions of a single argument and 21 | return a valid map or reduce function.
In the case of \code{to.map} when two functions are specified they are applied independently to the 22 | key and the value and the return values are returned as a key-value pair; when only one is, it is applied to the key-value pair. For 23 | \code{to.reduce} the behavior is the same. } 24 | 25 | \examples{ 26 | ##The identity map: 27 | to.map(identity) 28 | ## equivalent to function(k, v) keyval(k, v) 29 | ##Replace key with mod 10 of the key and pass the value along: 30 | to.map(function(x) x \%\% 10, identity ) 31 | ##Sum up all the values for the same key: 32 | to.reduce(identity, function(vv) sum(vv)) 33 | } 34 | -------------------------------------------------------------------------------- /pkg/man/vsum.Rd: -------------------------------------------------------------------------------- 1 | \name{vsum} 2 | \alias{vsum} 3 | \title{ 4 | Fast small sums 5 | } 6 | \description{ 7 | Returns the sum of a list of numeric vectors} 8 | \usage{ 9 | vsum(x) 10 | } 11 | %- maybe also 'usage' for other objects documented here. 12 | \arguments{ 13 | \item{x}{ 14 | } 15 | } 16 | \details{ 17 | Equivalent to \code{sapply(x, sum)}, it's about 30X faster in some use cases (many small sums). It's often useful in reducers in the vectorized form. 18 | } 19 | \value{ 20 | A numeric vector with the sum of each element of the list provided as argument} 21 | 22 | \note{ 23 | See collocations.R in the examples directory. 24 | } 25 | 26 | \seealso{ 27 | \code{\link{mapreduce}} 28 | } 29 | -------------------------------------------------------------------------------- /pkg/src/Makevars: -------------------------------------------------------------------------------- 1 | BINDIR = inst/hbase-io 2 | .PHONY: all hbase-io 3 | 4 | PKG_CXXFLAGS=`${R_HOME}/bin/Rscript -e "Rcpp:::CxxFlags()"` 5 | PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"` 6 | 7 | all: $(SHLIB) hbase-io 8 | 9 | hbase-io: 10 | ((which hbase && (mkdir -p ../inst; cd hbase-io; sh build_linux.sh; cp build/dist/* ../../inst)) || echo "can't build hbase IO classes, skipping" >&2) 11 | 12 | clean: 13 | echo "not implemented yet" 14 | -------------------------------------------------------------------------------- /pkg/src/Makevars.win: -------------------------------------------------------------------------------- 1 | BINDIR = inst/bin$(R_ARCH) 2 | .PHONY: all bin 3 | 4 | PKG_CXXFLAGS=`${R_HOME}/bin/Rscript -e "Rcpp:::CxxFlags()"` 5 | PKG_LIBS = `$(R_HOME)/bin/Rscript -e "Rcpp:::LdFlags()"` 6 | 7 | all: $(SHLIB) bin 8 | 9 | bin: 10 | # binaries 11 | make --no-print-directory -C catwin -f Makefile\ 12 | CC="$(CC)" CFLAGS="$(CFLAGS)"\ 13 | CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)"\ 14 | BINDIR="$(BINDIR)"\ 15 | install-bin 16 | 17 | clean: 18 | ( cd catwin; make clean ) 19 | rm -rf ../$(BINDIR) 20 | -------------------------------------------------------------------------------- /pkg/src/catwin/Makefile: -------------------------------------------------------------------------------- 1 | # see Makeconf for compiler settings 2 | TARGET = catwin 3 | 4 | default: $(TARGET) 5 | 6 | clean: 7 | rm -rf *~ *.o $(TARGET) 8 | 9 | catwin: catwin.c $(OBJS) $(HEADERS) 10 | $(CXX) $(CXXFLAGS) $(LDFLAGS) -o catwin catwin.c $(OBJS) $(LIBS) 11 | 12 | install-bin: $(TARGET) 13 | mkdir -p ../../$(BINDIR) 14 | cp $(TARGET) ../../$(BINDIR) 15 | -------------------------------------------------------------------------------- /pkg/src/catwin/catwin.c: -------------------------------------------------------------------------------- 1 | /* Copyright 2011 Revolution Analytics 
2 | * Copyright (c) 1989, 1993 3 | * The Regents of the University of California. All rights reserved. 4 | * 5 | * This code is derived from software contributed to Berkeley by 6 | * Kevin Fall. 7 | * 8 | * Redistribution and use in source and binary forms, with or without 9 | * modification, are permitted provided that the following conditions 10 | * are met: 11 | * 1. Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer. 13 | * 2. Redistributions in binary form must reproduce the above copyright 14 | * notice, this list of conditions and the following disclaimer in the 15 | * documentation and/or other materials provided with the distribution. 16 | * 3. Neither the name of the University nor the names of its contributors 17 | * may be used to endorse or promote products derived from this software 18 | * without specific prior written permission. 19 | * 20 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 | * SUCH DAMAGE. 31 | */ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #define CAT_BUFSIZ 4096 40 | 41 | int main(int argc, char* argv[]) 42 | { 43 | int wfd; 44 | int rfd; 45 | ssize_t nr, nw, off; 46 | static char *buf = NULL; 47 | static char fb_buf[CAT_BUFSIZ]; 48 | static size_t bsize; 49 | 50 | rfd = fileno(stdin); 51 | wfd = fileno(stdout); 52 | 53 | setmode(rfd, O_BINARY); 54 | setmode(wfd, O_BINARY); 55 | 56 | buf = fb_buf; 57 | bsize = CAT_BUFSIZ; 58 | 59 | while ((nr = read(rfd, buf, bsize)) > 0) 60 | for (off = 0; nr; nr -= nw, off += nw) 61 | nw = write(wfd, buf + off, (size_t)nr); 62 | 63 | fclose(stdout); 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /pkg/src/extras.cpp: -------------------------------------------------------------------------------- 1 | //Copyright 2011 Revolution Analytics 2 | // 3 | //Licensed under the Apache License, Version 2.0 (the "License"); 4 | //you may not use this file except in compliance with the License. 5 | //You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | //Unless required by applicable law or agreed to in writing, software 10 | //distributed under the License is distributed on an "AS IS" BASIS, 11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | //See the License for the specific language governing permissions and 13 | //limitations under the License. 
14 | 15 | #include "extras.h" 16 | #include 17 | #include 18 | 19 | SEXP vsum(SEXP xx) { 20 | Rcpp::List _xx (xx); 21 | std::vector results(_xx.size()); 22 | for(unsigned int i = 0; i < _xx.size(); i ++) { 23 | std::vector x = Rcpp::as >(_xx[i]); 24 | for(unsigned int j = 0; j < x.size(); j++) { 25 | results[i] += x[j];}} 26 | return Rcpp::wrap(results);} -------------------------------------------------------------------------------- /pkg/src/extras.h: -------------------------------------------------------------------------------- 1 | //Copyright 2011 Revolution Analytics 2 | // 3 | //Licensed under the Apache License, Version 2.0 (the "License"); 4 | //you may not use this file except in compliance with the License. 5 | //You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | //Unless required by applicable law or agreed to in writing, software 10 | //distributed under the License is distributed on an "AS IS" BASIS, 11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | //See the License for the specific language governing permissions and 13 | //limitations under the License. 14 | 15 | #ifndef _RMR_EXTRAS_H 16 | #define _RMR_EXTRAS_H 17 | 18 | #include 19 | 20 | RcppExport SEXP vsum(SEXP xx); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /pkg/src/hbase-io: -------------------------------------------------------------------------------- 1 | ../../hadoopy_hbase/java/ -------------------------------------------------------------------------------- /pkg/src/hbase-to-df.cpp: -------------------------------------------------------------------------------- 1 | //Copyright 2012 Revolution Analytics 2 | // 3 | //Licensed under the Apache License, Version 2.0 (the "License"); 4 | //you may not use this file except in compliance with the License. 5 | //You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | //Unless required by applicable law or agreed to in writing, software 10 | //distributed under the License is distributed on an "AS IS" BASIS, 11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | //See the License for the specific language governing permissions and 13 | //limitations under the License. 
14 | 15 | #include "hbase-to-df.h" 16 | typedef std::deque raw; 17 | 18 | std::string raw_to_string(SEXP source) { 19 | Rcpp::RawVector raw_source(source); 20 | std::string retval(raw_source.size(), 'a'); 21 | std::copy(raw_source.begin(), raw_source.end(), retval.begin()); 22 | return retval;} 23 | 24 | SEXP raw_list_to_character(SEXP _source) { 25 | Rcpp::List source(_source); 26 | Rcpp::CharacterVector dest(source.size()); 27 | for(unsigned int i = 0; i < source.size(); i++) { 28 | dest[i] = raw_to_string(source[i]);} 29 | return Rcpp::wrap(dest);} 30 | 31 | SEXP string_to_raw(std::string source) { 32 | Rcpp::RawVector retval(source.size()); 33 | std::copy(source.begin(), source.end(), retval.begin()); 34 | return Rcpp::wrap(retval);} 35 | 36 | SEXP p_string_to_raw(SEXP _source) { 37 | std::vector source = Rcpp::as >(_source); 38 | Rcpp::List retval(source.size()); 39 | for(unsigned int i = 0; i < source.size(); i++) { 40 | retval[i] = Rcpp::wrap(string_to_raw(source[i]));} 41 | return Rcpp::wrap(retval);} 42 | 43 | SEXP hbase_to_df(SEXP _source, SEXP _dest) { 44 | int l = 0; 45 | 46 | Rcpp::List dest(_dest); 47 | Rcpp::List dest_key = Rcpp::as(dest["key"]); 48 | Rcpp::List dest_family = Rcpp::as(dest["family"]); 49 | Rcpp::List dest_column = Rcpp::as(dest["column"]); 50 | Rcpp::List dest_cell = Rcpp::as(dest["cell"]); 51 | 52 | Rcpp::List source(_source); 53 | Rcpp::List key1 = Rcpp::as(source["key"]); 54 | Rcpp::List val1 = Rcpp::as(source["val"]); 55 | 56 | for(unsigned int i = 0; i < key1.size(); i ++) { 57 | Rcpp::List val1_i = Rcpp::as(val1[i]); 58 | Rcpp::List key2 = Rcpp::as(val1_i["key"]); 59 | Rcpp::List val2 = Rcpp::as(val1_i["val"]); 60 | for(unsigned int j = 0; j < key2.size(); j++) { 61 | Rcpp::List val2_j = Rcpp::as(val2[j]); 62 | Rcpp::List key3 = Rcpp::as(val2_j["key"]); 63 | Rcpp::List val3 = Rcpp::as(val2_j["val"]); 64 | for(unsigned int k = 0; k < key3.size(); k++) { 65 | dest_family[l] = Rcpp::wrap(key2[j]); 66 | dest_column[l] = Rcpp::wrap(key3[k]); 67 | dest_key[l] = Rcpp::wrap(key1[i]); 68 | dest_cell[l] = Rcpp::wrap(val3[k]); 69 | l++;}}} 70 | return Rcpp::wrap( 71 | Rcpp::List::create( 72 | Rcpp::Named("data.frame") = Rcpp::wrap(_dest), 73 | Rcpp::Named("nrows") = Rcpp::wrap(l)));} 74 | -------------------------------------------------------------------------------- /pkg/src/hbase-to-df.h: -------------------------------------------------------------------------------- 1 | //Copyright 2012 Revolution Analytics 2 | // 3 | //Licensed under the Apache License, Version 2.0 (the "License"); 4 | //you may not use this file except in compliance with the License. 5 | //You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | //Unless required by applicable law or agreed to in writing, software 10 | //distributed under the License is distributed on an "AS IS" BASIS, 11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | //See the License for the specific language governing permissions and 13 | //limitations under the License. 
14 | 15 | #ifndef _RMR_HBASE_TO_DF_H 16 | #define _RMR_HBASE_TO_DF_H 17 | 18 | #include 19 | 20 | 21 | RcppExport SEXP hbase_to_df(SEXP _source, SEXP _dest); 22 | RcppExport SEXP p_string_to_raw(SEXP _source); 23 | RcppExport SEXP raw_list_to_character(SEXP _source); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /pkg/src/keyval.cpp: -------------------------------------------------------------------------------- 1 | //Copyright 2013 Revolution Analytics 2 | // 3 | //Licensed under the Apache License, Version 2.0 (the "License"); 4 | //you may not use this file except in compliance with the License. 5 | //You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | //Unless required by applicable law or agreed to in writing, software 10 | //distributed under the License is distributed on an "AS IS" BASIS, 11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | //See the License for the specific language governing permissions and 13 | //limitations under the License. 14 | 15 | #include "keyval.h" 16 | #include 17 | #include 18 | 19 | using namespace Rcpp; 20 | 21 | SEXP null_purge(SEXP xx) { 22 | List _xx(xx); 23 | int n = _xx.size(); 24 | int not_null_count = 0; 25 | for (int i = 0; i < n; i ++) 26 | if (!Rf_isNull(_xx[i])) not_null_count++; 27 | List yy(not_null_count); 28 | for (int i = 0, j = 0; i < n; i ++) 29 | if (!Rf_isNull(_xx[i])){ 30 | yy[j] = _xx[i]; 31 | j++;} 32 | return wrap(yy);} 33 | 34 | SEXP lapply_as_character(SEXP xx) { 35 | List _xx(xx); 36 | List yy(_xx.size()); 37 | for (int i = 0; i < _xx.size(); i ++) 38 | yy[i] = Rf_asCharacterFactor(_xx[i]); 39 | return wrap(yy);} 40 | 41 | int rmr_length(SEXP x) { 42 | if(Rf_isMatrix(x)) 43 | return Rf_nrows(x); 44 | RObject _x(x); 45 | if (_x.hasAttribute("class")) { 46 | if(as(_x.attr("class")) == "data.frame") { 47 | List __x(x); 48 | if(Rf_length(__x) == 0) { 49 | return(0);} 50 | else { 51 | return(Rf_length(__x[0]));}}} 52 | return Rf_length(x);} 53 | 54 | SEXP sapply_rmr_length(SEXP xx) { 55 | List _xx(xx); 56 | std::vector results(_xx.size()); 57 | for(unsigned int i = 0; i < _xx.size(); i++) { 58 | results[i] = rmr_length(_xx[i]);} 59 | return(wrap(results));} 60 | 61 | SEXP sapply_rmr_length_lossy_data_frame(SEXP xx){ 62 | List _xx(xx); 63 | std::vector results(_xx.size()); 64 | for(unsigned int i = 0; i < _xx.size(); i++) { 65 | List cols(as(_xx[i])); 66 | results[i] = rmr_length(cols[0]);} 67 | return wrap(results);} 68 | 69 | int length_keyval(SEXP kv) { 70 | List kv_(kv); 71 | int kl = rmr_length(kv_["key"]); 72 | int vl = rmr_length(kv_["val"]); 73 | if (kl > vl) return(kl); 74 | return(vl);} 75 | 76 | SEXP sapply_length_keyval(SEXP kvs) { 77 | List _kvs(kvs); 78 | std::vector results(_kvs.size()); 79 | for(unsigned int i = 0; i < _kvs.size(); i++) { 80 | results[i] = length_keyval(_kvs[i]);} 81 | return(wrap(results));} 82 | 83 | SEXP sapply_null_keys(SEXP kvs) { 84 | List _kvs(kvs); 85 | std::vector results(_kvs.size()); 86 | for(unsigned int i = 0; i < _kvs.size(); i++) { 87 | List kv(wrap(_kvs[i])); 88 | results[i] = Rf_isNull(kv["key"]);} 89 | return(wrap(results));} 90 | 91 | SEXP sapply_is_list(SEXP l) { 92 | List _l(l); 93 | std::vector results(_l.size()); 94 | for(unsigned int i = 0; i < _l.size(); i++) { 95 | results[i] = (as(_l[i]).sexp_type() == VECSXP);} 96 | return wrap(results);} 97 | 98 | SEXP lapply_key_val(SEXP kvs, std::string slot) { 99 | List _kvs(kvs); 100 | List 
results(_kvs.size()); 101 | for(unsigned int i = 0; i < _kvs.size(); i++) { 102 | List kv(wrap(_kvs[i])); 103 | results[i] = kv[slot];} 104 | return wrap(results);} 105 | 106 | SEXP lapply_keys(SEXP kvs) { 107 | return lapply_key_val(kvs, "key");} 108 | 109 | SEXP lapply_values(SEXP kvs) { 110 | return lapply_key_val(kvs, "val");} 111 | 112 | SEXP are_factor(SEXP xx) { 113 | List _xx(xx); 114 | std::vector results(_xx.size()); 115 | for(unsigned int i = 0; i < _xx.size(); i++) { 116 | results[i] = Rf_isFactor(_xx[i]);} 117 | return wrap(results);} 118 | 119 | bool is_data_frame(SEXP x) { 120 | RObject _x(x); 121 | if (_x.hasAttribute("class")) { 122 | if(as(_x.attr("class")) == "data.frame") { 123 | return true;}} 124 | return false;} 125 | 126 | SEXP are_data_frame(SEXP xx) { 127 | List _xx(xx); 128 | std::vector results(_xx.size()); 129 | for(unsigned int i = 0; i < _xx.size(); i++) { 130 | results[i] = is_data_frame(_xx[i]);} 131 | return wrap(results);} 132 | 133 | SEXP are_matrix(SEXP xx) { 134 | List _xx(xx); 135 | std::vector results(_xx.size()); 136 | for(unsigned int i = 0; i < _xx.size(); i++) { 137 | results[i] = Rf_isMatrix(_xx[i]);} 138 | return wrap(results);} -------------------------------------------------------------------------------- /pkg/src/keyval.h: -------------------------------------------------------------------------------- 1 | //Copyright 2013 Revolution Analytics 2 | // 3 | //Licensed under the Apache License, Version 2.0 (the "License"); 4 | //you may not use this file except in compliance with the License. 5 | //You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | //Unless required by applicable law or agreed to in writing, software 10 | //distributed under the License is distributed on an "AS IS" BASIS, 11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | //See the License for the specific language governing permissions and 13 | //limitations under the License. 14 | 15 | #ifndef _RMR_KEYVAL_H 16 | #define _RMR_KEYVAL_H 17 | 18 | #include 19 | 20 | RcppExport SEXP null_purge(SEXP xx); 21 | RcppExport SEXP lapply_as_character(SEXP xx); 22 | RcppExport SEXP sapply_rmr_length(SEXP xx); 23 | RcppExport SEXP sapply_rmr_length_lossy_data_frame(SEXP xx); 24 | RcppExport SEXP sapply_length_keyval(SEXP kvs); 25 | RcppExport SEXP sapply_null_keys(SEXP kvs); 26 | RcppExport SEXP sapply_is_list(SEXP l); 27 | RcppExport SEXP lapply_keys(SEXP kvs); 28 | RcppExport SEXP lapply_values(SEXP kvs); 29 | RcppExport SEXP are_factor(SEXP xx); 30 | RcppExport SEXP are_data_frame(SEXP xx); 31 | RcppExport SEXP are_matrix(SEXP xx); 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /pkg/src/t-list.cpp: -------------------------------------------------------------------------------- 1 | //Copyright 2013 Revolution Analytics 2 | // 3 | //Licensed under the Apache License, Version 2.0 (the "License"); 4 | //you may not use this file except in compliance with the License. 5 | //You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | //Unless required by applicable law or agreed to in writing, software 10 | //distributed under the License is distributed on an "AS IS" BASIS, 11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | //See the License for the specific language governing permissions and 13 | //limitations under the License. 
14 | 15 | #include "t-list.h" 16 | 17 | using namespace Rcpp; 18 | using std::vector; 19 | using std::cerr; 20 | using std::endl; 21 | 22 | 23 | SEXP t_list(SEXP _ll) { 24 | List ll(_ll); 25 | List l_0(as(ll[0])); 26 | List tll(l_0.size()); 27 | for(unsigned int j = 0; j < tll.size(); j++) 28 | tll[j] = List(ll.size()); 29 | for(unsigned int i = 0; i < ll.size(); i++) { 30 | List l_i(as(ll[i])); 31 | for(unsigned int j = 0; j < tll.size(); j++) { 32 | as(tll[j])[i] = l_i[j];};} 33 | return wrap(tll);} -------------------------------------------------------------------------------- /pkg/src/t-list.h: -------------------------------------------------------------------------------- 1 | //Copyright 2013 Revolution Analytics 2 | // 3 | //Licensed under the Apache License, Version 2.0 (the "License"); 4 | //you may not use this file except in compliance with the License. 5 | //You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | //Unless required by applicable law or agreed to in writing, software 10 | //distributed under the License is distributed on an "AS IS" BASIS, 11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | //See the License for the specific language governing permissions and 13 | //limitations under the License. 14 | 15 | #ifndef _RMR_TLIST_H 16 | #define _RMR_TLIST_H 17 | 18 | #include 19 | 20 | 21 | RcppExport SEXP t_list(SEXP _ll); 22 | 23 | 24 | #endif -------------------------------------------------------------------------------- /pkg/src/typed-bytes.h: -------------------------------------------------------------------------------- 1 | //Copyright 2011 Revolution Analytics 2 | // 3 | //Licensed under the Apache License, Version 2.0 (the "License"); 4 | //you may not use this file except in compliance with the License. 5 | //You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | //Unless required by applicable law or agreed to in writing, software 10 | //distributed under the License is distributed on an "AS IS" BASIS, 11 | //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | //See the License for the specific language governing permissions and 13 | //limitations under the License. 14 | 15 | #ifndef _RMR_TYPEDBYTES_H 16 | #define _RMR_TYPEDBYTES_H 17 | 18 | #include 19 | 20 | 21 | RcppExport SEXP typedbytes_reader(SEXP data); 22 | RcppExport SEXP typedbytes_writer(SEXP data, SEXP native); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /pkg/tests/IO.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | ## test for typed bytes read/write 16 | library(quickcheck) 17 | library(rmr2) 18 | 19 | 20 | # test( 21 | # function(l) { 22 | # l = rapply(l, how = 'replace', 23 | # function(x){ 24 | # if(is.null(x)) list() 25 | # else as.list(x)}) 26 | # isTRUE(all.equal(l, 27 | # rmr2:::typedbytes.reader(rmr2:::typedbytes.writer(l), length(l) + 5)$objects, 28 | # check.attributes = FALSE))}, 29 | # generators = list(rlist)) 30 | -------------------------------------------------------------------------------- /pkg/tests/avro.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | library(rmr2) 16 | library(testthat) 17 | library(ravro) 18 | library(rhdfs) 19 | hdfs.init() 20 | 21 | rmr.options(backend = "hadoop") 22 | 23 | pathname = ravro::AVRO_TOOLS 24 | if(.Platform$OS.type == "windows") { 25 | subfname = strsplit(pathname, ":") 26 | if(length(subfname[[1]]) > 1) 27 | { 28 | pathname = subfname[[1]][2] 29 | } 30 | pathname = gsub("\"","",pathname) 31 | pathname = shortPathName(pathname) 32 | pathname = gsub("\\\\","/",pathname)} 33 | Sys.setenv(AVRO_LIBS = pathname) 34 | 35 | test_avro_rmr <- 36 | function(df, test, write.args = list(), 37 | input.format.args = list(), map = function(k, v) v ) { 38 | if(rmr.options("backend") == "local") TRUE 39 | else { 40 | tf1 = tempfile(fileext = ".avro") 41 | expect_true(do.call(ravro:::write.avro, c(list(df, tf1), write.args))) 42 | tf2 = "/tmp/rmr2.test" 43 | tf3 = file.path(tf2, "data.avro") 44 | hdfs.mkdir(tf2) 45 | hdfs.put(tf1, tf3) 46 | on.exit(hdfs.rmr(tf2)) 47 | df.input.format <- do.call(make.input.format, 48 | c(list( 49 | format = "avro", 50 | schema.file = tf1), 51 | input.format.args)) 52 | retdf <- values( 53 | from.dfs( 54 | mapreduce( 55 | tf2, 56 | map = map, 57 | input.format = df.input.format))) 58 | retdf <- retdf[row.names(df), ] 59 | attributes(retdf) <- attributes(retdf)[names(attributes(df))] 60 | test(retdf) 61 | }} 62 | 63 | expect_equal_avro_rmr <- function(df, ...){ 64 | row.names(df) <- row.names(df) # rmr2 uses row.names function which coerces to character 65 | # We need to make sure row.names for x is character or else this will always fail 66 | test_avro_rmr(df, function(x)expect_equal(x, df), ...) 67 | } 68 | 69 | expect_equivalent_avro_rmr <- function(df, ...) 70 | test_avro_rmr(df, function(x)expect_equivalent(x, df), ...) 
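# Note: the helpers above exercise a full round trip -- write a data frame to a
# local .avro file with ravro, push it to HDFS, read it back through a mapreduce
# job using the "avro" input format, then compare the result with the original
# data frame (expect_equal_avro_rmr also checks attributes such as row names,
# while expect_equivalent_avro_rmr ignores them).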
71 | 72 | d <- data.frame(x = 1, 73 | y = as.factor(1:10), 74 | fac = as.factor(sample(letters[1:3], 10, replace = TRUE))) 75 | expect_equivalent_avro_rmr(d) 76 | 77 | 78 | ########################################################################################## 79 | 80 | context("Basic Avro Read/Write") 81 | 82 | ### Handeling Factors 83 | # Warnings: Factor levels converted to valid Avro names 84 | 85 | test_that("Handling factors", { 86 | # Factors with non-"name" levels should still work 87 | d <- data.frame(x = 1, 88 | y = as.factor(1:10), 89 | fac = as.factor(sample(letters[1:3], 10, replace = TRUE))) 90 | expect_equivalent_avro_rmr(d) #order of levels can change 91 | }) 92 | 93 | 94 | ### Type Translation 95 | 96 | test_that("type translation", { 97 | # All types should translate successfully 98 | L3 <- LETTERS[1:3] 99 | fac <- sample(L3, 10, replace = TRUE) 100 | d <- data.frame(x = 1, y = 1:10, fac = fac, b = rep(c(TRUE, FALSE), 5), c = rep(NA, 10), 101 | stringsAsFactors = FALSE) 102 | expect_equal_avro_rmr(d) 103 | 104 | d <- data.frame(x = 1, y = 1:10, fac = factor(fac, levels = L3), 105 | b = rep(c(TRUE, FALSE), 5), c = rep(NA, 10), 106 | stringsAsFactors = FALSE) 107 | expect_equivalent_avro_rmr(d) 108 | }) 109 | 110 | ### write can handle missing values 111 | 112 | test_that("write can handle missing values", { 113 | # NA column (entirely "null" in Avro) 114 | d <- data.frame(x = 1, 115 | y = 1:10, 116 | b = rep(c(TRUE, FALSE), 5), 117 | c = rep(NA, 10), 118 | stringsAsFactors = FALSE) 119 | expect_equal_avro_rmr(d) 120 | 121 | # NA row (entirely "null" in Avro) 122 | d <- rbind(data.frame(x = 1, 123 | y = 1:10, 124 | b = rep(c(TRUE, FALSE), 5)), 125 | rep(NA, 3)) 126 | expect_equal_avro_rmr(d) 127 | }) 128 | 129 | ### NaNs throw warning 130 | 131 | test_that("NaNs throw warning", { 132 | # NaN row (entirely "null" in Avro) 133 | d <- rbind(data.frame(x = 1, 134 | y = 1:10, 135 | b = rep(c(TRUE, FALSE), 5)), 136 | rep(NaN, 3)) 137 | d[nrow(d), ] <- NA 138 | expect_equal_avro_rmr(d) 139 | 140 | # NaN row (entirely "null" in Avro) 141 | d <- cbind(data.frame(x = 1, 142 | y = 1:10, 143 | b = rep(c(TRUE, FALSE), 5)), 144 | c = rep(NaN, 10)) 145 | d[, ncol(d)] <- as.numeric(NA) # coerce this type 146 | expect_equal_avro_rmr(d) 147 | }) 148 | 149 | ### write.avro throws error on infinite values 150 | ## Infinite values cannot be serialied to Avro (which is good, what test verifies) 151 | 152 | test_that("write.avro throws error on infinite values", { 153 | d <- rbind(data.frame(x = 1, y = 1:10, b = rep(c(TRUE, FALSE), 5)), rep(NA, 3), 154 | c(Inf, 11, TRUE, NA)) 155 | expect_that(expect_equal_avro_rmr(d), throws_error()) 156 | 157 | d <- rbind(data.frame(x = 1, y = 1:10, b = rep(c(TRUE, FALSE), 5)), rep(NA, 3), 158 | c(-Inf, 11, TRUE, NA)) 159 | expect_that(expect_equal_avro_rmr(d), throws_error()) 160 | }) 161 | 162 | ############################ Read/Write mtcars and iris ############################### 163 | 164 | context("Read/Write mtcars and iris") 165 | 166 | ### mtcars round trip 167 | 168 | test_that("mtcars round trip", { 169 | expect_equal_avro_rmr(mtcars) 170 | }) 171 | 172 | 173 | ### factors level that are not Avro names read/write 174 | ## mttmp equivalent despite refactorization (good, warnings) 175 | # 1: In (function (x, name = NULL, namespace = NULL, is.union = F, row.names = T, : 176 | # Factor levels converted to valid Avro names: _3_ravro, _4_ravro, _5_ravro 177 | 178 | test_that("factors level that are not Avro names read/write", { 179 | mttmp <- mtcars 180 
| mttmp$gear_factor <- as.factor(mttmp$gear) 181 | expect_equal_avro_rmr(mttmp) 182 | }) 183 | 184 | 185 | ### iris round trip 186 | ## iris_avro not equivalent 187 | # Length mismatch: comparison on first 3 components 188 | 189 | test_that("iris round trip", { 190 | # This doesn't work, because rmr2::from.dfs uses rbind to combine the values together 191 | #expect_equal_avro_rmr(iris, write.args = list(unflatten = T), input.format.args = list(flatten = F)) 192 | 193 | expect_equal_avro_rmr(iris, write.args = list(unflatten = T), input.format.args = list(flatten = T)) 194 | }) 195 | 196 | -------------------------------------------------------------------------------- /pkg/tests/basic-examples.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # lapply like job, first intro 16 | 17 | library(rmr2) 18 | 19 | for (be in c("local", "hadoop")) { 20 | rmr.options(backend = be) 21 | ## @knitr lapply 22 | small.ints = 1:1000 23 | sapply(small.ints, function(x) x^2) 24 | ## @knitr lapply-mapreduce 25 | small.ints = to.dfs(1:1000) 26 | mapreduce( 27 | input = small.ints, 28 | map = function(k, v) cbind(v, v^2)) 29 | ## @knitr end 30 | from.dfs( 31 | mapreduce( 32 | input = small.ints, 33 | map = function(k, v) cbind(v, v^2))) 34 | 35 | # tapply like job 36 | ## @knitr tapply 37 | groups = rbinom(32, n = 50, prob = 0.4) 38 | tapply(groups, groups, length) 39 | ## @knitr tapply-mapreduce 40 | groups = to.dfs(groups) 41 | from.dfs( 42 | mapreduce( 43 | input = groups, 44 | map = function(., v) keyval(v, 1), 45 | reduce = 46 | function(k, vv) 47 | keyval(k, length(vv)))) 48 | ## @knitr end 49 | 50 | ## input can be any rmr-native format file 51 | ## pred can be function(x) x > 0 52 | ## it will be evaluated on the value only, not on the key 53 | 54 | ## @knitr basic.examples-filter 55 | filter.map = 56 | function(pred) 57 | function(., v) {v[pred(v)]} 58 | 59 | mrfilter = 60 | function (input, 61 | output = NULL, 62 | pred) { 63 | mapreduce( 64 | input = input, 65 | output = output, 66 | map = filter.map(pred))} 67 | 68 | filtertest = to.dfs(rnorm(10)) 69 | from.dfs( 70 | mrfilter( 71 | input = filtertest, 72 | pred = function(x) x > 0)) 73 | } 74 | ## @knitr end 75 | 76 | ## pipeline of two filters, sweet 77 | # from.dfs(mrfilter(input = mrfilter( 78 | # input = "/tmp/filtertest/", 79 | # pred = function(x) x > 0), 80 | # pred = function(x) x < 0.5)) 81 | -------------------------------------------------------------------------------- /pkg/tests/basic.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # the problem with the tests here is that they are more complex than the function they are meant to test 16 | # or just a duplication. That's not very useful. 17 | 18 | library(rmr2) 19 | library(quickcheck) 20 | 21 | #qw 22 | test( 23 | function(ss = rcharacter()) { 24 | ss = paste("v", ss, sep = "") 25 | all(ss == eval(parse(text = paste("rmr2:::qw(", paste(ss, collapse = ","), ")"))))}) 26 | 27 | # Make.single.arg 28 | test( 29 | function(l = rlist()) { 30 | f = function(...) list(...) 31 | g = rmr2:::Make.single.arg(f) 32 | identical(do.call(f, l), g(l))}) 33 | 34 | # Make.multi.arg 35 | test( 36 | function(l = rlist()) { 37 | f = function(x) x 38 | g = rmr2:::Make.multi.arg(f) 39 | identical(do.call(g, l), f(l))}) 40 | 41 | # Make.single.or.multi.arg 42 | test( 43 | function( 44 | l = rlist(size = c(min = 2)), 45 | arity = sample(c("single", "multi"), size = 1)) { 46 | f = if(arity == "single") unlist else c 47 | g = rmr2:::Make.single.or.multi.arg(f, from = arity) 48 | identical(g(l), do.call(g, l))}) 49 | 50 | #%:% TODO 51 | # all.predicate TODO 52 | 53 | # make.fast.list TODO 54 | # actually the function has been working forever, the test doesn't 55 | 56 | # test( 57 | # function(l){ 58 | # fl = rmr2:::make.fast.list() 59 | # lapply(l, fl) 60 | # print(x=as.list(do.call(c, l))) 61 | # print(x=fl()) 62 | # identical(as.list(do.call(c, l)), fl())}, 63 | # list(Curry(rlist,lambda=1, max.level=8))) 64 | # 65 | 66 | #named.slice TODO 67 | #interleave TODO 68 | 69 | -------------------------------------------------------------------------------- /pkg/tests/benchmarks.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | library(rmr2) 16 | 17 | report = list() 18 | for (be in c("local", "hadoop")) { 19 | rmr.options(backend = be) 20 | ## @knitr input 21 | input.size = { 22 | if(rmr.options('backend') == "local") 23 | 10^4 24 | else 25 | 10^6} 26 | ## @knitr end 27 | report[[be]] = 28 | rbind( 29 | report[[be]], 30 | write = 31 | system.time({ 32 | ## @knitr write 33 | input = to.dfs(1:input.size) 34 | ## @knitr end 35 | })) 36 | 37 | report[[be]] = 38 | rbind( 39 | report[[be]], 40 | read = 41 | system.time({ 42 | out = 43 | ## @knitr read 44 | from.dfs(input) 45 | ## @knitr end 46 | })) 47 | stopifnot( 48 | all( 49 | 1:input.size == sort(values(out)))) 50 | 51 | report[[be]] = 52 | rbind( 53 | report[[be]], 54 | pass.through = system.time({ 55 | out = 56 | ## @knitr pass-through 57 | mapreduce( 58 | input, 59 | map = function(k, v) keyval(k, v)) 60 | ## @knitr end 61 | })) 62 | stopifnot( 63 | all( 64 | 1:input.size == 65 | sort(values(from.dfs(out))))) 66 | 67 | ## @knitr predicate 68 | predicate = 69 | function(., v) v%%2 == 0 70 | ## @knitr end 71 | report[[be]] = 72 | rbind( 73 | report[[be]], 74 | filter = system.time({ 75 | out = 76 | ## @knitr filter 77 | mapreduce( 78 | input, 79 | map = 80 | function(k, v) { 81 | filter = predicate(k, v) 82 | keyval(k[filter], v[filter])}) 83 | ## @knitr end 84 | })) 85 | stopifnot( 86 | all( 87 | 2*(1:(input.size/2)) == 88 | sort(values(from.dfs(out))))) 89 | 90 | ## @knitr select-input 91 | input.select = 92 | to.dfs( 93 | data.frame( 94 | a = rnorm(input.size), 95 | b = 1:input.size, 96 | c = sample(as.character(1:10), 97 | input.size, 98 | replace=TRUE))) 99 | ## @knitr end 100 | report[[be]] = 101 | rbind( 102 | report[[be]], 103 | select = system.time({ 104 | out = 105 | ## @knitr select 106 | mapreduce(input.select, 107 | map = function(., v) v$b) 108 | ## @knitr end 109 | })) 110 | stopifnot( 111 | all( 112 | 1:input.size == 113 | sort(values(from.dfs(out))))) 114 | 115 | ## @knitr bigsum-input 116 | set.seed(0) 117 | big.sample = rnorm(input.size) 118 | input.bigsum = to.dfs(big.sample) 119 | ## @knitr end 120 | report[[be]] = 121 | rbind( 122 | report[[be]], 123 | bigsum = system.time({ 124 | out = 125 | ## @knitr bigsum 126 | mapreduce( 127 | input.bigsum, 128 | map = 129 | function(., v) keyval(1, sum(v)), 130 | reduce = 131 | function(., v) keyval(1, sum(v)), 132 | combine = TRUE) 133 | ## @knitr end 134 | })) 135 | stopifnot( 136 | isTRUE( 137 | all.equal( 138 | sum(values(from.dfs(out))), 139 | sum(big.sample), 140 | tolerance=.000001))) 141 | ## @knitr group-aggregate-input 142 | input.ga = 143 | to.dfs( 144 | cbind( 145 | 1:input.size, 146 | rnorm(input.size))) 147 | ## @knitr group-aggregate-functions 148 | group = function(x) x%%10 149 | aggregate = function(x) sum(x) 150 | ## @knitr end 151 | report[[be]] = 152 | rbind( 153 | report[[be]], 154 | group.aggregate = system.time({ 155 | out = 156 | ## @knitr group-aggregate 157 | mapreduce( 158 | input.ga, 159 | map = 160 | function(k, v) 161 | keyval(group(v[,1]), v[,2]), 162 | reduce = 163 | function(k, vv) 164 | keyval(k, aggregate(vv)), 165 | combine = TRUE) 166 | ## @knitr end 167 | })) 168 | log.input.size = log10(input.size) 169 | z = plyr::splat(rbind)( 170 | c( 171 | lapply(0:log.input.size, function(i) system.time(to.dfs(keyval(data.frame(1:10^i), data.frame(1:10^log.input.size))))), 172 | lapply(0:log.input.size, function(i) {z = to.dfs(keyval(data.frame(1:10^i), data.frame(1:10^log.input.size))); system.time(from.dfs(z))}), 173 | lapply(0:log.input.size, function(i) {z = 
to.dfs(keyval(data.frame(1:10^i), data.frame(1:10^log.input.size))); system.time(mapreduce(z))}), 174 | lapply(0:(log.input.size-2), function(i) {z = to.dfs(keyval(data.frame(1:10^i), data.frame(1:10^log.input.size))); 175 | system.time(mapreduce(z, reduce = function(k,v) as.data.frame(t(colSums(v)))))}))) 176 | row.names(z) = make.names(t(outer(c("to.dfs","from.dfs", "map only", "map reduce"), c(0:log.input.size), paste)))[1:(4*(1 + log.input.size) - 2)] 177 | report[[be]] = rbind(report[[be]], z) 178 | } 179 | 180 | 181 | print(report) 182 | 183 | -------------------------------------------------------------------------------- /pkg/tests/getting-data-in-and-out.R: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # partly from community contributed examples (with code) 16 | # additional copyrights may apply 17 | 18 | library(rmr2) 19 | ## @knitr getting-data.IO.formats 20 | rmr2:::IO.formats 21 | ## @knitr getting-data.make.input.format.csv 22 | make.input.format("csv") 23 | ## @knitr getting-data.make.output.format.csv 24 | make.output.format("csv") 25 | ## @knitr getting-data.generic.list 26 | my.data = list(TRUE, list("nested list", 7.2), seq(1:3), letters[1:4], matrix(1:25, nrow = 5,ncol = 5)) 27 | ## @knitr getting-data.to.dfs 28 | hdfs.data = to.dfs(my.data) 29 | ## @knitr getting-data.object.length.frequency 30 | result = mapreduce( 31 | input = hdfs.data, 32 | map = function(k, v) keyval(lapply(v, length), 1), 33 | reduce = function(k, vv) keyval(k, sum(vv))) 34 | 35 | from.dfs(result) 36 | ## @knitr end 37 | ## @knitr getting-data.tsv.reader 38 | tsv.reader = function(con){ 39 | lines = readLines(con, 1000) 40 | if(length(lines) == 0) 41 | NULL 42 | else { 43 | delim = strsplit(lines, split = "\t") 44 | keyval( 45 | sapply(delim, 46 | function(x) x[1]), 47 | sapply(delim, 48 | function(x) x[-1]))}} 49 | ## first column is the key, note that column indexes moved by 1 50 | ## @knitr getting-data.tsv.input.format 51 | tsv.format = 52 | make.input.format( 53 | format = tsv.reader, 54 | mode = "text") 55 | ## @knitr getting-data.generate.tsv.data 56 | 57 | tsv.data = 58 | to.dfs( 59 | data.frame( 60 | x = 1:100, 61 | y = rnorm(100), 62 | z = runif(100), 63 | w = 1:100), 64 | format = 65 | make.output.format("csv", sep = "\t")) 66 | ## @knitr getting-data.frequency.count 67 | freq.counts = 68 | mapreduce( 69 | input = tsv.data, 70 | input.format = tsv.format, 71 | map = function(k, v) keyval(v[1,], 1), 72 | reduce = function(k, vv) keyval(k, sum(vv))) 73 | ## @knitr getting-data.named.columns 74 | tsv.reader = 75 | function(con){ 76 | lines = readLines(con, 1000) 77 | if(length(lines) == 0) 78 | NULL 79 | else { 80 | delim = strsplit(lines, split = "\t") 81 | keyval( 82 | sapply(delim, function(x) x[1]), 83 | data.frame( 84 | location = sapply(delim, function(x) x[2]), 85 | name = sapply(delim, function(x) x[3]), 86 | 
value = sapply(delim, function(x) x[4])))}} 87 | 88 | ## @knitr getting-data.tsv.input.format.1 89 | tsv.format = 90 | make.input.format( 91 | format = tsv.reader, 92 | mode = "text") 93 | ## @knitr getting-data.named.column.access 94 | freq.counts = 95 | mapreduce( 96 | input = tsv.data, 97 | input.format = tsv.format, 98 | map = 99 | function(k, v) { 100 | filter = (v$name == "blarg") 101 | keyval(k[filter], log(as.numeric(v$value[filter])))}, 102 | reduce = function(k, vv) keyval(k, mean(vv))) 103 | ## @knitr getting-data.csv.output 104 | csv.writer = function(kv, con){ 105 | cat( 106 | paste( 107 | apply(cbind(1:32, mtcars), 108 | 1, 109 | paste, collapse = ","), 110 | collapse = "\n"), 111 | file = con)} 112 | ## @knitr getting-data.csv.output.simpler 113 | csv.format = make.output.format("csv", sep = ",") 114 | ## @knitr getting-data.explicit.output.arg 115 | mapreduce( 116 | input = hdfs.data, 117 | output = tempfile(), 118 | output.format = csv.format, 119 | map = function(k, v){ 120 | # complicated function here 121 | keyval(1, v)}, 122 | reduce = function(k, vv) { 123 | #complicated function here 124 | keyval(k, vv[[1]])}) 125 | ## @knitr getting-data.create.fields.list 126 | fields <- rmr2:::qw(mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb) 127 | field.size = 8 128 | ## @knitr getting-data.fwf.reader 129 | fwf.reader <- function(con) { 130 | lines <- readLines(con, 1000) 131 | if (length(lines) == 0) { 132 | NULL} 133 | else { 134 | split.lines = unlist(strsplit(lines, "")) 135 | df = 136 | as.data.frame( 137 | matrix( 138 | sapply( 139 | split( 140 | split.lines, 141 | ceiling(1:length(split.lines)/field.size)), 142 | paste, collapse = ""), 143 | ncol = length(fields), byrow = TRUE)) 144 | names(df) = fields 145 | keyval(NULL, df)}} 146 | fwf.input.format = make.input.format(mode = "text", format = fwf.reader) 147 | ## @knitr getting-data.fwf.writer 148 | fwf.writer <- function(kv, con) { 149 | ser = 150 | function(df) 151 | paste( 152 | apply( 153 | df, 154 | 1, 155 | function(x) 156 | paste( 157 | format( 158 | x, 159 | width = field.size), 160 | collapse = "")), 161 | collapse = "\n") 162 | out = ser(values(kv)) 163 | writeLines(out, con = con)} 164 | fwf.output.format = make.output.format(mode = "text", format = fwf.writer) 165 | ## @knitr getting-data.generate.fwf.data 166 | fwf.data <- to.dfs(mtcars, format = fwf.output.format) 167 | ## @knitr getting-data.from.dfs.one.line 168 | out <- from.dfs(mapreduce(input = fwf.data, 169 | input.format = fwf.input.format)) 170 | out$val 171 | ## @knitr getting-data.cyl.frequency.count 172 | out <- from.dfs(mapreduce(input = fwf.data, 173 | input.format = fwf.input.format, 174 | map = function(key, value) keyval(value[,"cyl"], 1), 175 | reduce = function(key, value) keyval(key, sum(unlist(value))), 176 | combine = TRUE)) 177 | df <- data.frame(out$key, out$val) 178 | names(df) <- c("cyl","count") 179 | df 180 | ## @knitr end 181 | 182 | -------------------------------------------------------------------------------- /pkg/tests/keyval.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | library(quickcheck) 16 | library(rmr2) 17 | 18 | #has.rows 19 | test( 20 | function(x = rmr2:::rrmr.data()) { 21 | is.null(nrow(x)) == !rmr2:::has.rows(x)}) 22 | 23 | #all.have rows TODO 24 | #rmr.length TODO 25 | 26 | #keyval, keys.values 27 | test( 28 | function(k = rmr2:::rrmr.data(size = c(min = 1)), v = rmr2:::rrmr.data(size = ~rmr2:::rmr.length(k))){ 29 | kv = keyval(k, v) 30 | identical(keys(kv), k) && 31 | identical(values(kv), v)}) 32 | 33 | #NULL key case 34 | test( 35 | function(v = rmr2:::rrmr.data(size = c(min = 1))){ 36 | k = NULL 37 | kv = keyval(k, v) 38 | identical(keys(kv), k) && 39 | identical(values(kv), v)}) -------------------------------------------------------------------------------- /pkg/tests/kmeans.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
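## kmeans.mr below is a MapReduce formulation of Lloyd's algorithm:
## the map step assigns each point to the nearest current center C (or to a
## random cluster on the first iteration, when C is still NULL); the reduce step
## collapses each cluster either to the column means of its points or, when
## combine/in.memory.combine is requested, to column sums of (count, point) rows
## that the driver normalizes back to means; the driver loop repeats this
## num.iter times and re-seeds any clusters that come back empty with random
## linear combinations of the surviving centers.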
14 | 15 | library(rmr2) 16 | 17 | ## @knitr kmeans-signature 18 | kmeans.mr = 19 | function( 20 | P, 21 | num.clusters, 22 | num.iter, 23 | combine, 24 | in.memory.combine) { 25 | ## @knitr kmeans-dist.fun 26 | dist.fun = 27 | function(C, P) { 28 | apply( 29 | C, 30 | 1, 31 | function(x) 32 | colSums((t(P) - x)^2))} 33 | ## @knitr kmeans.map 34 | kmeans.map = 35 | function(., P) { 36 | nearest = { 37 | if(is.null(C)) 38 | sample( 39 | 1:num.clusters, 40 | nrow(P), 41 | replace = TRUE) 42 | else { 43 | D = dist.fun(C, P) 44 | nearest = max.col(-D)}} 45 | if(!(combine || in.memory.combine)) 46 | keyval(nearest, P) 47 | else 48 | keyval(nearest, cbind(1, P))} 49 | ## @knitr kmeans.reduce 50 | kmeans.reduce = { 51 | if (!(combine || in.memory.combine) ) 52 | function(., P) 53 | t(as.matrix(apply(P, 2, mean))) 54 | else 55 | function(k, P) 56 | keyval( 57 | k, 58 | t(as.matrix(apply(P, 2, sum))))} 59 | ## @knitr kmeans-main-1 60 | C = NULL 61 | for(i in 1:num.iter ) { 62 | C = 63 | values( 64 | from.dfs( 65 | mapreduce( 66 | P, 67 | map = kmeans.map, 68 | reduce = kmeans.reduce))) 69 | if(combine || in.memory.combine) 70 | C = C[, -1]/C[, 1] 71 | ## @knitr end 72 | # points(C, col = i + 1, pch = 19) 73 | ## @knitr kmeans-main-2 74 | if(nrow(C) < num.clusters) { 75 | C = 76 | rbind( 77 | C, 78 | matrix( 79 | rnorm( 80 | (num.clusters - 81 | nrow(C)) * nrow(C)), 82 | ncol = nrow(C)) %*% C) }} 83 | C} 84 | ## @knitr end 85 | 86 | ## sample runs 87 | ## 88 | 89 | out = list() 90 | 91 | for(be in c("local", "hadoop")) { 92 | rmr.options(backend = be) 93 | set.seed(0) 94 | ## @knitr kmeans-data 95 | P = 96 | do.call( 97 | rbind, 98 | rep( 99 | list( 100 | matrix( 101 | rnorm(10, sd = 10), 102 | ncol=2)), 103 | 20)) + 104 | matrix(rnorm(200), ncol =2) 105 | ## @knitr end 106 | # x11() 107 | # plot(P) 108 | # points(P) 109 | out[[be]] = 110 | ## @knitr kmeans-run 111 | kmeans.mr( 112 | to.dfs(P), 113 | num.clusters = 12, 114 | num.iter = 5, 115 | combine = FALSE, 116 | in.memory.combine = FALSE) 117 | ## @knitr end 118 | } 119 | 120 | # would love to take this step but kmeans in randomized in a way that makes it hard to be completely reprodubile 121 | # stopifnot(rmr2:::cmp(out[['hadoop']], out[['local']])) 122 | -------------------------------------------------------------------------------- /pkg/tests/linear-least-squares.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
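## The job below solves least squares via the normal equations, accumulated in a
## single pass over row blocks: each map task emits t(Xi) %*% Xi and t(Xi) %*% yi
## for its block, a combiner-enabled reduce sums the small matrices, and the
## resulting dims x dims system is solved in memory with solve(XtX, Xty).
## For this test data the same coefficients can be checked locally, e.g. with
##   solve(crossprod(X), crossprod(X, y))
## or lm.fit(X, y)$coefficients.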
14 | 15 | library(rmr2) 16 | 17 | ## @knitr LLS-data 18 | X = matrix(rnorm(2000), ncol = 10) 19 | X.index = to.dfs(cbind(1:nrow(X), X)) 20 | y = as.matrix(rnorm(200)) 21 | ## @knitr LLS-sum 22 | Sum = 23 | function(., YY) 24 | keyval(1, list(Reduce('+', YY))) 25 | ## @knitr LLS-XtX 26 | XtX = 27 | values( 28 | from.dfs( 29 | mapreduce( 30 | input = X.index, 31 | map = 32 | function(., Xi) { 33 | Xi = Xi[,-1] 34 | keyval(1, list(t(Xi) %*% Xi))}, 35 | reduce = Sum, 36 | combine = TRUE)))[[1]] 37 | ## @knitr LLS-Xty 38 | Xty = 39 | values( 40 | from.dfs( 41 | mapreduce( 42 | input = X.index, 43 | map = function(., Xi) { 44 | yi = y[Xi[,1],] 45 | Xi = Xi[,-1] 46 | keyval(1, list(t(Xi) %*% yi))}, 47 | reduce = Sum, 48 | combine = TRUE)))[[1]] 49 | ## @knitr LLS-solve 50 | solve(XtX, Xty) 51 | ## @knitr end -------------------------------------------------------------------------------- /pkg/tests/logistic-regression.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | ## see spark implementation http://www.spark-project.org/examples.html 17 | ## see nice derivation here http://people.csail.mit.edu/jrennie/writing/lr.pdf 18 | 19 | library(rmr2) 20 | 21 | ## @knitr logistic.regression-signature 22 | logistic.regression = 23 | function(input, iterations, dims, alpha){ 24 | 25 | ## @knitr logistic.regression-map 26 | lr.map = 27 | function(., M) { 28 | Y = M[,1] 29 | X = M[,-1] 30 | keyval( 31 | 1, 32 | Y * X * 33 | g(-Y * as.numeric(X %*% t(plane))))} 34 | ## @knitr logistic.regression-reduce 35 | lr.reduce = 36 | function(k, Z) 37 | keyval(k, t(as.matrix(apply(Z,2,sum)))) 38 | ## @knitr logistic.regression-main 39 | plane = t(rep(0, dims)) 40 | g = function(z) 1/(1 + exp(-z)) 41 | for (i in 1:iterations) { 42 | gradient = 43 | values( 44 | from.dfs( 45 | mapreduce( 46 | input, 47 | map = lr.map, 48 | reduce = lr.reduce, 49 | combine = TRUE))) 50 | plane = plane + alpha * gradient } 51 | plane } 52 | ## @knitr end 53 | 54 | out = list() 55 | test.size = 10^5 56 | for (be in c("local", "hadoop")) { 57 | rmr.options(backend = be) 58 | ## create test set 59 | set.seed(0) 60 | ## @knitr logistic.regression-data 61 | eps = rnorm(test.size) 62 | testdata = 63 | to.dfs( 64 | as.matrix( 65 | data.frame( 66 | y = 2 * (eps > 0) - 1, 67 | x1 = 1:test.size, 68 | x2 = 1:test.size + eps))) 69 | ## @knitr end 70 | out[[be]] = 71 | ## @knitr logistic.regression-run 72 | logistic.regression( 73 | testdata, 3, 2, 0.05) 74 | ## @knitr end 75 | ## max likelihood solution diverges for separable dataset, (-inf, inf) such as the above 76 | } 77 | stopifnot( 78 | isTRUE(all.equal(out[['local']], out[['hadoop']], tolerance = 1E-7))) 79 | -------------------------------------------------------------------------------- /pkg/tests/mapreduce.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution 
Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | library(quickcheck) 16 | library(rmr2) 17 | library(rhdfs) 18 | hdfs.init() 19 | 20 | kv.cmp = rmr2:::kv.cmp 21 | 22 | 23 | for (be in c("local", "hadoop")) { 24 | rmr.options(backend = be) 25 | 26 | ##from.dfs to.dfs 27 | 28 | ##native 29 | test( 30 | function(kv = rmr2:::rkeyval()) 31 | kv.cmp( 32 | kv, 33 | from.dfs(to.dfs(kv)))) 34 | 35 | ## csv 36 | ## no support for raw in csv 37 | 38 | test( 39 | function(df = rmr2:::rdata.frame.simple()) 40 | kv.cmp( 41 | keyval(NULL, df), 42 | from.dfs( 43 | to.dfs( 44 | keyval(NULL, df), 45 | format = "csv"), 46 | format = "csv"))) 47 | 48 | #json 49 | fmt = "json" 50 | test( 51 | function(df = rmr2:::rdata.frame.simple()) 52 | kv.cmp( 53 | keyval(1, df), 54 | from.dfs( 55 | to.dfs( 56 | keyval(1, df), 57 | format = fmt), 58 | format = make.input.format("json", key.class = "list", value.class = "data.frame")))) 59 | 60 | #sequence.typedbytes 61 | seq.tb.data.loss = 62 | function(l) 63 | rapply( 64 | l, 65 | function(x){ 66 | if(class(x) == "Date") x = unclass(x) 67 | if(is.factor(x)) x = as.character(x) 68 | if(class(x) == "raw" || length(x) == 1) x else as.list(x)}, 69 | how = "replace") 70 | 71 | fmt = "sequence.typedbytes" 72 | test( 73 | function(l = rlist()) { 74 | l = c(0, l) 75 | kv = keyval(seq.tb.data.loss(list(1)), seq.tb.data.loss(l)) 76 | kv.cmp( 77 | kv, 78 | from.dfs( 79 | to.dfs( 80 | kv, 81 | format = fmt), 82 | format = fmt))}) 83 | 84 | ##mapreduce 85 | 86 | ##simplest mapreduce, all default 87 | test( 88 | function(kv = rmr2:::rkeyval()) { 89 | if(rmr2:::length.keyval(kv) == 0) TRUE 90 | else { 91 | kv1 = from.dfs(mapreduce(input = to.dfs(kv))) 92 | kv.cmp(kv, kv1)}}) 93 | 94 | ##put in a reduce for good measure 95 | test( 96 | function(kv = rmr2:::rkeyval()) { 97 | if(rmr2:::length.keyval(kv) == 0) TRUE 98 | else { 99 | kv1 = 100 | from.dfs( 101 | mapreduce( 102 | input = to.dfs(kv), 103 | reduce = to.reduce(identity))) 104 | kv.cmp(kv, kv1)}}) 105 | 106 | ## csv 107 | test( 108 | function(df = rmr2:::rdata.frame.simple()) 109 | kv.cmp( 110 | keyval(NULL, df), 111 | from.dfs( 112 | mapreduce( 113 | to.dfs( 114 | keyval(NULL, df), 115 | format = "csv"), 116 | input.format = "csv", 117 | output.format = "csv"), 118 | format = "csv"))) 119 | 120 | #json 121 | # a more general test would be better for json but the subtleties of mapping R to to JSON are many 122 | fmt = "json" 123 | test( 124 | function(df = rmr2:::rdata.frame.simple()) 125 | kv.cmp( 126 | keyval(1, df), 127 | from.dfs( 128 | mapreduce( 129 | to.dfs( 130 | keyval(1, df), 131 | format = fmt), 132 | input.format = make.input.format("json", key.class = "list", value.class = "data.frame"), 133 | output.format = fmt), 134 | format = make.input.format("json", key.class = "list", value.class = "data.frame")))) 135 | 136 | #sequence.typedbytes 137 | fmt = "sequence.typedbytes" 138 | test( 139 | function(l = rlist()) { 140 | l = c(0, l) 141 | kv = 
keyval(seq.tb.data.loss(list(1)), seq.tb.data.loss(l)) 142 | l = c(0, l) 143 | kv.cmp( 144 | kv, 145 | from.dfs( 146 | mapreduce( 147 | to.dfs( 148 | kv, 149 | format = fmt), 150 | input.format = fmt, 151 | output.format = fmt), 152 | format = fmt))}) 153 | 154 | #avro 155 | pathname = ravro::AVRO_TOOLS 156 | if(.Platform$OS.type == "windows") { 157 | subfname = strsplit(pathname, ":") 158 | if(length(subfname[[1]]) > 1) 159 | { 160 | pathname = subfname[[1]][2] 161 | } 162 | pathname = gsub("\"","",pathname) 163 | pathname = shortPathName(pathname) 164 | pathname = gsub("\\\\","/",pathname)} 165 | Sys.setenv(AVRO_LIBS = pathname) 166 | 167 | test( 168 | function(df = rmr2:::rdata.frame.simple(nrow = c(min = 2))) { 169 | if(rmr.options("backend") == "local") TRUE 170 | else { 171 | names(df) = sub("\\.", "_", names(df)) 172 | tf1 = tempfile() 173 | ravro:::write.avro(df, tf1) 174 | tf2 = "/tmp/rmr2.test.avro" 175 | on.exit(hdfs.rm(tf2)) 176 | hdfs.put(tf1, tf2) 177 | kv.cmp( 178 | keyval(NULL, df), 179 | from.dfs( 180 | mapreduce( 181 | tf2, 182 | input.format = 183 | make.input.format( 184 | format = "avro", 185 | schema.file = tf1))))}}) 186 | 187 | #equijoin 188 | stopifnot( 189 | all( 190 | apply( 191 | values( 192 | from.dfs( 193 | equijoin( 194 | left.input = to.dfs(keyval(1:10, (1:10)^2)), 195 | right.input = to.dfs(keyval(1:10, (1:10)^3))))), 196 | 1, 197 | function(x) x[[1]]^(3/2) == x[[2]]))) 198 | } -------------------------------------------------------------------------------- /pkg/tests/naive-bayes.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # this is just an example, not part of a math library 16 | # matrix A_{ij} representation is a list of keyval(c(i,j), A_{ij}) 17 | # vectors are column matrices 18 | 19 | ## input format is keyval(NULL, list(x=c(x1, ..., xn), y = y) 20 | 21 | ##library(rmr2) 22 | 23 | ##naive.bayes = function(input, output = NULL) { 24 | ## mapreduce(input = input, output = output, 25 | ## map = function(k, v) c(lapply(1:length(v$x) function(i) keyval(c(i, v$x[i], v$y),1)), 26 | ## lapply), 27 | ## reduce = function(k, vv) keyval(k, sum(unlist(vv))), 28 | ## combiner = TRUE) 29 | ##} -------------------------------------------------------------------------------- /pkg/tests/wordcount.R: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | ## classic wordcount 17 | ## input can be any text file 18 | ## inspect output with from.dfs(output) -- this will produce an R list watch out with big datasets 19 | 20 | library(rmr2) 21 | 22 | ## @knitr wordcount-signature 23 | wordcount = 24 | function( 25 | input, 26 | output = NULL, 27 | pattern = " "){ 28 | ## @knitr wordcount-map 29 | wc.map = 30 | function(., lines) { 31 | keyval( 32 | unlist( 33 | strsplit( 34 | x = lines, 35 | split = pattern)), 36 | 1)} 37 | ## @knitr wordcount-reduce 38 | wc.reduce = 39 | function(word, counts ) { 40 | keyval(word, sum(counts))} 41 | ## @knitr wordcount-mapreduce 42 | mapreduce( 43 | input = input, 44 | output = output, 45 | map = wc.map, 46 | reduce = wc.reduce, 47 | combine = TRUE)} 48 | ## @knitr end 49 | 50 | text = capture.output(license()) 51 | out = list() 52 | for(be in c("local", "hadoop")) { 53 | rmr.options(backend = be) 54 | out[[be]] = from.dfs(wordcount(to.dfs(keyval(NULL, text)), pattern = " +"))} 55 | stopifnot(rmr2:::kv.cmp(out$hadoop, out$local)) 56 | -------------------------------------------------------------------------------- /pkg/tools/whirr/README: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Newly-released Whirr 0.7.1 fixes problems with the Java installation which 16 | # were caused by changes in licensing of Java by Oracle 17 | 18 | # To set up a hadoop/rmr cluster first launch the cluster 19 | 20 | $WHIRR_HOME/bin/whirr launch-cluster --config hadoop-ec2.properties 21 | #this config slightly tweaked from whirr distro, starts 5 large nodes 22 | 23 | # then install R and rmr 24 | $WHIRR_HOME/bin/whirr run-script --script rmr.sh --config hadoop-ec2.properties 25 | 26 | # remember to destroy when done. You are responsible for any AWS charges 27 | 28 | $WHIRR_HOME/bin/whirr destroy-cluster --config hadoop-ec2.properties 29 | 30 | # 'hadoop-ec2-centos.properties' and 'rmr-master-centos.sh' can be used with 31 | # the above steps to create a CentOS 4.6-based cluster using a RightScale AMI 32 | -------------------------------------------------------------------------------- /pkg/tools/whirr/hadoop-ec2-centos.properties: -------------------------------------------------------------------------------- 1 | # hadoop-ec2-centos.properties by Jeffrey Breen, based on hadoop-ec2.properties 2 | # 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # 18 | 19 | # 20 | # Hadoop Cluster on AWS EC2 21 | # 22 | # With changes to make installing RHadoop/rmr easier 23 | 24 | # Read the Configuration Guide for more info: 25 | # http://incubator.apache.org/whirr/configuration-guide.html 26 | 27 | # Change the cluster name here 28 | whirr.cluster-name=hadoop-ec2-centos-${env:USER} 29 | 30 | # Change the number of machines in the cluster here 31 | whirr.instance-templates=1 hadoop-namenode+hadoop-jobtracker,5 hadoop-datanode+hadoop-tasktracker 32 | 33 | # Uncomment out these lines to run CDH 34 | # You need cdh3 because of the streaming combiner backport 35 | whirr.hadoop.install-function=install_cdh_hadoop 36 | whirr.hadoop.configure-function=configure_cdh_hadoop 37 | # just-released Whirr 0.7.1 fixes java: 38 | whirr.java.install-function=install_oab_java 39 | 40 | # For EC2 set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables. 41 | whirr.provider=aws-ec2 42 | whirr.identity=${env:AWS_ACCESS_KEY_ID} 43 | whirr.credential=${env:AWS_SECRET_ACCESS_KEY} 44 | 45 | # The size of the instance to use. See http://aws.amazon.com/ec2/instance-types/ 46 | whirr.hardware-id=m1.large 47 | # whirr.hardware-id=c1.xlarge 48 | 49 | # select recent, 64-bit CentOS 4.6 AMI from RightScale 50 | whirr.image-id=us-east-1/ami-49e32320 51 | 52 | # If you choose a different location, make sure whirr.image-id is updated too 53 | whirr.location-id=us-east-1 54 | 55 | # You can also specify the spot instance price 56 | # http://aws.amazon.com/ec2/spot-instances/ 57 | # whirr.aws-ec2-spot-price=0.15 58 | 59 | # By default use the user system SSH keys. Override them here. 60 | # whirr.private-key-file=${sys:user.home}/.ssh/id_rsa 61 | # whirr.public-key-file=${whirr.private-key-file}.pub 62 | 63 | # Expert: override Hadoop properties by setting properties with the prefix 64 | # hadoop-common, hadoop-hdfs, hadoop-mapreduce to set Common, HDFS, MapReduce 65 | # site properties, respectively. The prefix is removed by Whirr, so that for 66 | # example, setting 67 | # hadoop-common.fs.trash.interval=1440 68 | # will result in fs.trash.interval being set to 1440 in core-site.xml. 69 | 70 | # Expert: specify the version of Hadoop to install. 71 | #whirr.hadoop.version=0.20.2 72 | #whirr.hadoop.tarball.url=http://archive.apache.org/dist/hadoop/core/hadoop-${whirr.hadoop.version}/hadoop-${whirr.hadoop.version}.tar.gz 73 | -------------------------------------------------------------------------------- /pkg/tools/whirr/hadoop-ec2.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 19 | # Hadoop Cluster on AWS EC2 20 | # 21 | # With changes to make installing RHadoop/rmr easier 22 | 23 | # Read the Configuration Guide for more info: 24 | # http://incubator.apache.org/whirr/configuration-guide.html 25 | 26 | # Change the cluster name here 27 | whirr.cluster-name=hadoop-ec2-${env:USER} 28 | 29 | # Change the number of machines in the cluster here 30 | whirr.instance-templates=1 hadoop-namenode+hadoop-jobtracker,5 hadoop-datanode+hadoop-tasktracker 31 | 32 | 33 | # Uncomment out these lines to run CDH 34 | # You need cdh3 because of several backported patches from 0.21 35 | # Apache Hadoop 1.0.2 is also expected to work 36 | whirr.env.repo=cdh4 37 | whirr.hadoop.install-function=install_cdh_hadoop 38 | whirr.hadoop.configure-function=configure_cdh_hadoop 39 | 40 | # just-released Whirr 0.7.1 fixes java: 41 | # whirr.java.install-function=install_oab_java 42 | 43 | # For EC2 set AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables. 44 | whirr.provider=aws-ec2 45 | whirr.identity=${env:AWS_ACCESS_KEY_ID} 46 | whirr.credential=${env:AWS_SECRET_ACCESS_KEY} 47 | 48 | # The size of the instance to use. See http://aws.amazon.com/ec2/instance-types/ 49 | whirr.hardware-id=m1.xlarge 50 | # Ubuntu 12.04 LTS Precise. See http://alestic.com/ 51 | whirr.image-id=us-east-1/ami-eafa5883 52 | 53 | # If you choose a different location, make sure whirr.image-id is updated too 54 | whirr.location-id=us-east-1 55 | 56 | # You can also specify the spot instance price 57 | # http://aws.amazon.com/ec2/spot-instances/ 58 | # whirr.aws-ec2-spot-price=0.15 59 | 60 | # By default use the user system SSH keys. Override them here. 61 | # whirr.private-key-file=${sys:user.home}/.ssh/id_rsa 62 | # whirr.public-key-file=${whirr.private-key-file}.pub 63 | 64 | # Expert: override Hadoop properties by setting properties with the prefix 65 | # hadoop-common, hadoop-hdfs, hadoop-mapreduce to set Common, HDFS, MapReduce 66 | # site properties, respectively. The prefix is removed by Whirr, so that for 67 | # example, setting 68 | # hadoop-common.fs.trash.interval=1440 69 | # will result in fs.trash.interval being set to 1440 in core-site.xml. 70 | hadoop-common.io.compression.codec.lzo.class=com.hadoop.compression.lzo.LzoCodec 71 | 72 | # Expert: specify the version of Hadoop to install. 
73 | #whirr.hadoop.version=0.20.2 74 | #whirr.hadoop.tarball.url=http://archive.apache.org/dist/hadoop/core/hadoop-${whirr.hadoop.version}/hadoop-${whirr.hadoop.version}.tar.gz 75 | 76 | hadoop-env.JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk-amd64 77 | 78 | -------------------------------------------------------------------------------- /pkg/tools/whirr/lzo-centos.sh: -------------------------------------------------------------------------------- 1 | #get a fresher ant than yum would 2 | wget --no-check-certificate http://apache.cs.utah.edu//ant/binaries/apache-ant-1.8.4-bin.tar.gz 3 | tar xvzf apache-ant-1.8.4-bin.tar.gz 4 | export ANT_HOME=/home/users/antonio/apache-ant-1.8.4 5 | sudo yum install -y xml-commons-apis 6 | sudo yum install -y gcc 7 | sudo yum install -y lzo-devel 8 | sudo yum install -y make 9 | wget --no-check-certificate https://github.com/kambatla/hadoop-lzo/tarball/master 10 | tar xvzf master 11 | cd kambatla* 12 | export CFLAGS=-m64 13 | export CXXFLAGS=-m64 14 | ant package 15 | sudo cp build/hadoop-lzo-*.jar /usr/lib/hadoop/lib/ 16 | sudo mkdir -p /usr/lib/hadoop/lib/native/ 17 | sudo cp build/native/Linux-*-*/lib/libgplcompression.* /usr/lib/hadoop/lib/native/ 18 | 19 | sudo /etc/init.d//hadoop-0.20-tasktracker restart 20 | -------------------------------------------------------------------------------- /pkg/tools/whirr/lzo-ubuntu.sh: -------------------------------------------------------------------------------- 1 | wget https://github.com/kambatla/hadoop-lzo/tarball/master 2 | tar xvzf master 3 | cd kambatla* 4 | export CFLAGS=-m64 5 | export CXXFLAGS=-m64 6 | sudo apt-get install -y ant 7 | sudo apt-get install -y gcc 8 | sudo apt-get install -y liblzo2-dev 9 | sudo apt-get install -y make 10 | export JAVA_HOME=/usr/lib/jvm/java-1.6.0-openjdk-amd64 11 | ant package 12 | sudo cp build/hadoop-lzo-*.jar /usr/lib/hadoop/lib/ 13 | sudo mkdir -p /usr/lib/hadoop/lib/native/ 14 | sudo cp build/native/Linux-*-*/lib/libgplcompression.* /usr/lib/hadoop/lib/native/ 15 | 16 | sudo /etc/init.d//hadoop-*-tasktracker restart 17 | -------------------------------------------------------------------------------- /pkg/tools/whirr/rmr-dev.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2011 Revolution Analytics 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
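# The script below bootstraps a node against the dev branch: it installs R and
# Rcpp from the Ubuntu package repositories, pulls the remaining CRAN
# dependencies (RJSONIO, digest, functional, stringr, plyr), downloads and
# installs the RHadoop dev-branch tarball with R CMD INSTALL, and appends
# HADOOP_CMD and HADOOP_STREAMING definitions to /etc/profile so that rmr2 can
# locate the hadoop binary and the streaming jar.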
14 | 15 | branch=dev 16 | sudo apt-get install -y r-base-core 17 | sudo apt-get install -y r-cran-rcpp 18 | sudo R --no-save << EOF 19 | install.packages(c('RJSONIO', 'digest', 'functional', 'stringr', 'plyr'), repos = "http://cran.us.r-project.org", INSTALL_opts=c('--byte-compile') ) 20 | EOF 21 | 22 | rm -rf $branch RHadoop 23 | curl -L https://github.com/RevolutionAnalytics/RHadoop/tarball/$branch | tar zx 24 | mv RevolutionAnalytics-RHadoop* RHadoop 25 | sudo R CMD INSTALL --byte-compile RHadoop/rmr2/pkg/ 26 | 27 | sudo su << EOF1 28 | echo ' 29 | export HADOOP_CMD=/usr/bin/hadoop 30 | export HADOOP_STREAMING=/usr/lib/hadoop-0.20-mapreduce/contrib/streaming/hadoop-streaming-2.0.0-mr1-cdh4.1.2.jar 31 | ' >> /etc/profile 32 | 33 | EOF1 34 | -------------------------------------------------------------------------------- /pkg/tools/whirr/rmr-master-centos.sh: -------------------------------------------------------------------------------- 1 | # rmr-master-centos.sh by Jeffrey Breen, based on rmr-master.sh 2 | # original copyright attached: 3 | # 4 | # Copyright 2011 Revolution Analytics 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | sudo yum -y --enablerepo=epel install R R-devel 19 | 20 | sudo R --no-save << EOF 21 | install.packages(c('Rcpp', 'RJSONIO', 'itertools', 'digest'), repos="http://cran.revolutionanalytics.com", INSTALL_opts=c('--byte-compile') ) 22 | EOF 23 | 24 | # install the rmr package from RHadoop: 25 | 26 | branch=master 27 | 28 | wget --no-check-certificate https://github.com/RevolutionAnalytics/RHadoop/tarball/$branch -O - | tar zx 29 | mv RevolutionAnalytics-RHadoop* RHadoop 30 | sudo R CMD INSTALL --byte-compile RHadoop/rmr/pkg/ 31 | 32 | sudo su << EOF1 33 | cat >> /etc/profile << [...] -------------------------------------------------------------------------------- /pkg/tools/whirr/rmr-master.sh: -------------------------------------------------------------------------------- [...] ' >> /etc/profile 31 | 32 | EOF1 33 | --------------------------------------------------------------------------------
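# A minimal smoke test (not part of the repository) that can be run on the
# master node after any of the bootstrap scripts above. It assumes the script
# completed and that /etc/profile now exports HADOOP_CMD and HADOOP_STREAMING.
source /etc/profile
echo "HADOOP_CMD=$HADOOP_CMD"
echo "HADOOP_STREAMING=$HADOOP_STREAMING"
R --no-save <<'RSCRIPT'
library(rmr2)                        # fails here if the package did not install
rmr.options(backend = "hadoop")
# round-trip a small vector through HDFS, as the package tests do
stopifnot(all(1:10 == sort(values(from.dfs(to.dfs(1:10))))))
RSCRIPT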