├── .evergreen ├── compile.sh ├── config.yml └── run-tests.sh ├── .gitignore ├── CONTRIBUTORS.md ├── History.md ├── README.md ├── build.gradle ├── clusterConfigs ├── core-site.xml ├── hdfs-site.xml ├── hive-site.xml └── mapred-site.xml ├── config ├── checkstyle-lite.xml ├── checkstyle.xml └── findbugs-exclude.xml ├── core └── src │ ├── main │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ ├── BSONFileInputFormat.java │ │ ├── BSONFileOutputFormat.java │ │ ├── BSONPathFilter.java │ │ ├── GridFSInputFormat.java │ │ ├── MongoConfig.java │ │ ├── MongoInputFormat.java │ │ ├── MongoOutput.java │ │ ├── MongoOutputFormat.java │ │ ├── input │ │ ├── BSONFileRecordReader.java │ │ ├── BSONFileSplit.java │ │ ├── GridFSSplit.java │ │ ├── MongoInputSplit.java │ │ └── MongoRecordReader.java │ │ ├── io │ │ ├── BSONWritable.java │ │ ├── BSONWritableComparator.java │ │ ├── DataOutputOutputStreamAdapter.java │ │ ├── MongoUpdateWritable.java │ │ └── MongoWritableTypes.java │ │ ├── mapred │ │ ├── BSONFileInputFormat.java │ │ ├── BSONFileOutputFormat.java │ │ ├── MongoInputFormat.java │ │ ├── MongoOutputFormat.java │ │ ├── input │ │ │ ├── BSONFileRecordReader.java │ │ │ ├── BSONFileSplit.java │ │ │ └── MongoRecordReader.java │ │ └── output │ │ │ ├── BSONFileRecordWriter.java │ │ │ ├── MongoOutputCommitter.java │ │ │ └── MongoRecordWriter.java │ │ ├── output │ │ ├── BSONFileRecordWriter.java │ │ ├── MongoOutputCommitter.java │ │ └── MongoRecordWriter.java │ │ ├── splitter │ │ ├── BSONSplitter.java │ │ ├── MongoCollectionSplitter.java │ │ ├── MongoPaginatingSplitter.java │ │ ├── MongoSplitter.java │ │ ├── MongoSplitterFactory.java │ │ ├── MultiCollectionSplitBuilder.java │ │ ├── MultiMongoCollectionSplitter.java │ │ ├── SampleSplitter.java │ │ ├── ShardChunkMongoSplitter.java │ │ ├── ShardMongoSplitter.java │ │ ├── SingleMongoSplitter.java │ │ ├── SplitFailedException.java │ │ └── StandaloneMongoSplitter.java │ │ └── util │ │ ├── BSONComparator.java │ │ ├── BSONLoader.java │ │ ├── CompatUtils.java │ │ ├── MapredMongoConfigUtil.java │ │ ├── MongoClientURIBuilder.java │ │ ├── MongoConfigUtil.java │ │ ├── MongoPathRetriever.java │ │ ├── MongoTool.java │ │ └── SplitFriendlyDBCallback.java │ └── test │ ├── java │ └── com │ │ └── mongodb │ │ └── hadoop │ │ ├── BSONFileInputFormatTest.java │ │ ├── GridFSInputFormatTest.java │ │ ├── HadoopVersionFilter.java │ │ ├── MongoConfigUnitTests.java │ │ ├── MongoOutputCommitterTest.java │ │ ├── bookstore │ │ ├── BookstoreConfig.java │ │ ├── BookstoreTest.java │ │ ├── TagsMapper.java │ │ └── TagsReducer.java │ │ ├── io │ │ ├── BSONWritableTest.java │ │ ├── MongoInputSplitTest.java │ │ └── MongoUpdateWritableTest.java │ │ ├── mapred │ │ └── BSONFileInputFormatTest.java │ │ ├── splitter │ │ ├── BSONFileRecordReaderTest.java │ │ ├── BSONSplitterTest.java │ │ ├── MongoPaginatingSplitterTest.java │ │ ├── MongoRecordReaderTest.java │ │ ├── MongoSplitterFactoryTest.java │ │ ├── MongoSplitterTestUtils.java │ │ ├── SampleSplitterTest.java │ │ ├── ShardChunkMongoSplitterTest.java │ │ └── StandaloneMongoSplitterTest.java │ │ ├── testutils │ │ ├── BaseHadoopTest.java │ │ └── MapReduceJob.java │ │ └── util │ │ └── MongoConfigUtilTest.java │ └── resources │ └── bookstore-dump │ ├── inventory.bson │ ├── orders.bson │ ├── publishers.bson │ └── system.indexes.bson ├── examples ├── elastic-mapreduce │ ├── emr-bootstrap.sh │ ├── run_emr_job.sh │ └── update_s3.sh ├── enron │ ├── hive │ │ └── hive_enron.q │ ├── pig │ │ └── pig_enron.pig │ ├── run_job.sh │ ├── spark │ │ └── src │ │ │ └── main │ 
│ │ └── java │ │ │ └── com │ │ │ └── mongodb │ │ │ └── spark │ │ │ └── examples │ │ │ └── enron │ │ │ ├── DataframeExample.java │ │ │ ├── Enron.java │ │ │ └── Message.java │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── examples │ │ └── enron │ │ ├── EnronMail.java │ │ ├── EnronMailMapper.java │ │ ├── EnronMailReducer.java │ │ └── MailPair.java ├── sensors │ ├── run_job.sh │ ├── src │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── mongodb │ │ │ └── hadoop │ │ │ └── examples │ │ │ └── sensors │ │ │ ├── DeviceMapper.java │ │ │ ├── DeviceReducer.java │ │ │ ├── Devices.java │ │ │ ├── LogCombiner.java │ │ │ ├── LogMapper.java │ │ │ ├── LogReducer.java │ │ │ ├── Logs.java │ │ │ └── SensorDataGenerator.java │ └── testdata_generator.js ├── shakespeare │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── examples │ │ └── shakespeare │ │ ├── PrepareShakespeare.java │ │ └── Shakespeare.java └── treasury_yield │ ├── pig │ └── pig_mongo_test.pig │ ├── run_job.sh │ └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── mongodb │ │ │ └── hadoop │ │ │ └── examples │ │ │ └── treasury │ │ │ ├── TreasuryYieldMapper.java │ │ │ ├── TreasuryYieldMulti.java │ │ │ ├── TreasuryYieldReducer.java │ │ │ ├── TreasuryYieldUpdateReducer.java │ │ │ └── TreasuryYieldXMLConfig.java │ └── resources │ │ ├── commons-logging.properties │ │ ├── mongo-defaults.xml │ │ ├── parse_yield_historical.py │ │ ├── yield_historical_Jan90_Sep10.xml │ │ └── yield_historical_in.json │ └── test │ ├── java │ └── com │ │ └── mongodb │ │ └── hadoop │ │ ├── BaseShardedTest.java │ │ ├── JarFinder.java │ │ ├── StreamingJob.java │ │ ├── TestSharded.java │ │ ├── TestStandalone.java │ │ ├── TestStreaming.java │ │ └── TreasuryTest.java │ └── resources │ ├── commons-logging.properties │ ├── log4j.properties │ └── yarn-site.xml ├── flume └── src │ └── main │ └── java │ └── com │ └── mongodb │ └── flume │ ├── BucketedMongoDBSink.java │ └── MongoDBSink.java ├── gradle ├── functions.gradle ├── hadoop.gradle ├── maven-deployment.gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── hive └── src │ ├── main │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── hive │ │ ├── BSONSerDe.java │ │ ├── MongoStorageHandler.java │ │ ├── input │ │ └── HiveMongoInputFormat.java │ │ └── output │ │ ├── HiveBSONFileOutputFormat.java │ │ └── HiveMongoOutputFormat.java │ └── test │ ├── java │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── hive │ │ ├── BSONSerDeTest.java │ │ ├── HiveMappingTest.java │ │ ├── HiveQueryTest.java │ │ ├── HiveTest.java │ │ ├── MongoStorageHandlerTest.java │ │ ├── Results.java │ │ ├── TablePropertiesTest.java │ │ ├── TestBsonToHive.java │ │ ├── TestHDFSToMongoDB.java │ │ ├── TestHDFSToMongoDBWithOptions.java │ │ └── input │ │ └── HiveMongoInputFormatTest.java │ └── resources │ ├── core-site.xml │ ├── hivetable.properties │ ├── log4j.properties │ ├── test_data.txt │ ├── users.bson │ └── yarn-site.xml ├── mongo-defaults.xml ├── pig └── src │ ├── main │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── pig │ │ ├── BSONLoader.java │ │ ├── BSONStorage.java │ │ ├── JSONPigReplace.java │ │ ├── MongoInsertStorage.java │ │ ├── MongoLoader.java │ │ ├── MongoStorage.java │ │ ├── MongoStorageOptions.java │ │ ├── MongoUpdateStorage.java │ │ └── udf │ │ ├── ByteArrayTypeEvalFunc.java │ │ ├── GenMaxKey.java │ │ ├── GenMinKey.java │ │ ├── ObjectIdToSeconds.java │ │ ├── ToBinary.java │ │ ├── ToDBRef.java │ │ ├── 
ToObjectId.java │ │ └── types │ │ ├── PigBoxedBSONValue.java │ │ ├── PigBoxedBinary.java │ │ ├── PigBoxedDBRef.java │ │ ├── PigBoxedMaxKey.java │ │ ├── PigBoxedMinKey.java │ │ └── PigBoxedObjectId.java │ └── test │ ├── java │ ├── com │ │ └── mongodb │ │ │ └── hadoop │ │ │ └── pig │ │ │ ├── BSONStorageTest.java │ │ │ ├── JSONPigReplaceTest.java │ │ │ ├── MongoLoaderTest.java │ │ │ ├── MongoStorageOptionsTest.java │ │ │ ├── MongoStorageTest.java │ │ │ ├── PigTest.java │ │ │ └── UDFTest.java │ └── helpers │ │ └── TOBAG.java │ └── resources │ ├── dump │ └── test │ │ ├── persons_info.bson │ │ └── persons_info.metadata.json │ └── pig │ ├── bson_schemaless.pig │ ├── bson_test.pig │ ├── datestest.pig │ ├── ensure_index.pig │ ├── ensure_index_2.pig │ ├── genminmaxkeys.pig │ ├── oidtoseconds.pig │ ├── pig_uuid.pig │ ├── projection.pig │ ├── replace_mus.pig │ ├── schemaless.pig │ ├── tobinary.pig │ ├── todbref.pig │ ├── toobjectid.pig │ ├── udfschemaless.pig │ ├── update_age_alabis_mus.pig │ └── update_simple_mus.pig ├── settings.gradle ├── spark └── src │ └── main │ ├── java │ └── com │ │ └── mongodb │ │ └── spark │ │ ├── PySparkBSONFileInputFormat.java │ │ ├── PySparkBSONFileOutputFormat.java │ │ ├── PySparkMongoInputFormat.java │ │ ├── PySparkMongoOutputFormat.java │ │ └── pickle │ │ ├── BSONPickler.java │ │ ├── BSONValueBox.java │ │ ├── BinaryConstructor.java │ │ ├── CalendarTransformer.java │ │ ├── CodeConstructor.java │ │ ├── DBRefConstructor.java │ │ ├── Int64Constructor.java │ │ ├── MaxKeyConstructor.java │ │ ├── MinKeyConstructor.java │ │ ├── ObjectIdConstructor.java │ │ ├── RegexConstructor.java │ │ ├── RegisterConstructors.java │ │ ├── RegisterPickles.java │ │ └── TimestampConstructor.java │ ├── python │ ├── README.rst │ ├── pymongo_spark.py │ ├── setup.py │ └── test │ │ ├── __init__.py │ │ └── test_pymongo_spark.py │ └── scala │ └── com │ └── mongodb │ └── spark │ └── pickle │ └── NoopConverter.scala ├── streaming ├── examples │ ├── enron │ │ ├── enron_map.js │ │ ├── enron_map.py │ │ ├── enron_map.rb │ │ ├── enron_reduce.js │ │ ├── enron_reduce.py │ │ ├── enron_reduce.rb │ │ ├── run_enron.sh │ │ ├── run_enron_js.sh │ │ └── run_enron_rb.sh │ ├── treasury │ │ ├── mapper.py │ │ ├── mapper.rb │ │ ├── mapper_kv.py │ │ ├── mapper_kv.rb │ │ ├── reducer.py │ │ ├── reducer.rb │ │ ├── reducer_kv.py │ │ ├── reducer_kv.rb │ │ ├── run_treas_kv_py.sh │ │ ├── run_treas_kv_rb.sh │ │ ├── run_treas_py.sh │ │ └── run_treas_rb.sh │ └── twitter │ │ ├── README.md │ │ ├── run_twit_py.sh │ │ ├── run_twit_rb.sh │ │ ├── twit_hashtag_map.py │ │ ├── twit_hashtag_reduce.py │ │ ├── twit_map.py │ │ ├── twit_map.rb │ │ ├── twit_reduce.py │ │ └── twit_reduce.rb ├── language_support │ ├── js │ │ ├── node_mongo_hadoop.js │ │ └── package.json │ ├── python │ │ ├── README.rst │ │ ├── distribute_setup.py │ │ ├── pymongo_hadoop │ │ │ ├── __init__.py │ │ │ ├── input.py │ │ │ ├── mapper.py │ │ │ ├── output.py │ │ │ └── reducer.py │ │ ├── setup.py │ │ └── test_install.py │ └── ruby │ │ ├── README.md │ │ ├── bin │ │ └── mongo-hadoop │ │ ├── lib │ │ ├── mongo-hadoop.rb │ │ └── mongo-hadoop │ │ │ ├── input.rb │ │ │ ├── mapper.rb │ │ │ ├── output.rb │ │ │ └── reducer.rb │ │ ├── mongo-hadoop.gemspec │ │ └── templates │ │ ├── mapper.tt │ │ ├── reducer.tt │ │ └── runner.tt └── src │ ├── main │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── streaming │ │ ├── MongoOutput.java │ │ └── io │ │ ├── MongoIdentifierResolver.java │ │ ├── MongoInputWriter.java │ │ ├── MongoOutputReader.java │ │ ├── MongoUpdateInputWriter.java │ │ └── 
MongoUpdateOutputReader.java │ └── test │ └── java │ └── com │ └── mongodb │ └── hadoop │ └── streaming │ └── io │ └── MongoUpdateOutputReaderTest.java └── test.sh /.evergreen/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o xtrace # Write all commands first to stderr 4 | set -o errexit # Exit the script with error if any of the commands fail 5 | 6 | ############################################ 7 | # Main Program # 8 | ############################################ 9 | 10 | # We always compile with the latest version of java 11 | export JAVA_HOME="/opt/java/jdk8" 12 | ./gradlew -version 13 | ./gradlew -PxmlReports.enabled=true --info -x test clean check jar testClasses javadoc 14 | -------------------------------------------------------------------------------- /.evergreen/run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o xtrace # Write all commands first to stderr 4 | set -o errexit # Exit the script with error if any of the commands fail 5 | 6 | # Supported/used environment variables: 7 | # MONGODB_BINARIES The location of the MongoDB binaries, e.g. /usr/local/bin 8 | # HADOOP_VERSION Sets the version of Hadoop to be used. 9 | # AUTH Set to enable authentication. Values are: "auth" / "noauth" (default) 10 | # JDK Set the version of java to be used. Java versions can be set from the java toolchain /opt/java 11 | # "jdk5", "jdk6", "jdk7", "jdk8" 12 | 13 | MONGODB_BINARIES=${MONGODB_BINARIES:-} 14 | AUTH=${AUTH:-noauth} 15 | JDK=${JDK:-jdk} 16 | PROJECT_DIRECTORY=${PROJECT_DIRECTORY:-} 17 | 18 | export HADOOP_VERSION=${HADOOP_VERSION:-2.7.2} 19 | export HADOOP_PREFIX=$PROJECT_DIRECTORY/hadoop-binaries/hadoop-$HADOOP_VERSION 20 | export HADOOP_HOME=$HADOOP_PREFIX 21 | export HADOOP_USER_CLASSPATH_FIRST=true 22 | export HIVE_HOME=$PROJECT_DIRECTORY/hadoop-binaries/apache-hive-1.2.1-bin 23 | 24 | export JAVA_HOME="/opt/java/${JDK}" 25 | 26 | ./gradlew -version 27 | ./gradlew -Dmongodb_bin_dir=${MONGODB_BINARIES} -Dmongodb_option=${AUTH} -DHADOOP_VERSION=${HADOOP_VERSION} --stacktrace jar testsJar test cleanHadoop -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *#* 2 | *.crc 3 | *.gem 4 | *.iml 5 | *.ipr 6 | *.iws 7 | *.log 8 | *.out 9 | *.pyc 10 | *.splits 11 | *.swp 12 | *~ 13 | .DS* 14 | .classpath 15 | .gradle 16 | .idea 17 | .project 18 | TempStatsStore/ 19 | WDI_GDF_Data.csv 20 | bin/hadoop-all.sh 21 | build 22 | examples/data 23 | logs 24 | out 25 | metastore_db/ 26 | streaming/language_support/python/dist/ 27 | streaming/language_support/python/pymongo_hadoop.egg-info/ 28 | tags 29 | target 30 | test-*.out 31 | hadoop-binaries 32 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | * Mike O'Brien (mikeo@10gen.com) 2 | * Brendan McAdams brendan@10gen.com 3 | * Eliot Horowitz erh@10gen.com 4 | * Ryan Nitz ryan@10gen.com 5 | * Russell Jurney (@rjurney) (Lots of significant Pig improvements) 6 | * Sarthak Dudhara sarthak.83@gmail.com (BSONWritable comparable interface) 7 | * Priya Manda priyakanth024@gmail.com (Test Harness Code) 8 | * Rushin Shah rushin10@gmail.com (Test Harness Code) 9 | * Joseph Shraibman jks@iname.com (Sharded Input Splits) 10 | * Sumin Xia xiasumin1984@gmail.com 
(Sharded Input Splits) 11 | * Jeremy Karn 12 | * bpfoster 13 | * Ross Lawley 14 | * Carsten Hufe 15 | * Asya Kamsky 16 | * Thomas Millar 17 | * Justin Lee 18 | * Luke Lovett 19 | * Mariano Semelman 20 | * Jordan Gwyn 21 | * Powerrr 22 | -------------------------------------------------------------------------------- /clusterConfigs/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 26 | 27 | fs.default.name 28 | hdfs://localhost:8020 29 | 30 | 31 | hadoop.tmp.dir 32 | @HADOOP_BINARIES@/hadoop-tmpdir 33 | 34 | 35 | -------------------------------------------------------------------------------- /clusterConfigs/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | mapred.job.tracker 22 | localhost:8021 23 | 24 | 25 | hadoop.tmp.dir 26 | @HADOOP_BINARIES@/hadoop-tmpdir 27 | 28 | 29 | -------------------------------------------------------------------------------- /clusterConfigs/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | javax.jdo.option.ConnectionURL 6 | jdbc:derby:;databaseName=@HIVE_HOME@/metastore_db;create=true 7 | 8 | 9 | hive.metastore.warehouse.dir 10 | hdfs://localhost:8020/user/hive/warehouse 11 | 12 | 13 | dfs.datanode.address 14 | 50010 15 | 16 | 17 | hive.aux.jars.path 18 | @HIVE_HOME@/lib/mongo-hadoop-hive.jar 19 | 20 | 21 | -------------------------------------------------------------------------------- /clusterConfigs/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | mapred.job.tracker 22 | localhost:8021 23 | 24 | 30 | -------------------------------------------------------------------------------- /config/findbugs-exclude.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/BSONFileOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mongodb.hadoop; 18 | 19 | import com.mongodb.hadoop.output.BSONFileRecordWriter; 20 | import com.mongodb.hadoop.splitter.BSONSplitter; 21 | import com.mongodb.hadoop.util.MongoConfigUtil; 22 | import org.apache.commons.logging.Log; 23 | import org.apache.commons.logging.LogFactory; 24 | import org.apache.hadoop.fs.FSDataOutputStream; 25 | import org.apache.hadoop.fs.FileSystem; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.mapreduce.RecordWriter; 28 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 29 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 30 | 31 | import java.io.IOException; 32 | 33 | public class BSONFileOutputFormat extends FileOutputFormat { 34 | 35 | @Override 36 | public RecordWriter getRecordWriter(final TaskAttemptContext context) throws IOException { 37 | // Open data output stream 38 | 39 | Path outPath = getDefaultWorkFile(context, ".bson"); 40 | LOG.info("output going into " + outPath); 41 | 42 | FileSystem fs = outPath.getFileSystem(context.getConfiguration()); 43 | FSDataOutputStream outFile = fs.create(outPath); 44 | 45 | FSDataOutputStream splitFile = null; 46 | if (MongoConfigUtil.getBSONOutputBuildSplits(context.getConfiguration())) { 47 | Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits"); 48 | splitFile = fs.create(splitPath); 49 | } 50 | 51 | long splitSize = BSONSplitter.getSplitSize(context.getConfiguration(), null); 52 | return new BSONFileRecordWriter(outFile, splitFile, splitSize); 53 | } 54 | 55 | private static final Log LOG = LogFactory.getLog(BSONFileOutputFormat.class); 56 | } 57 | 58 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/BSONPathFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mongodb.hadoop; 17 | 18 | import org.apache.commons.logging.Log; 19 | import org.apache.commons.logging.LogFactory; 20 | import org.apache.hadoop.fs.Path; 21 | import org.apache.hadoop.fs.PathFilter; 22 | 23 | public class BSONPathFilter implements PathFilter { 24 | 25 | private static final Log LOG = LogFactory.getLog(BSONPathFilter.class); 26 | 27 | public BSONPathFilter() { 28 | LOG.info("path filter constructed."); 29 | } 30 | 31 | public boolean accept(final Path path) { 32 | String pathName = path.getName().toLowerCase(); 33 | boolean acceptable = pathName.endsWith(".bson") && !pathName.startsWith("."); 34 | LOG.info(path.toString() + " returning " + acceptable); 35 | return acceptable; 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/MongoOutput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop; 18 | 19 | // Mongo 20 | 21 | import com.mongodb.DBObject; 22 | 23 | public interface MongoOutput { 24 | void appendAsKey(DBObject o); 25 | 26 | void appendAsValue(DBObject o); 27 | } 28 | 29 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/MongoOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop; 18 | 19 | import com.mongodb.hadoop.output.MongoOutputCommitter; 20 | import com.mongodb.hadoop.output.MongoRecordWriter; 21 | import com.mongodb.hadoop.util.MongoConfigUtil; 22 | import org.apache.hadoop.mapreduce.JobContext; 23 | import org.apache.hadoop.mapreduce.OutputCommitter; 24 | import org.apache.hadoop.mapreduce.OutputFormat; 25 | import org.apache.hadoop.mapreduce.RecordWriter; 26 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 27 | 28 | import java.io.IOException; 29 | 30 | public class MongoOutputFormat extends OutputFormat { 31 | public void checkOutputSpecs(final JobContext context) throws IOException { 32 | if (MongoConfigUtil.getOutputURIs(context.getConfiguration()).isEmpty()) { 33 | throw new IOException("No output URI is specified. You must set mongo.output.uri."); 34 | } 35 | } 36 | 37 | public OutputCommitter getOutputCommitter(final TaskAttemptContext context) { 38 | return new MongoOutputCommitter(); 39 | } 40 | 41 | /** 42 | * Get the record writer that points to the output collection. 43 | */ 44 | public RecordWriter getRecordWriter(final TaskAttemptContext context) { 45 | return new MongoRecordWriter( 46 | MongoConfigUtil.getOutputCollection(context.getConfiguration()), 47 | context); 48 | } 49 | 50 | public MongoOutputFormat() {} 51 | 52 | /** 53 | * @param updateKeys ignored 54 | * @param multiUpdate ignored 55 | * @deprecated this constructor is no longer useful. 
56 | */ 57 | @Deprecated 58 | public MongoOutputFormat(final String[] updateKeys, final boolean multiUpdate) { 59 | this(); 60 | } 61 | } -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/input/BSONFileSplit.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.input; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.io.IOException; 10 | 11 | public class BSONFileSplit extends FileSplit { 12 | 13 | // CHECKSTYLE:OFF 14 | protected String keyField = "_id"; 15 | // CHECKSTYLE:ON 16 | 17 | public BSONFileSplit(final Path file, final long start, final long length, 18 | final String[] hosts) { 19 | super(file, start, length, hosts); 20 | } 21 | 22 | public BSONFileSplit() { this(null, 0, 0, null); } 23 | 24 | public String getKeyField() { 25 | return keyField; 26 | } 27 | 28 | public void setKeyField(final String keyField) { 29 | this.keyField = keyField; 30 | } 31 | 32 | @Override 33 | public void write(final DataOutput out) throws IOException { 34 | super.write(out); 35 | Text.writeString(out, getKeyField()); 36 | } 37 | 38 | @Override 39 | public void readFields(final DataInput in) throws IOException { 40 | super.readFields(in); 41 | setKeyField(Text.readString(in)); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/io/BSONWritableComparator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mongodb.hadoop.io; 18 | 19 | import com.mongodb.hadoop.util.BSONComparator; 20 | import org.apache.commons.logging.Log; 21 | import org.apache.commons.logging.LogFactory; 22 | import org.apache.hadoop.io.WritableComparable; 23 | import org.apache.hadoop.io.WritableComparator; 24 | 25 | public class BSONWritableComparator extends WritableComparator { 26 | 27 | private static final Log LOG = LogFactory.getLog(BSONWritableComparator.class); 28 | 29 | public BSONWritableComparator() { 30 | super(BSONWritable.class, true); 31 | } 32 | 33 | protected BSONWritableComparator(final Class keyClass) { 34 | super(keyClass, true); 35 | } 36 | 37 | protected BSONWritableComparator(final Class keyClass, final boolean createInstances) { 38 | super(keyClass, createInstances); 39 | } 40 | 41 | public int compare(final WritableComparable a, final WritableComparable b) { 42 | if (a instanceof BSONWritable && b instanceof BSONWritable) { 43 | return BSONComparator.getInstance().compare(((BSONWritable) a).getDoc(), ((BSONWritable) b).getDoc()); 44 | } else { 45 | //return super.compare( a, b ); 46 | return -1; 47 | } 48 | } 49 | 50 | public int compare(final byte[] b1, final int s1, final int l1, final byte[] b2, final int s2, final int l2) { 51 | //return BSONComparator.getInstance().compare(b1, s1, l1, b2, s2, l2); 52 | return super.compare(b1, s1, l1, b2, s2, l2); 53 | } 54 | 55 | public int compare(final Object a, final Object b) { 56 | return BSONComparator.getInstance().compare(((BSONWritable) a).getDoc(), ((BSONWritable) b).getDoc()); 57 | //return super.compare( a, b ); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/io/DataOutputOutputStreamAdapter.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.io; 2 | 3 | import java.io.DataOutput; 4 | import java.io.IOException; 5 | import java.io.OutputStream; 6 | 7 | class DataOutputOutputStreamAdapter extends OutputStream { 8 | private final DataOutput dataOutput; 9 | 10 | DataOutputOutputStreamAdapter(final DataOutput dataOutput) { 11 | this.dataOutput = dataOutput; 12 | } 13 | 14 | @Override 15 | public void write(final int b) throws IOException { 16 | dataOutput.write(b); 17 | } 18 | 19 | @Override 20 | public void write(final byte[] b) throws IOException { 21 | dataOutput.write(b); 22 | } 23 | 24 | @Override 25 | public void write(final byte[] b, final int off, final int len) throws IOException { 26 | dataOutput.write(b, off, len); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/io/MongoWritableTypes.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.io; 2 | 3 | // CHECKSTYLE:OFF 4 | public interface MongoWritableTypes { 5 | int BSON_WRITABLE = 0; 6 | int MONGO_UPDATE_WRITABLE = 1; 7 | } 8 | // CHECKSTYLE:ON 9 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/BSONFileOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop.mapred; 18 | 19 | import com.mongodb.hadoop.mapred.output.BSONFileRecordWriter; 20 | import com.mongodb.hadoop.splitter.BSONSplitter; 21 | import com.mongodb.hadoop.util.MongoConfigUtil; 22 | import org.apache.commons.logging.Log; 23 | import org.apache.commons.logging.LogFactory; 24 | import org.apache.hadoop.fs.FSDataOutputStream; 25 | import org.apache.hadoop.fs.FileSystem; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.mapred.FileOutputFormat; 28 | import org.apache.hadoop.mapred.JobConf; 29 | import org.apache.hadoop.mapred.RecordWriter; 30 | import org.apache.hadoop.util.Progressable; 31 | 32 | import java.io.IOException; 33 | 34 | public class BSONFileOutputFormat extends FileOutputFormat { 35 | 36 | public RecordWriter getRecordWriter(final FileSystem ignored, final JobConf job, final String name, 37 | final Progressable progress) throws IOException { 38 | Path outPath = getDefaultWorkFile(job, name, ".bson"); 39 | LOG.info("output going into " + outPath); 40 | 41 | FileSystem fs = outPath.getFileSystem(job); 42 | FSDataOutputStream outFile = fs.create(outPath); 43 | 44 | FSDataOutputStream splitFile = null; 45 | if (MongoConfigUtil.getBSONOutputBuildSplits(job)) { 46 | Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits"); 47 | splitFile = fs.create(splitPath); 48 | } 49 | 50 | long splitSize = BSONSplitter.getSplitSize(job, null); 51 | 52 | return new BSONFileRecordWriter(outFile, splitFile, splitSize); 53 | } 54 | 55 | public static Path getDefaultWorkFile(final JobConf conf, final String name, final String extension) { 56 | return new Path(getWorkOutputPath(conf), getUniqueName(conf, name) + extension); 57 | } 58 | 59 | private static final Log LOG = LogFactory.getLog(BSONFileOutputFormat.class); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/MongoOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mongodb.hadoop.mapred; 18 | 19 | import com.mongodb.hadoop.mapred.output.MongoRecordWriter; 20 | import com.mongodb.hadoop.util.MongoConfigUtil; 21 | import org.apache.hadoop.fs.FileSystem; 22 | import org.apache.hadoop.mapred.JobConf; 23 | import org.apache.hadoop.mapred.OutputFormat; 24 | import org.apache.hadoop.mapred.RecordWriter; 25 | import org.apache.hadoop.util.Progressable; 26 | 27 | import java.io.IOException; 28 | 29 | @SuppressWarnings("deprecation") 30 | public class MongoOutputFormat implements OutputFormat { 31 | public MongoOutputFormat() { 32 | } 33 | 34 | @Override 35 | public void checkOutputSpecs(final FileSystem ignored, final JobConf job) throws IOException { 36 | if (MongoConfigUtil.getOutputURIs(job).isEmpty()) { 37 | throw new IOException("No output URI is specified. You must set mongo.output.uri."); 38 | } 39 | } 40 | 41 | @Override 42 | public RecordWriter getRecordWriter( 43 | final FileSystem ignored, final JobConf job, final String name, 44 | final Progressable progress) { 45 | return new MongoRecordWriter(job); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/input/BSONFileSplit.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.mapred.input; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapred.FileSplit; 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.io.IOException; 10 | 11 | public class BSONFileSplit extends FileSplit { 12 | 13 | // CHECKSTYLE:OFF 14 | protected String keyField = "_id"; 15 | // CHECKSTYLE:ON 16 | 17 | 18 | public BSONFileSplit(final Path file, final long start, final long 19 | length, final String[] hosts) { 20 | super(file, start, length, hosts); 21 | } 22 | 23 | public BSONFileSplit() { this(null, 0, 0, null); } 24 | 25 | public String getKeyField() { return keyField; } 26 | 27 | public void setKeyField(final String keyField) { 28 | this.keyField = keyField; 29 | } 30 | 31 | @Override 32 | public void write(final DataOutput out) throws IOException { 33 | super.write(out); 34 | Text.writeString(out, getKeyField()); 35 | } 36 | 37 | @Override 38 | public void readFields(final DataInput in) throws IOException { 39 | super.readFields(in); 40 | setKeyField(Text.readString(in)); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/output/BSONFileRecordWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mongodb.hadoop.mapred.output; 18 | 19 | import org.apache.hadoop.fs.FSDataOutputStream; 20 | import org.apache.hadoop.mapred.RecordWriter; 21 | import org.apache.hadoop.mapred.Reporter; 22 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 23 | 24 | import java.io.IOException; 25 | 26 | 27 | public class BSONFileRecordWriter extends com.mongodb.hadoop.output.BSONFileRecordWriter implements RecordWriter { 28 | 29 | public BSONFileRecordWriter(final FSDataOutputStream outFile, final FSDataOutputStream splitFile, final long splitSize) { 30 | super(outFile, splitFile, splitSize); 31 | } 32 | 33 | public void close(final Reporter reporter) throws IOException { 34 | this.close((TaskAttemptContext) null); 35 | } 36 | 37 | } 38 | 39 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/output/MongoOutputCommitter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | 18 | package com.mongodb.hadoop.mapred.output; 19 | 20 | import org.apache.hadoop.mapred.JobContext; 21 | import org.apache.hadoop.mapred.OutputCommitter; 22 | import org.apache.hadoop.mapred.TaskAttemptContext; 23 | 24 | import java.io.IOException; 25 | 26 | public class MongoOutputCommitter extends OutputCommitter { 27 | private final com.mongodb.hadoop.output.MongoOutputCommitter delegate; 28 | 29 | public MongoOutputCommitter() { 30 | delegate = new com.mongodb.hadoop.output.MongoOutputCommitter(); 31 | } 32 | 33 | @Override 34 | public void abortTask(final TaskAttemptContext taskContext) 35 | throws IOException { 36 | delegate.abortTask(taskContext); 37 | } 38 | 39 | @Override 40 | public void commitTask(final TaskAttemptContext taskContext) 41 | throws IOException { 42 | delegate.commitTask(taskContext); 43 | } 44 | 45 | @Override 46 | public boolean needsTaskCommit(final TaskAttemptContext taskContext) 47 | throws IOException { 48 | return delegate.needsTaskCommit(taskContext); 49 | } 50 | 51 | @Override 52 | public void setupJob(final JobContext jobContext) { 53 | delegate.setupJob(jobContext); 54 | } 55 | 56 | @Override 57 | public void setupTask(final TaskAttemptContext taskContext) { 58 | delegate.setupTask(taskContext); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/output/MongoRecordWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | 18 | package com.mongodb.hadoop.mapred.output; 19 | 20 | import com.mongodb.hadoop.util.CompatUtils; 21 | import com.mongodb.hadoop.util.MongoConfigUtil; 22 | import org.apache.hadoop.mapred.JobConf; 23 | import org.apache.hadoop.mapred.RecordWriter; 24 | import org.apache.hadoop.mapred.Reporter; 25 | 26 | public class MongoRecordWriter 27 | extends com.mongodb.hadoop.output.MongoRecordWriter 28 | implements RecordWriter { 29 | 30 | /** 31 | * Create a new MongoRecordWriter. 32 | * @param conf the job configuration 33 | */ 34 | public MongoRecordWriter(final JobConf conf) { 35 | super( 36 | MongoConfigUtil.getOutputCollection(conf), 37 | CompatUtils.getTaskAttemptContext(conf, conf.get("mapred.task.id"))); 38 | } 39 | 40 | @Override 41 | public void close(final Reporter reporter) { 42 | super.close(null); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/splitter/MongoSplitter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop.splitter; 18 | 19 | import com.mongodb.hadoop.input.MongoInputSplit; 20 | import com.mongodb.hadoop.util.MongoConfigUtil; 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.mapreduce.InputSplit; 23 | 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | 27 | public abstract class MongoSplitter { 28 | 29 | private Configuration configuration; 30 | 31 | public MongoSplitter() { 32 | } 33 | 34 | public MongoSplitter(final Configuration configuration) { 35 | setConfiguration(configuration); 36 | } 37 | 38 | public void setConfiguration(final Configuration conf) { 39 | configuration = conf; 40 | } 41 | 42 | public abstract List calculateSplits() throws SplitFailedException; 43 | 44 | public Configuration getConfiguration() { 45 | return configuration; 46 | } 47 | 48 | /** 49 | * Get a list of nonempty input splits only. 
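     * Each split's cursor is probed with hasNext(): splits whose cursors
     * return no documents are dropped, and the MongoDB client backing each
     * empty cursor is closed via MongoConfigUtil.close().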
50 | * 51 | * @param splits a list of input splits 52 | * @return a new list of nonempty input splits 53 | */ 54 | public static List filterEmptySplits( 55 | final List splits) { 56 | List results = new ArrayList(splits.size()); 57 | for (InputSplit split : splits) { 58 | MongoInputSplit mis = (MongoInputSplit) split; 59 | if (mis.getCursor().hasNext()) { 60 | results.add(mis); 61 | } else { 62 | MongoConfigUtil.close( 63 | mis.getCursor().getCollection().getDB().getMongo()); 64 | } 65 | } 66 | return results; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/splitter/SingleMongoSplitter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop.splitter; 18 | 19 | import com.mongodb.MongoClientURI; 20 | import com.mongodb.hadoop.input.MongoInputSplit; 21 | import com.mongodb.hadoop.util.MongoConfigUtil; 22 | import org.apache.commons.logging.Log; 23 | import org.apache.commons.logging.LogFactory; 24 | import org.apache.hadoop.conf.Configuration; 25 | import org.apache.hadoop.mapreduce.InputSplit; 26 | 27 | import java.util.Collections; 28 | import java.util.List; 29 | 30 | import static java.lang.String.format; 31 | 32 | /* This implementation of MongoSplitter does not actually 33 | * do any splitting, it will just create a single input split 34 | * which represents the entire data set within a collection. 35 | */ 36 | public class SingleMongoSplitter extends MongoCollectionSplitter { 37 | 38 | private static final Log LOG = LogFactory.getLog(SingleMongoSplitter.class); 39 | 40 | //Create a single split which consists of a single 41 | //a query over the entire collection. 
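    // In practice a splitter implementation like this one is usually not
    // instantiated directly: MongoInputFormat obtains a splitter from
    // MongoSplitterFactory, driven by the job configuration (for example a
    // "mongo.splitter.class"-style property; the exact property name is an
    // assumption here), hands it the Configuration via setConfiguration(),
    // and then asks it to calculateSplits().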
42 | 43 | 44 | public SingleMongoSplitter() { 45 | } 46 | 47 | public SingleMongoSplitter(final Configuration conf) { 48 | super(conf); 49 | } 50 | 51 | @Override 52 | public List calculateSplits() { 53 | if (LOG.isDebugEnabled()) { 54 | MongoClientURI inputURI = 55 | MongoConfigUtil.getInputURI(getConfiguration()); 56 | LOG.debug(format("SingleMongoSplitter calculating splits for namespace: %s.%s; hosts: %s", 57 | inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts())); 58 | } 59 | return Collections.singletonList( 60 | (InputSplit) new MongoInputSplit(getConfiguration())); 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/splitter/SplitFailedException.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.splitter; 2 | 3 | public class SplitFailedException extends Exception { 4 | 5 | public SplitFailedException(final String message) { 6 | super(message); 7 | } 8 | 9 | public SplitFailedException(final String message, final Throwable cause) { 10 | super(message, cause); 11 | } 12 | } 13 | 14 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/util/MongoPathRetriever.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.util; 2 | 3 | import org.bson.BSONObject; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * Utility class providing a mechanism for retrieving data nested within 9 | * a MongoDB document. 10 | */ 11 | public final class MongoPathRetriever { 12 | 13 | private MongoPathRetriever() {} 14 | 15 | /** 16 | * Returns the Object stored at a given path within a MongoDB 17 | * document. Returns null if the path is not found. 18 | * 19 | * @param document MongoDB document in which to search. 20 | * @param path Dot-separated path to look up. 21 | * @return the Object stored at the path within the document. 22 | */ 23 | public static Object get(final BSONObject document, final String path) { 24 | String[] parts = path.split("\\."); 25 | Object o = document; 26 | for (String part : parts) { 27 | if (null == o) { 28 | return null; 29 | } else if (o instanceof List) { 30 | try { 31 | int index = Integer.parseInt(part); 32 | if (((List) o).size() > index && index >= 0) { 33 | o = ((List) o).get(index); 34 | } else { 35 | return null; 36 | } 37 | } catch (NumberFormatException e) { 38 | return null; 39 | } 40 | } else if (o instanceof BSONObject) { 41 | o = ((BSONObject) o).get(part); 42 | } else { 43 | // Hit a leaf before finding the key we were looking for. 44 | return null; 45 | } 46 | } 47 | return o; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/util/SplitFriendlyDBCallback.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2010, 2011 10gen, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 5 | * the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 10 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 11 | * specific language governing permissions and limitations under the License. 12 | */ 13 | 14 | package com.mongodb.hadoop.util; 15 | 16 | import com.mongodb.DBCallback; 17 | import com.mongodb.DBCallbackFactory; 18 | import com.mongodb.DBCollection; 19 | import com.mongodb.DefaultDBCallback; 20 | 21 | public class SplitFriendlyDBCallback extends DefaultDBCallback { 22 | 23 | static final class MinKey { 24 | } 25 | 26 | static final class MaxKey { 27 | } 28 | 29 | static class SplitFriendlyFactory implements DBCallbackFactory { 30 | public DBCallback create(final DBCollection collection) { 31 | return new DefaultDBCallback(collection); 32 | } 33 | } 34 | 35 | public static final DBCallbackFactory FACTORY = new SplitFriendlyFactory(); 36 | public static final MinKey MIN_KEY_TYPE = new MinKey(); 37 | public static final MaxKey MAX_KEY_TYPE = new MaxKey(); 38 | 39 | public SplitFriendlyDBCallback(final DBCollection coll) { 40 | super(coll); 41 | } 42 | 43 | @Override 44 | public void gotMinKey(final String name) { 45 | cur().put(name, MAX_KEY_TYPE); 46 | } 47 | 48 | @Override 49 | public void gotMaxKey(final String name) { 50 | cur().put(name, MAX_KEY_TYPE); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/BSONFileInputFormatTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import com.mongodb.hadoop.mapred.BSONFileInputFormat; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.apache.hadoop.mapred.FileSplit; 7 | import org.apache.hadoop.mapred.JobConf; 8 | import org.apache.hadoop.mapred.RecordReader; 9 | import org.junit.Test; 10 | 11 | import java.io.File; 12 | import java.io.IOException; 13 | 14 | import static com.mongodb.hadoop.testutils.BaseHadoopTest.EXAMPLE_DATA_HOME; 15 | import static org.junit.Assert.assertEquals; 16 | 17 | public class BSONFileInputFormatTest { 18 | 19 | @Test 20 | public void enronEmails() throws IOException { 21 | BSONFileInputFormat inputFormat = new BSONFileInputFormat(); 22 | JobConf job = new JobConf(); 23 | String inputDirectory = 24 | new File(EXAMPLE_DATA_HOME, "/dump/enron_mail/messages.bson") 25 | .getAbsoluteFile().toURI().toString(); 26 | // Hadoop 2.X 27 | job.set("mapreduce.input.fileinputformat.inputdir", inputDirectory); 28 | // Hadoop 1.2.X 29 | job.set("mapred.input.dir", inputDirectory); 30 | FileSplit[] splits = inputFormat.getSplits(job, 5); 31 | int count = 0; 32 | BSONWritable writable = new BSONWritable(); 33 | for (FileSplit split : splits) { 34 | RecordReader recordReader = inputFormat.getRecordReader(split, job, null); 35 | while (recordReader.next(null, writable)) { 36 | count++; 37 | } 38 | } 39 | assertEquals("There are 501513 messages in the enron corpus", 501513, count); 40 | } 41 | } -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/HadoopVersionFilter.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop; 2 | 3 | import com.mongodb.hadoop.testutils.BaseHadoopTest; 4 | 5 | import java.io.File; 6 | import java.io.FileFilter; 7 | 8 | public class HadoopVersionFilter implements FileFilter { 9 | private final boolean findTestJar; 10 | private static final String PROD_FORMAT = String.format("-%s.jar", BaseHadoopTest.PROJECT_VERSION); 11 | private 
static final String TEST_FORMAT = String.format("%s-tests.jar", BaseHadoopTest.PROJECT_VERSION); 12 | 13 | public HadoopVersionFilter() { 14 | this(false); 15 | } 16 | 17 | public HadoopVersionFilter(final boolean findTestJar) { 18 | this.findTestJar = findTestJar; 19 | } 20 | 21 | @Override 22 | public boolean accept(final File pathname) { 23 | return findTestJar ? pathname.getName().endsWith(TEST_FORMAT) : pathname.getName().endsWith(PROD_FORMAT); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/MongoConfigUnitTests.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop; 18 | 19 | // Hadoop 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.junit.Test; 23 | 24 | import static org.junit.Assert.assertNotNull; 25 | 26 | // JUnit 27 | 28 | /** 29 | * The mongo config unit tests. 30 | */ 31 | public final class MongoConfigUnitTests { 32 | 33 | @Test 34 | public void testConstructor() { 35 | assertNotNull(new MongoConfig(new Configuration(false))); 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/bookstore/BookstoreConfig.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.bookstore; 2 | 3 | import com.mongodb.hadoop.MongoConfig; 4 | import com.mongodb.hadoop.MongoInputFormat; 5 | import com.mongodb.hadoop.MongoOutputFormat; 6 | import com.mongodb.hadoop.io.BSONWritable; 7 | import com.mongodb.hadoop.io.MongoUpdateWritable; 8 | import com.mongodb.hadoop.util.MongoTool; 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.util.ToolRunner; 12 | 13 | public class BookstoreConfig extends MongoTool { 14 | public BookstoreConfig() { 15 | this(new Configuration()); 16 | } 17 | 18 | public BookstoreConfig(final Configuration configuration) { 19 | MongoConfig config = new MongoConfig(configuration); 20 | setConf(configuration); 21 | 22 | config.setInputFormat(MongoInputFormat.class); 23 | 24 | config.setMapper(TagsMapper.class); 25 | config.setMapperOutputKey(Text.class); 26 | config.setMapperOutputValue(BSONWritable.class); 27 | 28 | config.setReducer(TagsReducer.class); 29 | config.setOutputKey(Text.class); 30 | config.setOutputValue(MongoUpdateWritable.class); 31 | config.setOutputFormat(MongoOutputFormat.class); 32 | } 33 | 34 | public static void main(final String[] pArgs) throws Exception { 35 | System.exit(ToolRunner.run(new BookstoreConfig(), pArgs)); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/bookstore/TagsMapper.java: 
-------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.bookstore; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapred.JobConf; 6 | import org.apache.hadoop.mapred.OutputCollector; 7 | import org.apache.hadoop.mapred.Reporter; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.bson.BSONObject; 10 | import org.bson.types.BasicBSONList; 11 | 12 | import java.io.IOException; 13 | 14 | public class TagsMapper extends Mapper 15 | implements org.apache.hadoop.mapred.Mapper { 17 | 18 | private BSONWritable writable; 19 | 20 | public TagsMapper() { 21 | super(); 22 | writable = new BSONWritable(); 23 | } 24 | 25 | @Override 26 | protected void map(final Object key, final BSONObject value, final Context 27 | context) throws IOException, InterruptedException { 28 | BasicBSONList tags = (BasicBSONList) value.get("tags"); 29 | Text text = new Text(); 30 | value.removeField("tags"); 31 | for (Object tag : tags) { 32 | text.set((String) tag); 33 | writable.setDoc(value); 34 | context.write(text, writable); 35 | } 36 | } 37 | 38 | @Override 39 | public void map(final Object key, final BSONWritable value, final 40 | OutputCollector output, 41 | final Reporter reporter) throws IOException { 42 | BasicBSONList tags = (BasicBSONList) value.getDoc().get("tags"); 43 | Text text = new Text(); 44 | value.getDoc().removeField("tags"); 45 | for (Object tag : tags) { 46 | text.set((String) tag); 47 | output.collect(text, value); 48 | } 49 | } 50 | 51 | @Override 52 | public void configure(final JobConf job) { 53 | 54 | } 55 | 56 | @Override 57 | public void close() throws IOException { 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/bookstore/TagsReducer.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.bookstore; 2 | 3 | import com.mongodb.BasicDBObject; 4 | import com.mongodb.hadoop.io.BSONWritable; 5 | import com.mongodb.hadoop.io.MongoUpdateWritable; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapred.JobConf; 9 | import org.apache.hadoop.mapred.OutputCollector; 10 | import org.apache.hadoop.mapred.Reporter; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.bson.BSONObject; 13 | import org.bson.BasicBSONObject; 14 | 15 | import java.io.IOException; 16 | import java.util.ArrayList; 17 | import java.util.Iterator; 18 | 19 | public class TagsReducer extends Reducer 20 | implements org.apache.hadoop.mapred.Reducer { 21 | 22 | private MongoUpdateWritable reduceResult; 23 | 24 | public TagsReducer() { 25 | super(); 26 | reduceResult = new MongoUpdateWritable(); 27 | } 28 | 29 | @Override 30 | protected void reduce(final Text key, final Iterable values, final Context context) 31 | throws IOException, InterruptedException { 32 | 33 | BasicDBObject query = new BasicDBObject("_id", key.toString()); 34 | ArrayList books = new ArrayList(); 35 | for (BSONWritable val : values) { 36 | books.add(val.getDoc()); 37 | } 38 | 39 | BasicBSONObject update = new BasicBSONObject("$set", new BasicBSONObject("books", books)); 40 | reduceResult.setQuery(query); 41 | reduceResult.setModifiers(update); 42 | context.write(null, reduceResult); 43 | } 44 | 45 | @Override 46 | public void reduce(final Text key, final Iterator values, final OutputCollector output, 47 | 
final Reporter reporter) throws IOException { 48 | BasicDBObject query = new BasicDBObject("_id", key.toString()); 49 | ArrayList books = new ArrayList(); 50 | while (values.hasNext()) { 51 | books.add(values.next().getDoc()); 52 | } 53 | 54 | BasicBSONObject update = new BasicBSONObject("$set", new BasicBSONObject("books", books)); 55 | reduceResult.setQuery(query); 56 | reduceResult.setModifiers(update); 57 | output.collect(null, reduceResult); 58 | } 59 | 60 | @Override 61 | public void close() throws IOException { 62 | } 63 | 64 | @Override 65 | public void configure(final JobConf job) { 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/io/MongoInputSplitTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.io; 2 | 3 | import com.mongodb.hadoop.input.MongoInputSplit; 4 | import com.mongodb.hadoop.util.MongoConfigUtil; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.junit.Test; 7 | 8 | import static junit.framework.TestCase.assertEquals; 9 | 10 | public class MongoInputSplitTest { 11 | 12 | @Test 13 | public void testConstructor() { 14 | Configuration conf = new Configuration(); 15 | MongoConfigUtil.setFields(conf, "{\"field\": 1}"); 16 | MongoConfigUtil.setAuthURI(conf, "mongodb://auth"); 17 | MongoConfigUtil.setInputURI(conf, "mongodb://input"); 18 | MongoConfigUtil.setInputKey(conf, "field"); 19 | MongoConfigUtil.setMaxSplitKey(conf, "{\"field\": 1e9}"); 20 | MongoConfigUtil.setMinSplitKey(conf, "{\"field\": -1e9}"); 21 | MongoConfigUtil.setNoTimeout(conf, true); 22 | MongoConfigUtil.setQuery(conf, "{\"foo\": 42}"); 23 | MongoConfigUtil.setSort(conf, "{\"foo\": -1}"); 24 | MongoConfigUtil.setSkip(conf, 10); 25 | 26 | MongoInputSplit mis = new MongoInputSplit(conf); 27 | 28 | assertEquals(MongoConfigUtil.getFields(conf), mis.getFields()); 29 | assertEquals(MongoConfigUtil.getAuthURI(conf), mis.getAuthURI()); 30 | assertEquals(MongoConfigUtil.getInputURI(conf), mis.getInputURI()); 31 | assertEquals(MongoConfigUtil.getInputKey(conf), mis.getKeyField()); 32 | assertEquals(MongoConfigUtil.getMaxSplitKey(conf), mis.getMax()); 33 | assertEquals(MongoConfigUtil.getMinSplitKey(conf), mis.getMin()); 34 | assertEquals(MongoConfigUtil.isNoTimeout(conf), mis.getNoTimeout()); 35 | assertEquals(MongoConfigUtil.getQuery(conf), mis.getQuery()); 36 | assertEquals(MongoConfigUtil.getSort(conf), mis.getSort()); 37 | assertEquals(MongoConfigUtil.getLimit(conf), (int) mis.getLimit()); 38 | assertEquals(MongoConfigUtil.getSkip(conf), (int) mis.getSkip()); 39 | 40 | MongoInputSplit mis2 = new MongoInputSplit(mis); 41 | assertEquals(mis, mis2); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/mapred/BSONFileInputFormatTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.mapred; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.NullWritable; 5 | import org.apache.hadoop.mapred.FileSplit; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.apache.hadoop.mapred.RecordReader; 8 | import org.junit.Test; 9 | 10 | import java.io.File; 11 | import java.io.IOException; 12 | 13 | import static com.mongodb.hadoop.testutils.BaseHadoopTest.EXAMPLE_DATA_HOME; 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class BSONFileInputFormatTest 
{ 17 | 18 | @Test 19 | public void enronEmails() throws IOException { 20 | BSONFileInputFormat inputFormat = new BSONFileInputFormat(); 21 | JobConf job = new JobConf(); 22 | String inputDirectory = 23 | new File(EXAMPLE_DATA_HOME, "/dump/enron_mail/messages.bson") 24 | .getAbsoluteFile().toURI().toString(); 25 | // Hadoop 2.X 26 | job.set("mapreduce.input.fileinputformat.inputdir", inputDirectory); 27 | // Hadoop 1.2.X 28 | job.set("mapred.input.dir", inputDirectory); 29 | FileSplit[] splits = inputFormat.getSplits(job, 5); 30 | int count = 0; 31 | BSONWritable writable = new BSONWritable(); 32 | for (FileSplit split : splits) { 33 | RecordReader recordReader = inputFormat.getRecordReader(split, job, null); 34 | while (recordReader.next(null, writable)) { 35 | count++; 36 | } 37 | } 38 | assertEquals("There are 501513 messages in the enron corpus", 501513, count); 39 | } 40 | } -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/splitter/BSONFileRecordReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.splitter; 2 | 3 | import com.mongodb.hadoop.input.BSONFileRecordReader; 4 | import com.mongodb.hadoop.input.BSONFileSplit; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.bson.types.ObjectId; 8 | import org.junit.Test; 9 | 10 | import java.io.File; 11 | import java.net.URI; 12 | 13 | import static org.junit.Assert.assertEquals; 14 | import static org.junit.Assert.assertTrue; 15 | 16 | public class BSONFileRecordReaderTest { 17 | 18 | @Test 19 | public void testGetCurrentKey() throws Exception { 20 | URI path = BSONFileRecordReaderTest.class.getResource( 21 | "/bookstore-dump/inventory.bson").toURI(); 22 | File file = new File(path); 23 | 24 | // Default case: "_id" is used as inputKey. 25 | BSONFileRecordReader reader = new BSONFileRecordReader(); 26 | BSONFileSplit split = new BSONFileSplit(new Path(path), 0, 27 | file.length(), 28 | new String[0]); 29 | JobConf conf = new JobConf(); 30 | reader.init(split, conf); 31 | assertTrue(reader.nextKeyValue()); 32 | assertEquals(reader.getCurrentKey(), 33 | new ObjectId("4d2a6084c6237b412fcd5597")); 34 | 35 | // Use a nested field as inputKey. 36 | reader = new BSONFileRecordReader(); 37 | split = new BSONFileSplit(new Path(path), 0, 38 | file.length(), 39 | new String[0]); 40 | split.setKeyField("price.msrp"); 41 | reader.init(split, conf); 42 | assertTrue(reader.nextKeyValue()); 43 | assertEquals(reader.getCurrentKey(), 33); 44 | 45 | // Use a key within an array as the inputKey. 
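// A dotted index like "tags.0" selects the first element of the "tags" array,
// so the key read below resolves to the string "html5".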
46 | reader = new BSONFileRecordReader(); 47 | split = new BSONFileSplit(new Path(path), 0, 48 | file.length(), 49 | new String[0]); 50 | split.setKeyField("tags.0"); 51 | reader.init(split, conf); 52 | assertTrue(reader.nextKeyValue()); 53 | assertEquals(reader.getCurrentKey(), "html5"); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/splitter/MongoRecordReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.splitter; 2 | 3 | import com.mongodb.BasicDBList; 4 | import com.mongodb.BasicDBObject; 5 | import com.mongodb.DBCollection; 6 | import com.mongodb.MongoClient; 7 | import com.mongodb.MongoClientURI; 8 | import com.mongodb.hadoop.input.MongoInputSplit; 9 | import com.mongodb.hadoop.input.MongoRecordReader; 10 | import com.mongodb.hadoop.util.MongoClientURIBuilder; 11 | import org.bson.BasicBSONObject; 12 | import org.junit.Test; 13 | 14 | import static org.junit.Assert.assertEquals; 15 | import static org.junit.Assert.assertTrue; 16 | 17 | public class MongoRecordReaderTest { 18 | 19 | @Test 20 | public void testGetCurrentKey() throws Exception { 21 | MongoClient client = new MongoClient("localhost", 27017); 22 | MongoClientURI uri = new MongoClientURIBuilder() 23 | .collection("mongo_hadoop", "mongo_record_reader_test") 24 | .build(); 25 | DBCollection collection = client.getDB(uri.getDatabase()).getCollection(uri.getCollection()); 26 | collection.drop(); 27 | BasicDBList colors = new BasicDBList(){ 28 | { 29 | add(new BasicBSONObject("red", 255)); 30 | add(new BasicBSONObject("blue", 255)); 31 | add(new BasicBSONObject("green", 0)); 32 | } 33 | }; 34 | collection.insert( 35 | new BasicDBObject("_id", 0) 36 | .append("address", 37 | new BasicDBObject("street", "foo street")) 38 | .append("colors", colors) 39 | ); 40 | 41 | // Default case: "_id" is used as inputKey. 42 | MongoInputSplit split = new MongoInputSplit(); 43 | split.setInputURI(uri); 44 | MongoRecordReader reader = new MongoRecordReader(split); 45 | assertTrue(reader.nextKeyValue()); 46 | assertEquals(reader.getCurrentKey(), 0); 47 | 48 | // Use a nested field as inputKey. 49 | split = new MongoInputSplit(); 50 | split.setInputURI(uri); 51 | split.setKeyField("address.street"); 52 | reader = new MongoRecordReader(split); 53 | assertTrue(reader.nextKeyValue()); 54 | assertEquals(reader.getCurrentKey(), "foo street"); 55 | 56 | // Use a key within an array as the inputKey. 57 | split = new MongoInputSplit(); 58 | split.setInputURI(uri); 59 | split.setKeyField("colors.1"); 60 | reader = new MongoRecordReader(split); 61 | assertTrue(reader.nextKeyValue()); 62 | assertEquals(reader.getCurrentKey(), new BasicBSONObject("blue", 255)); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/splitter/MongoSplitterTestUtils.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.splitter; 2 | 3 | import com.mongodb.DBObject; 4 | import com.mongodb.hadoop.input.MongoInputSplit; 5 | import org.apache.hadoop.mapreduce.InputSplit; 6 | 7 | import java.util.List; 8 | 9 | import static org.junit.Assert.assertEquals; 10 | 11 | /** 12 | * Utilities for testing Splitter classes that produce MongoInputSplits. 
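 * Note: assertSplitsCount re-opens each split's cursor, so it assumes a reachable
 * mongod that still contains the data covered by the splits.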
13 | */ 14 | public final class MongoSplitterTestUtils { 15 | 16 | private MongoSplitterTestUtils() {} 17 | 18 | /** 19 | * Assert that a split has the expected bounds using a range query. 20 | * @param split an instance of MongoInputSplit 21 | * @param min the min bound 22 | * @param max the max bound 23 | */ 24 | public static void assertSplitRange( 25 | final MongoInputSplit split, final Integer min, final Integer max) { 26 | DBObject queryObj = (DBObject) split.getQuery().get("_id"); 27 | assertEquals(min, queryObj.get("$gte")); 28 | assertEquals(max, queryObj.get("$lt")); 29 | } 30 | 31 | /** 32 | * Assert that a list of splits has the expected overall count. 33 | * @param expected the expected count 34 | * @param splits a list of MongoInputSplits 35 | */ 36 | public static void assertSplitsCount( 37 | final long expected, final List splits) { 38 | int splitTotal = 0; 39 | for (InputSplit split : splits) { 40 | // Cursors have been closed; create a copy of the MongoInputSplit. 41 | MongoInputSplit mis = new MongoInputSplit((MongoInputSplit) split); 42 | // Query doesn't play nice with min/max, so use itcount for test. 43 | splitTotal += mis.getCursor().itcount(); 44 | } 45 | assertEquals(expected, splitTotal); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/splitter/ShardChunkMongoSplitterTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.splitter; 2 | 3 | import com.mongodb.BasicDBObjectBuilder; 4 | import com.mongodb.DBObject; 5 | import com.mongodb.MongoClientURI; 6 | import com.mongodb.hadoop.testutils.BaseHadoopTest; 7 | import com.mongodb.hadoop.util.MongoConfigUtil; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.mapreduce.InputSplit; 10 | import org.bson.types.MaxKey; 11 | import org.bson.types.MinKey; 12 | import org.junit.Test; 13 | 14 | import java.io.IOException; 15 | import java.util.ArrayList; 16 | import java.util.Arrays; 17 | import java.util.HashMap; 18 | import java.util.List; 19 | import java.util.Map; 20 | 21 | import static org.junit.Assert.assertEquals; 22 | 23 | public class ShardChunkMongoSplitterTest extends BaseHadoopTest { 24 | 25 | private ShardChunkMongoSplitter splitter = new ShardChunkMongoSplitter(); 26 | 27 | private DBObject createChunk(final String key, final Object min, final Object max, final String shardName) { 28 | return new BasicDBObjectBuilder() 29 | .push("min").add(key, min).pop() 30 | .push("max").add(key, max).pop() 31 | .append("shard", shardName).get(); 32 | } 33 | 34 | @Test 35 | public void testSplitPreferredLocations() 36 | throws SplitFailedException, IOException, InterruptedException { 37 | // Create list of chunks. 38 | List chunksList = new ArrayList(){{ 39 | add(createChunk("i", new MinKey(), 500, "sh01")); 40 | add(createChunk("i", 500, new MaxKey(), "sh02")); 41 | }}; 42 | // Create shards map. 
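// Each shard name maps to the hosts backing that shard; the splitter is expected to
// pick, for every chunk, a configured mongos co-located with the chunk's shard
// (verified by the location assertions below).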
43 | Map<String, List<String>> shardsMap = new HashMap<String, List<String>>() {{ 44 | put("sh01", Arrays.asList("mongo.sh01.dc1:27017", "mongo.sh01.dc2:27017")); 45 | put("sh02", Arrays.asList("mongo.sh02.dc1:27027", "mongo.sh02.dc2:27027")); 46 | }}; 47 | 48 | Configuration conf = new Configuration(); 49 | MongoConfigUtil.setInputMongosHosts( 50 | conf, Arrays.asList("mongo.sh01.dc1:27018", "mongo.sh02.dc2:27018")); 51 | MongoConfigUtil.setInputURI( 52 | conf, new MongoClientURI("mongodb://mongo.dc1:27018,mongo.dc2:27018/hadoop.test")); 53 | splitter.setConfiguration(conf); 54 | 55 | List<InputSplit> splits = splitter.calculateSplitsFromChunks(chunksList, shardsMap); 56 | assertEquals("mongo.sh01.dc1:27018", splits.get(0).getLocations()[0]); 57 | assertEquals("mongo.sh02.dc2:27018", splits.get(1).getLocations()[0]); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/util/MongoConfigUtilTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.util; 2 | 3 | import com.mongodb.MongoClientURI; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.junit.Test; 6 | 7 | import java.util.List; 8 | 9 | import static org.junit.Assert.assertEquals; 10 | 11 | public class MongoConfigUtilTest { 12 | 13 | private void assertSameURIs( 14 | final String[] expected, final List<MongoClientURI> actual) { 15 | assertEquals(expected.length, actual.size()); 16 | for (int i = 0; i < expected.length; ++i) { 17 | assertEquals(expected[i], actual.get(i).getURI()); 18 | } 19 | } 20 | 21 | @Test 22 | public void testGetMongoURIs() { 23 | Configuration conf = new Configuration(); 24 | String[] connStrings = new String[] { 25 | "mongodb://rshost1:10000,rshost2:10001/foo.bar?replicaSet=rs", 26 | "mongodb://standalone:27017/db.collection" 27 | }; 28 | 29 | // Separated by ", " 30 | conf.set( 31 | MongoConfigUtil.INPUT_URI, 32 | connStrings[0] + ", " + connStrings[1]); 33 | List<MongoClientURI> uris = MongoConfigUtil.getMongoURIs( 34 | conf, MongoConfigUtil.INPUT_URI); 35 | assertSameURIs(connStrings, uris); 36 | 37 | // No delimiter 38 | conf.set(MongoConfigUtil.INPUT_URI, connStrings[0] + connStrings[1]); 39 | uris = MongoConfigUtil.getMongoURIs(conf, MongoConfigUtil.INPUT_URI); 40 | assertSameURIs(connStrings, uris); 41 | 42 | // No value set 43 | uris = MongoConfigUtil.getMongoURIs(conf, "this key does not exist"); 44 | assertEquals(0, uris.size()); 45 | 46 | // Only one input URI.
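// A single URI with no delimiter should come back as the only element, unchanged.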
47 | String connString = connStrings[1]; 48 | conf.set(MongoConfigUtil.INPUT_URI, connString); 49 | uris = MongoConfigUtil.getMongoURIs(conf, MongoConfigUtil.INPUT_URI); 50 | assertSameURIs(new String[] {connString}, uris); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /core/src/test/resources/bookstore-dump/inventory.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/core/src/test/resources/bookstore-dump/inventory.bson -------------------------------------------------------------------------------- /core/src/test/resources/bookstore-dump/orders.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/core/src/test/resources/bookstore-dump/orders.bson -------------------------------------------------------------------------------- /core/src/test/resources/bookstore-dump/publishers.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/core/src/test/resources/bookstore-dump/publishers.bson -------------------------------------------------------------------------------- /core/src/test/resources/bookstore-dump/system.indexes.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/core/src/test/resources/bookstore-dump/system.indexes.bson -------------------------------------------------------------------------------- /examples/elastic-mapreduce/emr-bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | wget -P /home/hadoop/lib http://central.maven.org/maven2/org/mongodb/mongo-java-driver/2.11.1/mongo-java-driver-2.11.1.jar 4 | 5 | # Edit this path to point to the location of the jar you're using. 
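# For example, a self-hosted build might be fetched like this (bucket and jar name
# are placeholders, not a real location):
#   wget -P /home/hadoop/lib https://s3.amazonaws.com/YOUR_BUCKET/mongo-hadoop-core.jar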
6 | wget -P /home/hadoop/lib https://s3.amazonaws.com/mongo-hadoop-code/mongo-hadoop-core_1.1.2-1.1.0.jar 7 | -------------------------------------------------------------------------------- /examples/elastic-mapreduce/run_emr_job.sh: -------------------------------------------------------------------------------- 1 | 2 | ~/projects/elastic-mapreduce-ruby/elastic-mapreduce --create --jobflow ENRON000 \ 3 | --instance-type m1.xlarge \ 4 | --bootstrap-action s3://$S3_BUCKET/emr-bootstrap.sh \ 5 | --log-uri s3://$S3_BUCKET/enron_logs \ 6 | --jar s3://$S3_BUCKET/enron-example_1.1.2-1.1.0.jar \ 7 | --arg -D --arg mongo.job.input.format=com.mongodb.hadoop.BSONFileInputFormat \ 8 | --arg -D --arg mapred.input.dir=s3n://mongo-test-data/messages.bson \ 9 | --arg -D --arg mongo.job.mapper=com.mongodb.hadoop.examples.enron.EnronMailMapper \ 10 | --arg -D --arg mongo.job.output.key=com.mongodb.hadoop.examples.enron.MailPair \ 11 | --arg -D --arg mongo.job.output.value=org.apache.hadoop.io.IntWritable \ 12 | --arg -D --arg mongo.job.partitioner= \ 13 | --arg -D --arg mongo.job.reducer=com.mongodb.hadoop.examples.enron.EnronMailReducer \ 14 | --arg -D --arg mongo.job.sort_comparator= \ 15 | --arg -D --arg mongo.job.background= \ 16 | --arg -D --arg mapred.output.dir=s3n://$S3_BUCKET/BSON_OUT \ 17 | --arg -D --arg mongo.job.output.format=com.mongodb.hadoop.BSONFileOutputFormat \ 18 | --arg -D --arg mapred.child.java.opts=-Xmx2048m 19 | #--arg -D --arg mapred.task.profile=true \ 20 | -------------------------------------------------------------------------------- /examples/elastic-mapreduce/update_s3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #Take the enron example jars and put them into an S3 bucket. 4 | HERE="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | 6 | s3cp $HERE/emr-bootstrap.sh s3://$S3_BUCKET/emr-bootstrap.sh 7 | s3mod s3://$S3_BUCKET/emr-bootstrap.sh public-read 8 | s3cp $HERE/../../core/target/mongo-hadoop-core_1.1.2-1.1.0.jar s3://$S3_BUCKET/mongo-hadoop-core_1.1.2-1.1.0.jar 9 | s3mod s3://$S3_BUCKET/mongo-hadoop-core_1.1.2-1.1.0.jar public-read 10 | s3cp $HERE/../enron/target/enron-example_1.1.2-1.1.0.jar s3://$S3_BUCKET/enron-example_1.1.2-1.1.0.jar 11 | s3mod s3://$S3_BUCKET/enron-example_1.1.2-1.1.0.jar public-read 12 | -------------------------------------------------------------------------------- /examples/enron/hive/hive_enron.q: -------------------------------------------------------------------------------- 1 | -- Hive doesn't allow hyphens in field names 2 | 3 | -- This hive script takes in the emails from Enron and 4 | -- counts the numbers exchanged between each pair of people 5 | 6 | -- Get the headers struct, which contains the "from" and "to". 
7 | -- except the words "from", "to", and "date" are reserved in Hive 8 | DROP TABLE raw; 9 | 10 | CREATE EXTERNAL TABLE raw( 11 | h STRUCT<hivefrom:STRING, hiveto:STRING> 12 | ) 13 | ROW FORMAT SERDE "com.mongodb.hadoop.hive.BSONSerDe" 14 | WITH SERDEPROPERTIES("mongo.columns.mapping"="{'h.hivefrom':'headers.From', 15 | 'h.hiveto':'headers.To'}") 16 | STORED AS INPUTFORMAT "com.mongodb.hadoop.mapred.BSONFileInputFormat" 17 | OUTPUTFORMAT "com.mongodb.hadoop.hive.output.HiveBSONFileOutputFormat" 18 | LOCATION '${INPUT}'; 19 | 20 | 21 | DROP TABLE send_recip; 22 | CREATE TABLE send_recip ( 23 | f STRING, 24 | t_array ARRAY<STRING> 25 | ); 26 | 27 | -- Strip the white space from the "hiveto" string 28 | -- Then split the comma delimited string into an array of strings 29 | INSERT OVERWRITE TABLE send_recip 30 | SELECT 31 | h.hivefrom AS f, 32 | split(h.hiveto, "\\s*,\\s*") 33 | AS t_array 34 | FROM raw 35 | WHERE h IS NOT NULL 36 | AND h.hiveto IS NOT NULL; 37 | 38 | 39 | DROP TABLE send_recip_explode; 40 | CREATE TABLE send_recip_explode ( 41 | f STRING, 42 | t STRING, 43 | num INT 44 | ); 45 | 46 | -- Explode the array so that every element in the array gets its 47 | -- own row. Then group by the unique "f" and "t" pair 48 | -- to find the number of emails between the sender and receiver 49 | INSERT OVERWRITE TABLE send_recip_explode 50 | SELECT 51 | f, 52 | t, 53 | count(1) AS num 54 | FROM send_recip 55 | LATERAL VIEW explode(t_array) tmpTable AS t 56 | GROUP BY f, t; 57 | 58 | 59 | DROP TABLE send_recip_counted; 60 | CREATE TABLE send_recip_counted ( 61 | id STRUCT< 62 | t : STRING, 63 | f : STRING 64 | >, 65 | count INT 66 | ) 67 | ROW FORMAT SERDE "com.mongodb.hadoop.hive.BSONSerDe" 68 | WITH SERDEPROPERTIES ("mongo.columns.mapping"="{'id':'_id'}") 69 | STORED AS INPUTFORMAT "com.mongodb.hadoop.mapred.BSONFileInputFormat" 70 | OUTPUTFORMAT "com.mongodb.hadoop.hive.output.HiveBSONFileOutputFormat" 71 | LOCATION '${OUTPUT}'; 72 | 73 | -- Final output with the correct format 74 | INSERT INTO TABLE send_recip_counted 75 | SELECT 76 | named_struct('t', t, 'f', f) AS id, 77 | num AS count 78 | FROM send_recip_explode; 79 | -------------------------------------------------------------------------------- /examples/enron/pig/pig_enron.pig: -------------------------------------------------------------------------------- 1 | -- Change these jar locations to point to the correct locations/version on your system.
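-- The paths and version suffixes below are examples only; substitute the MongoDB Java
-- driver and mongo-hadoop core/pig jars produced by your own build.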
2 | REGISTER /Users/mike/Downloads/mongo-2.10.1.jar; 3 | REGISTER ../core/target/mongo-hadoop-core_cdh4.3.0-1.1.0.jar 4 | REGISTER ../pig/target/mongo-hadoop-pig_cdh4.3.0-1.1.0.jar 5 | 6 | 7 | raw = LOAD 'file:///tmp/enron_mail/messages.bson' using com.mongodb.hadoop.pig.BSONLoader('','headers:[]') ; 8 | send_recip = FOREACH raw GENERATE $0#'From' as from, $0#'To' as to; 9 | send_recip_filtered = FILTER send_recip BY to IS NOT NULL; 10 | send_recip_split = FOREACH send_recip_filtered GENERATE from as from, FLATTEN(TOKENIZE(to)) as to; 11 | send_recip_split_trimmed = FOREACH send_recip_split GENERATE from as from, TRIM(to) as to; 12 | send_recip_grouped = GROUP send_recip_split_trimmed BY (from, to); 13 | send_recip_counted = FOREACH send_recip_grouped GENERATE group, COUNT($1) as count; 14 | STORE send_recip_counted INTO 'file:///tmp/enron_result.bson' using com.mongodb.hadoop.pig.BSONStorage; 15 | -------------------------------------------------------------------------------- /examples/enron/run_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Run this job via gradle from the root directory: ./gradlew enronEmails" -------------------------------------------------------------------------------- /examples/enron/src/main/java/com/mongodb/hadoop/examples/enron/EnronMail.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.mongodb.hadoop.examples.enron; 17 | 18 | 19 | import com.mongodb.hadoop.BSONFileInputFormat; 20 | import com.mongodb.hadoop.MongoConfig; 21 | import com.mongodb.hadoop.MongoOutputFormat; 22 | import com.mongodb.hadoop.util.MapredMongoConfigUtil; 23 | import com.mongodb.hadoop.util.MongoConfigUtil; 24 | import com.mongodb.hadoop.util.MongoTool; 25 | import org.apache.hadoop.conf.Configuration; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.io.IntWritable; 28 | import org.apache.hadoop.mapred.FileInputFormat; 29 | import org.apache.hadoop.mapred.JobConf; 30 | import org.apache.hadoop.util.ToolRunner; 31 | 32 | public class EnronMail extends MongoTool { 33 | public EnronMail() { 34 | JobConf conf = new JobConf(new Configuration()); 35 | if (MongoTool.isMapRedV1()) { 36 | MapredMongoConfigUtil.setInputFormat(conf, 37 | com.mongodb.hadoop.mapred.BSONFileInputFormat.class); 38 | MapredMongoConfigUtil.setOutputFormat(conf, 39 | com.mongodb.hadoop.mapred.MongoOutputFormat.class); 40 | } else { 41 | MongoConfigUtil.setInputFormat(conf, BSONFileInputFormat.class); 42 | MongoConfigUtil.setOutputFormat(conf, MongoOutputFormat.class); 43 | } 44 | FileInputFormat.addInputPath(conf, new Path("/messages")); 45 | MongoConfig config = new MongoConfig(conf); 46 | config.setInputKey("headers.From"); 47 | config.setMapper(EnronMailMapper.class); 48 | config.setReducer(EnronMailReducer.class); 49 | config.setMapperOutputKey(MailPair.class); 50 | config.setMapperOutputValue(IntWritable.class); 51 | config.setOutputKey(MailPair.class); 52 | config.setOutputValue(IntWritable.class); 53 | config.setOutputURI( 54 | "mongodb://localhost:27017/mongo_hadoop.message_pairs"); 55 | setConf(conf); 56 | } 57 | 58 | public static void main(final String[] pArgs) throws Exception { 59 | System.exit(ToolRunner.run(new EnronMail(), pArgs)); 60 | } 61 | } 62 | 63 | -------------------------------------------------------------------------------- /examples/enron/src/main/java/com/mongodb/hadoop/examples/enron/EnronMailMapper.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.enron; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.mapred.JobConf; 6 | import org.apache.hadoop.mapred.OutputCollector; 7 | import org.apache.hadoop.mapred.Reporter; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.bson.BSONObject; 10 | 11 | import java.io.IOException; 12 | 13 | public class EnronMailMapper extends Mapper 14 | implements org.apache.hadoop.mapred.Mapper { 15 | 16 | private final IntWritable intw; 17 | private final MailPair mp; 18 | 19 | public EnronMailMapper() { 20 | super(); 21 | intw = new IntWritable(1); 22 | mp = new MailPair(); 23 | } 24 | 25 | @Override 26 | public void map(final Object key, final BSONObject val, 27 | final Context context) 28 | throws IOException, InterruptedException { 29 | 30 | BSONObject headers = (BSONObject) val.get("headers"); 31 | String to = (String) headers.get("To"); 32 | if (null != to) { 33 | String[] recipients = to.split(","); 34 | for (final String recip1 : recipients) { 35 | String recip = recip1.trim(); 36 | if (recip.length() > 0) { 37 | mp.setFrom((String) key); 38 | mp.setTo(recip); 39 | context.write(mp, intw); 40 | } 41 | } 42 | } 43 | } 44 | 45 | @Override 46 | public void map(final Object key, final BSONWritable writable, final OutputCollector output, 47 | final Reporter reporter) 
throws IOException { 48 | BSONObject headers = (BSONObject) writable.getDoc().get("headers"); 49 | String to = (String) headers.get("To"); 50 | String from = (String) headers.get("From"); 51 | if (null != to) { 52 | String[] recipients = to.split(","); 53 | for (final String recip1 : recipients) { 54 | String recip = recip1.trim(); 55 | if (recip.length() > 0) { 56 | mp.setFrom(from); 57 | mp.setTo(recip); 58 | output.collect(mp, intw); 59 | } 60 | } 61 | } 62 | } 63 | 64 | @Override 65 | public void close() throws IOException { 66 | } 67 | 68 | @Override 69 | public void configure(final JobConf job) { 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /examples/enron/src/main/java/com/mongodb/hadoop/examples/enron/MailPair.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.enron; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | 5 | import java.io.DataInput; 6 | import java.io.DataOutput; 7 | import java.io.IOException; 8 | 9 | 10 | public class MailPair implements WritableComparable { 11 | private String from; 12 | private String to; 13 | 14 | public MailPair() { 15 | } 16 | 17 | public MailPair(final String from, final String to) { 18 | this.from = from; 19 | this.to = to; 20 | } 21 | 22 | public String getFrom() { 23 | return from; 24 | } 25 | 26 | public void setFrom(final String from) { 27 | this.from = from; 28 | } 29 | 30 | public String getTo() { 31 | return to; 32 | } 33 | 34 | public void setTo(final String to) { 35 | this.to = to; 36 | } 37 | 38 | public void readFields(final DataInput in) throws IOException { 39 | this.from = in.readUTF(); 40 | this.to = in.readUTF(); 41 | } 42 | 43 | public void write(final DataOutput out) throws IOException { 44 | out.writeUTF(this.from); 45 | out.writeUTF(this.to); 46 | } 47 | 48 | @Override 49 | public boolean equals(final Object o) { 50 | if (o instanceof MailPair) { 51 | MailPair mp = (MailPair) o; 52 | return from.equals(mp.from) && to.equals(mp.to); 53 | } 54 | return false; 55 | } 56 | 57 | @Override 58 | public int hashCode() { 59 | int result = from != null ? from.hashCode() : 0; 60 | result = 31 * result + (to != null ? 
to.hashCode() : 0); 61 | return result; 62 | } 63 | 64 | @Override 65 | public int compareTo(final Object o) { 66 | if (!(o instanceof MailPair)) { 67 | return -1; 68 | } 69 | MailPair mp = (MailPair) o; 70 | int first = from.compareTo(mp.from); 71 | if (first != 0) { 72 | return first; 73 | } 74 | int second = to.compareTo(mp.to); 75 | if (second != 0) { 76 | return second; 77 | } 78 | return 0; 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /examples/sensors/run_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Run this job via gradle from the root directory: ./gradlew sensorData" -------------------------------------------------------------------------------- /examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/DeviceMapper.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapred.JobConf; 6 | import org.apache.hadoop.mapred.OutputCollector; 7 | import org.apache.hadoop.mapred.Reporter; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.bson.BSONObject; 10 | 11 | import java.io.IOException; 12 | 13 | public class DeviceMapper extends Mapper 14 | implements org.apache.hadoop.mapred.Mapper { 15 | 16 | private final Text keyText; 17 | private final Text valueText; 18 | 19 | public DeviceMapper() { 20 | super(); 21 | keyText = new Text(); 22 | valueText = new Text(); 23 | } 24 | 25 | @Override 26 | public void map(final Object key, final BSONObject val, final Context context) throws IOException, InterruptedException { 27 | String keyOut = (String) val.get("owner") + " " + (String) val.get("type"); 28 | keyText.set(keyOut); 29 | valueText.set(val.get("_id").toString()); 30 | context.write(keyText, valueText); 31 | } 32 | 33 | @Override 34 | public void map(final Object key, final BSONWritable value, final OutputCollector output, 35 | final Reporter reporter) throws IOException { 36 | BSONObject val = value.getDoc(); 37 | 38 | String keyOut = (String) val.get("owner") + " " + (String) val.get("type"); 39 | keyText.set(keyOut); 40 | valueText.set(val.get("_id").toString()); 41 | output.collect(keyText, valueText); 42 | } 43 | 44 | @Override 45 | public void close() throws IOException { 46 | } 47 | 48 | @Override 49 | public void configure(final JobConf job) { 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/DeviceReducer.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import com.mongodb.hadoop.io.MongoUpdateWritable; 4 | import org.apache.hadoop.io.NullWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.apache.hadoop.mapred.OutputCollector; 8 | import org.apache.hadoop.mapred.Reporter; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | import org.bson.BasicBSONObject; 11 | import org.bson.types.ObjectId; 12 | 13 | import java.io.IOException; 14 | import java.util.ArrayList; 15 | import java.util.Iterator; 16 | 17 | public class DeviceReducer extends Reducer 18 | implements org.apache.hadoop.mapred.Reducer { 19 | 20 | private MongoUpdateWritable reduceResult; 21 | 22 | public 
DeviceReducer() { 23 | super(); 24 | reduceResult = new MongoUpdateWritable(); 25 | } 26 | 27 | @Override 28 | public void reduce(final Text pKey, final Iterable pValues, final Context pContext) throws IOException, InterruptedException { 29 | BasicBSONObject query = new BasicBSONObject("_id", pKey.toString()); 30 | ArrayList devices = new ArrayList(); 31 | for (Text val : pValues) { 32 | devices.add(new ObjectId(val.toString())); 33 | } 34 | 35 | BasicBSONObject update = new BasicBSONObject("$pushAll", new BasicBSONObject("devices", devices)); 36 | reduceResult.setQuery(query); 37 | reduceResult.setModifiers(update); 38 | pContext.write(null, reduceResult); 39 | } 40 | 41 | @Override 42 | public void reduce(final Text key, final Iterator values, final OutputCollector output, 43 | final Reporter reporter) throws IOException { 44 | BasicBSONObject query = new BasicBSONObject("_id", key.toString()); 45 | ArrayList devices = new ArrayList(); 46 | while (values.hasNext()) { 47 | Text val = values.next(); 48 | devices.add(new ObjectId(val.toString())); 49 | } 50 | 51 | BasicBSONObject update = new BasicBSONObject("$pushAll", new BasicBSONObject("devices", devices)); 52 | reduceResult.setQuery(query); 53 | reduceResult.setModifiers(update); 54 | output.collect(null, reduceResult); 55 | } 56 | 57 | @Override 58 | public void close() throws IOException { 59 | } 60 | 61 | @Override 62 | public void configure(final JobConf job) { 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/Devices.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import com.mongodb.hadoop.MongoInputFormat; 4 | import com.mongodb.hadoop.MongoOutputFormat; 5 | import com.mongodb.hadoop.io.BSONWritable; 6 | import com.mongodb.hadoop.util.MapredMongoConfigUtil; 7 | import com.mongodb.hadoop.util.MongoConfigUtil; 8 | import com.mongodb.hadoop.util.MongoTool; 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.util.ToolRunner; 13 | 14 | import java.net.UnknownHostException; 15 | 16 | public class Devices extends MongoTool { 17 | 18 | public Devices() throws UnknownHostException { 19 | setConf(new Configuration()); 20 | 21 | if (MongoTool.isMapRedV1()) { 22 | MapredMongoConfigUtil.setInputFormat(getConf(), com.mongodb.hadoop.mapred.MongoInputFormat.class); 23 | MapredMongoConfigUtil.setOutputFormat(getConf(), com.mongodb.hadoop.mapred.MongoOutputFormat.class); 24 | } else { 25 | MongoConfigUtil.setInputFormat(getConf(), MongoInputFormat.class); 26 | MongoConfigUtil.setOutputFormat(getConf(), MongoOutputFormat.class); 27 | } 28 | 29 | MongoConfigUtil.setInputURI(getConf(), "mongodb://localhost:27017/mongo_hadoop.devices"); 30 | MongoConfigUtil.setOutputURI(getConf(), "mongodb://localhost:27017/mongo_hadoop.logs_aggregate"); 31 | 32 | MongoConfigUtil.setMapper(getConf(), DeviceMapper.class); 33 | MongoConfigUtil.setReducer(getConf(), DeviceReducer.class); 34 | MongoConfigUtil.setMapperOutputKey(getConf(), Text.class); 35 | MongoConfigUtil.setMapperOutputValue(getConf(), Text.class); 36 | MongoConfigUtil.setOutputKey(getConf(), IntWritable.class); 37 | MongoConfigUtil.setOutputValue(getConf(), BSONWritable.class); 38 | 39 | new SensorDataGenerator().run(); 40 | } 41 | 42 | public static void main(final String[] pArgs) 
throws Exception { 43 | System.exit(ToolRunner.run(new Devices(), pArgs)); 44 | } 45 | } -------------------------------------------------------------------------------- /examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/LogCombiner.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import org.apache.commons.logging.Log; 4 | import org.apache.commons.logging.LogFactory; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapred.JobConf; 8 | import org.apache.hadoop.mapred.OutputCollector; 9 | import org.apache.hadoop.mapred.Reporter; 10 | import org.apache.hadoop.mapreduce.Reducer; 11 | 12 | import java.io.IOException; 13 | import java.util.Iterator; 14 | 15 | public class LogCombiner extends Reducer 16 | implements org.apache.hadoop.mapred.Reducer { 17 | 18 | private static final Log LOG = LogFactory.getLog(LogCombiner.class); 19 | 20 | @Override 21 | public void reduce(final Text pKey, final Iterable pValues, final Context pContext) 22 | throws IOException, InterruptedException { 23 | 24 | int count = 0; 25 | for (IntWritable val : pValues) { 26 | count += val.get(); 27 | } 28 | 29 | pContext.write(pKey, new IntWritable(count)); 30 | } 31 | 32 | @Override 33 | public void reduce(final Text key, final Iterator values, final OutputCollector output, 34 | final Reporter reporter) throws IOException { 35 | int count = 0; 36 | while (values.hasNext()) { 37 | count += values.next().get(); 38 | } 39 | 40 | output.collect(key, new IntWritable(count)); 41 | } 42 | 43 | @Override 44 | public void close() throws IOException { 45 | } 46 | 47 | @Override 48 | public void configure(final JobConf job) { 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/LogMapper.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.apache.hadoop.mapred.OutputCollector; 8 | import org.apache.hadoop.mapred.Reporter; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.bson.BSONObject; 11 | 12 | import java.io.IOException; 13 | 14 | public class LogMapper extends Mapper 15 | implements org.apache.hadoop.mapred.Mapper { 16 | 17 | private final Text keyText; 18 | private final IntWritable valueInt; 19 | 20 | public LogMapper() { 21 | super(); 22 | keyText = new Text(); 23 | valueInt = new IntWritable(1); 24 | } 25 | 26 | @Override 27 | public void map(final Object key, final BSONObject val, final Context context) throws IOException, InterruptedException { 28 | keyText.set(val.get("d_id").toString()); 29 | context.write(keyText, valueInt); 30 | } 31 | 32 | @Override 33 | public void map(final Object key, final BSONWritable value, final OutputCollector output, final Reporter reporter) 34 | throws IOException { 35 | keyText.set(value.getDoc().get("d_id").toString()); 36 | output.collect(keyText, valueInt); 37 | } 38 | 39 | @Override 40 | public void close() throws IOException { 41 | } 42 | 43 | @Override 44 | public void configure(final JobConf job) { 45 | } 46 | } 47 | 48 | -------------------------------------------------------------------------------- 
/examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/Logs.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import com.mongodb.hadoop.MongoInputFormat; 4 | import com.mongodb.hadoop.MongoOutputFormat; 5 | import com.mongodb.hadoop.util.MapredMongoConfigUtil; 6 | import com.mongodb.hadoop.util.MongoConfigUtil; 7 | import com.mongodb.hadoop.util.MongoTool; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.util.ToolRunner; 13 | 14 | import java.net.UnknownHostException; 15 | 16 | public class Logs extends MongoTool { 17 | 18 | public Logs() throws UnknownHostException { 19 | Configuration conf = new Configuration(); 20 | setConf(conf); 21 | boolean mrv1Job; 22 | try { 23 | FileSystem.class.getDeclaredField("DEFAULT_FS"); 24 | mrv1Job = false; 25 | } catch (NoSuchFieldException e) { 26 | mrv1Job = true; 27 | } 28 | if (mrv1Job) { 29 | MapredMongoConfigUtil.setInputFormat(getConf(), com.mongodb.hadoop.mapred.MongoInputFormat.class); 30 | MapredMongoConfigUtil.setOutputFormat(getConf(), com.mongodb.hadoop.mapred.MongoOutputFormat.class); 31 | } else { 32 | MongoConfigUtil.setInputFormat(getConf(), MongoInputFormat.class); 33 | MongoConfigUtil.setOutputFormat(getConf(), MongoOutputFormat.class); 34 | } 35 | 36 | 37 | MongoConfigUtil.setInputURI(getConf(), "mongodb://localhost:27017/mongo_hadoop.logs"); 38 | MongoConfigUtil.setOutputURI(getConf(), "mongodb://localhost:27017/mongo_hadoop.logs_aggregate"); 39 | 40 | MongoConfigUtil.setMapper(getConf(), LogMapper.class); 41 | MongoConfigUtil.setReducer(getConf(), LogReducer.class); 42 | MongoConfigUtil.setCombiner(getConf(), LogCombiner.class); 43 | 44 | MongoConfigUtil.setOutputKey(getConf(), Text.class); 45 | MongoConfigUtil.setOutputValue(getConf(), IntWritable.class); 46 | } 47 | 48 | public static void main(final String[] pArgs) throws Exception { 49 | System.exit(ToolRunner.run(new Logs(), pArgs)); 50 | } 51 | } -------------------------------------------------------------------------------- /examples/sensors/testdata_generator.js: -------------------------------------------------------------------------------- 1 | NUM_DEVICES = 1000; 2 | NUM_LOGS = NUM_DEVICES * 50 * 1000 3 | setVerboseShell(false); 4 | 5 | db.devices.remove() 6 | db.logs.remove() 7 | 8 | function getRandomInRange(from, to, fixed) { 9 | return (Math.random() * (to - from) + from).toFixed(fixed) * 1; 10 | } 11 | 12 | function getRandomString (len) { 13 | var possible = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; 14 | var randomString = ''; 15 | for (var i = 0; i < len; i++) { 16 | var randomPoz = Math.floor(Math.random() * possible.length); 17 | randomString += possible.substring(randomPoz,randomPoz+1); 18 | } 19 | return randomString; 20 | } 21 | 22 | function randomDate(start, end) { 23 | return new Date(start.getTime() + Math.random() * (end.getTime() - start.getTime())) 24 | } 25 | 26 | function choose(choices) { 27 | index = Math.floor(Math.random() * choices.length); 28 | return choices[index]; 29 | } 30 | 31 | function getRandomInt (min, max) { 32 | return Math.floor(Math.random() * (max - min) + min); 33 | } 34 | 35 | owners = [] 36 | for(var i=0;i<10;i++){ 37 | owners.push(getRandomString(10)); 38 | } 39 | 40 | models = [] 41 | for(var i=0;i<10;i++){ 42 | models.push(getRandomInt(10, 
20)); 43 | } 44 | 45 | types = ["temp", "humidity", "pressure", "sound", "light"] 46 | 47 | 48 | device_ids = [] 49 | // devices 50 | // 51 | for(var i=0;i params = new TreeMap(); 26 | params.put(MongoConfigUtil.INPUT_QUERY, "{_id:{$gt:{$date:883440000000}}}"); 27 | new StreamingJob() 28 | .params(params) 29 | .inputUris(getInputUri()) 30 | .outputUris(getOutputUri()) 31 | .execute(); 32 | 33 | DBCollection collection = getClient(getInputUri()).getDB("mongo_hadoop").getCollection("yield_historical.out"); 34 | assertEquals(14, collection.count()); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /examples/treasury_yield/src/test/resources/commons-logging.properties: -------------------------------------------------------------------------------- 1 | # commons-logging.properties 2 | # jdk handlers 3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler 4 | 5 | # default log level 6 | .level=DEBUG 7 | 8 | # Specific logger level 9 | #MyClassLogger.level=FINE 10 | 11 | # FileHandler options - can also be set to the ConsoleHandler 12 | # FileHandler level can be set to override the global level: 13 | #java.util.logging.FileHandler.level=WARN 14 | 15 | # log file name for the File Handler 16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log 17 | 18 | # Specify the style of output (simple or xml) 19 | java.util.logging.ConsoleHandler.formatter=java.util.logging.SimpleFormatter 20 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter 21 | 22 | # Optional - Limit the size of the file (in bytes) 23 | java.util.logging.FileHandler.limit=50000 24 | 25 | # Optional - The number of files to cycle through, by 26 | # appending an integer to the base file name: 27 | java.util.logging.FileHandler.count=1 -------------------------------------------------------------------------------- /examples/treasury_yield/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n -------------------------------------------------------------------------------- /examples/treasury_yield/src/test/resources/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Mon Mar 09 18:25:42 PDT 2015 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.2.1-all.zip 7 | -------------------------------------------------------------------------------- /gradlew.bat: 
-------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 12 | set DEFAULT_JVM_OPTS= 13 | 14 | set DIRNAME=%~dp0 15 | if "%DIRNAME%" == "" set DIRNAME=. 16 | set APP_BASE_NAME=%~n0 17 | set APP_HOME=%DIRNAME% 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windowz variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | if "%@eval[2+2]" == "4" goto 4NT_args 53 | 54 | :win9xME_args 55 | @rem Slurp the command line arguments. 56 | set CMD_LINE_ARGS= 57 | set _SKIP=2 58 | 59 | :win9xME_args_slurp 60 | if "x%~1" == "x" goto execute 61 | 62 | set CMD_LINE_ARGS=%* 63 | goto execute 64 | 65 | :4NT_args 66 | @rem Get arguments from the 4NT Shell from JP Software 67 | set CMD_LINE_ARGS=%$ 68 | 69 | :execute 70 | @rem Setup the command line 71 | 72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if "%ERRORLEVEL%"=="0" goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 
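rem (exit /b hands the code back to the calling script, whereas a bare exit would also close the hosting cmd.exe shell.)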
84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /hive/src/test/java/com/mongodb/hadoop/hive/HiveQueryTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.hive; 2 | 3 | import com.mongodb.MongoClient; 4 | import com.mongodb.client.MongoCollection; 5 | import org.bson.Document; 6 | import org.junit.After; 7 | import org.junit.Before; 8 | import org.junit.Test; 9 | 10 | import java.sql.SQLException; 11 | 12 | import static org.junit.Assert.assertEquals; 13 | 14 | public class HiveQueryTest extends HiveTest { 15 | 16 | private static MongoCollection coll; 17 | 18 | @Before 19 | public void setUp() { 20 | MongoClient client = new MongoClient("localhost:27017"); 21 | coll = client.getDatabase("mongo_hadoop").getCollection("hive_query"); 22 | for (int i = 0; i < 1000; ++i) { 23 | coll.insertOne(new Document("i", i).append("j", i % 5)); 24 | } 25 | } 26 | 27 | @After 28 | public void tearDown() { 29 | coll.drop(); 30 | dropTable("querytest"); 31 | } 32 | 33 | @Test 34 | public void testQueryPushdown() throws SQLException { 35 | execute( 36 | "CREATE EXTERNAL TABLE querytest (id STRING, i INT, j INT) " 37 | + "STORED BY \"com.mongodb.hadoop.hive.MongoStorageHandler\" " 38 | + "WITH SERDEPROPERTIES(\"mongo.columns.mapping\"=" 39 | + "'{\"id\":\"_id\"}') " 40 | + "TBLPROPERTIES(\"mongo.uri\"=" 41 | + "\"mongodb://localhost:27017/mongo_hadoop.hive_query\")"); 42 | Results results = query("SELECT * FROM querytest WHERE i > 20"); 43 | assertEquals(979, results.size()); 44 | } 45 | 46 | @Test 47 | public void testQueryPushdownWithQueryTable() throws SQLException { 48 | execute( 49 | "CREATE EXTERNAL TABLE querytest (id STRING, i INT, j INT) " 50 | + "STORED BY \"com.mongodb.hadoop.hive.MongoStorageHandler\" " 51 | + "WITH SERDEPROPERTIES(\"mongo.columns.mapping\"=" 52 | + "'{\"id\":\"_id\"}') " 53 | + "TBLPROPERTIES(\"mongo.uri\"=" 54 | + "\"mongodb://localhost:27017/mongo_hadoop.hive_query\"," 55 | + "\"mongo.input.query\"='{\"j\":0}')"); 56 | Results results = query("SELECT * FROM querytest WHERE i > 20"); 57 | assertEquals(195, results.size()); 58 | 59 | results = query("SELECT * from querytest WHERE j > 2"); 60 | assertEquals(0, results.size()); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /hive/src/test/java/com/mongodb/hadoop/hive/TablePropertiesTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.hive; 2 | 3 | import com.mongodb.MongoClient; 4 | import com.mongodb.MongoClientURI; 5 | import com.mongodb.client.MongoCollection; 6 | import org.bson.Document; 7 | import org.junit.After; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | import java.sql.SQLException; 12 | import java.util.ArrayList; 13 | 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class TablePropertiesTest extends HiveTest { 17 | 18 | private MongoCollection collection; 19 | 20 | @Before 21 | public void setUp() { 22 | MongoClientURI clientURI = new MongoClientURI( 23 | "mongodb://localhost:27017/mongo_hadoop.tabletest"); 24 | MongoClient client = new MongoClient(clientURI); 25 | 26 | // Seed some documents into MongoDB. 
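// Seeds 1000 documents with i = 0..999. hivetable.properties (referenced below via
// mongo.properties.path) sets mongo.input.query={"i": {"$mod": [2, 0]}}, so the Hive
// table only sees even values of i; that is why 490 rows match i >= 20 below.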
27 | collection = client 28 | .getDatabase(clientURI.getDatabase()) 29 | .getCollection(clientURI.getCollection()); 30 | ArrayList documents = new ArrayList(1000); 31 | for (int i = 0; i < 1000; ++i) { 32 | documents.add(new Document("i", i)); 33 | } 34 | collection.insertMany(documents); 35 | 36 | // Make sure table doesn't exist already. 37 | dropTable("props_file_test"); 38 | } 39 | 40 | @After 41 | public void tearDown() { 42 | // Tear down collection. 43 | collection.drop(); 44 | 45 | // Drop Hive table. 46 | dropTable("props_file_test"); 47 | } 48 | 49 | @Test 50 | public void testPropertiesFile() throws SQLException { 51 | // Create the table. 52 | execute( 53 | "CREATE TABLE props_file_test" 54 | + " (id STRING, i INT)" 55 | + " STORED BY 'com.mongodb.hadoop.hive.MongoStorageHandler'" 56 | + " WITH SERDEPROPERTIES('mongo.columns.mapping'='{\"id\":\"_id\"}')" 57 | + " TBLPROPERTIES('mongo.properties.path'='" 58 | + getPath("hivetable.properties") + "')"); 59 | 60 | // Read and write some data through the table. 61 | Results results = query("SELECT i FROM props_file_test WHERE i >= 20"); 62 | assertEquals(490, results.size()); 63 | 64 | execute( 65 | "INSERT INTO props_file_test VALUES ('55d5005b6e32ab5664606195', 42)"); 66 | assertEquals(2, collection.count(new Document("i", 42))); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /hive/src/test/java/com/mongodb/hadoop/hive/TestHDFSToMongoDBWithOptions.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.hive; 2 | 3 | import com.mongodb.DBObject; 4 | import com.mongodb.util.JSON; 5 | import org.junit.After; 6 | import org.junit.Before; 7 | import org.junit.Test; 8 | 9 | import java.sql.SQLException; 10 | import java.util.Map; 11 | import java.util.Set; 12 | 13 | import static org.junit.Assert.assertEquals; 14 | import static org.junit.Assert.assertNotEquals; 15 | import static org.junit.Assert.assertNotNull; 16 | import static org.junit.Assert.assertTrue; 17 | 18 | public class TestHDFSToMongoDBWithOptions extends HiveTest { 19 | @Before 20 | public void setUp() throws SQLException { 21 | loadDataIntoHDFSHiveTable(); 22 | loadDataIntoMongoDBHiveTable(true); 23 | } 24 | 25 | @After 26 | public void tearDown() throws SQLException { 27 | dropTable(MONGO_BACKED_TABLE); 28 | dropTable(HDFS_BACKED_TABLE); 29 | } 30 | 31 | @Test 32 | @SuppressWarnings("unchecked") 33 | public void testMongoMapping() { 34 | DBObject doc = getCollection(MONGO_COLLECTION).findOne(); 35 | String[] propsSplit = SERDE_PROPERTIES.split("="); 36 | 37 | int propsSplitLen = propsSplit.length; 38 | assertEquals(propsSplitLen % 2, 0); 39 | 40 | // now read in the 'mongo.columns.mapping' mapping 41 | String colsMap = null; 42 | for (int i = 0; i < propsSplit.length && colsMap == null; i++) { 43 | final String entry = propsSplit[i]; 44 | if (entry.toLowerCase().equals("'mongo.columns.mapping'") && i - 1 < propsSplitLen) { 45 | colsMap = propsSplit[i + 1]; 46 | } 47 | } 48 | 49 | assertNotNull(colsMap); 50 | // first remove '' around colsMap 51 | colsMap = colsMap.substring(1, colsMap.length() - 1); 52 | Set docKeys = doc.keySet(); 53 | 54 | for (String s : ((Map) JSON.parse(colsMap)).values()) { 55 | assertTrue(docKeys.contains(s)); 56 | } 57 | } 58 | 59 | @Test 60 | public void testCountSameTable() { 61 | Results hiveData = getAllDataFromTable(HDFS_BACKED_TABLE); 62 | Results mongoData = getAllDataFromTable(MONGO_BACKED_TABLE); 63 | 
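// Both tables were loaded from the same source data, so they should be non-empty and identical.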
assertNotEquals(hiveData.size(), 0); 64 | assertNotEquals(mongoData.size(), 0); 65 | 66 | assertEquals(hiveData, mongoData); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /hive/src/test/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 16 | -------------------------------------------------------------------------------- /hive/src/test/resources/hivetable.properties: -------------------------------------------------------------------------------- 1 | mongo.uri=mongodb://localhost:27017/mongo_hadoop.tabletest 2 | mongo.input.query={"i": {"$mod": [2, 0]}} -------------------------------------------------------------------------------- /hive/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # suppress inspection "UnusedProperty" for whole file 2 | log4j.rootLogger=info, stdout, R 3 | 4 | log4j.logger.com.mongodb=debug, R 5 | log4j.logger.org.apache=ERROR, R 6 | log4j.logger.com.jolbox.bonecp=ERROR, R 7 | log4j.logger.org.slf4j=ERROR, R 8 | log4j.logger.org.datanucleus=ERROR, R 9 | log4j.logger.org.datanucleus.util=ERROR, R 10 | log4j.logger.org.mortbay=ERROR, R 11 | log4j.logger.org.jboss=ERROR, R 12 | 13 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 14 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 15 | 16 | # Pattern to output the caller's file name and line number. 17 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n 18 | 19 | log4j.appender.R=org.apache.log4j.RollingFileAppender 20 | log4j.appender.R.File=/tmp/hive.log4j 21 | 22 | log4j.appender.R.MaxFileSize=100KB 23 | # Keep one backup file 24 | log4j.appender.R.MaxBackupIndex=1 25 | 26 | log4j.appender.R.layout=org.apache.log4j.PatternLayout 27 | log4j.appender.R.layout.ConversionPattern=%p %t %c - %m%n -------------------------------------------------------------------------------- /hive/src/test/resources/test_data.txt: -------------------------------------------------------------------------------- 1 | 1 Tom 28 2 | 2 Alice 18 3 | 3 Bob 29 4 | 101 Scott 10 5 | 102 Randall 100 6 | 103 Mike 100 7 | 104 Jesse 152 8 | -------------------------------------------------------------------------------- /hive/src/test/resources/users.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/hive/src/test/resources/users.bson -------------------------------------------------------------------------------- /hive/src/test/resources/yarn-site.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/ByteArrayTypeEvalFunc.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import org.apache.pig.EvalFunc; 4 | import org.apache.pig.data.DataType; 5 | import org.apache.pig.impl.logicalLayer.schema.Schema; 6 | 7 | /** 8 | * Convenience abstract implementation of Pig's EvalFunc that automatically 9 | * tells Pig that the return type of the UDF is a DataByteArray. 10 | * 11 | * Subclasses specify what subclass of DataByteArray to use in the type 12 | * parameter T. 
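 *
 * A minimal subclass sketch (mirroring GenMaxKey below; any of the boxed
 * BSON types in com.mongodb.hadoop.pig.udf.types can be plugged in the same way):
 * <pre>
 * public class GenMaxKey extends ByteArrayTypeEvalFunc&lt;PigBoxedMaxKey&gt; {
 *     public PigBoxedMaxKey exec(final Tuple input) throws IOException {
 *         return new PigBoxedMaxKey();
 *     }
 * }
 * </pre>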
13 | */ 14 | public abstract class ByteArrayTypeEvalFunc extends EvalFunc { 15 | @Override 16 | public Schema outputSchema(final Schema input) { 17 | return new Schema(new Schema.FieldSchema(null, DataType.BYTEARRAY)); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/GenMaxKey.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedMaxKey; 4 | import org.apache.pig.data.Tuple; 5 | 6 | import java.io.IOException; 7 | 8 | /** 9 | * Pig UDF that always returns MaxKey(). 10 | */ 11 | public class GenMaxKey extends ByteArrayTypeEvalFunc { 12 | @Override 13 | public PigBoxedMaxKey exec(final Tuple input) throws IOException { 14 | return new PigBoxedMaxKey(); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/GenMinKey.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedMinKey; 4 | import org.apache.pig.data.Tuple; 5 | 6 | import java.io.IOException; 7 | 8 | /** 9 | * Pig UDF that always returns MinKey(). 10 | */ 11 | public class GenMinKey extends ByteArrayTypeEvalFunc { 12 | @Override 13 | public PigBoxedMinKey exec(final Tuple input) throws IOException { 14 | return new PigBoxedMinKey(); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/ObjectIdToSeconds.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedObjectId; 4 | import org.apache.pig.EvalFunc; 5 | import org.apache.pig.data.DataByteArray; 6 | import org.apache.pig.data.DataType; 7 | import org.apache.pig.data.Tuple; 8 | import org.apache.pig.impl.logicalLayer.schema.Schema; 9 | import org.bson.types.ObjectId; 10 | 11 | import java.io.IOException; 12 | 13 | /** 14 | * Pig UDF that extracts the timestamp from an ObjectId. 
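 *
 * Accepts a PigBoxedObjectId, a 24-character hex String, or a raw
 * DataByteArray. Typical Pig usage, as exercised by the oidtoseconds.pig
 * test script:
 * <pre>
 * GENERATE com.mongodb.hadoop.pig.udf.ObjectIdToSeconds($0#'_id') AS seconds;
 * </pre>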
15 | */ 16 | public class ObjectIdToSeconds extends EvalFunc { 17 | 18 | public Integer exec(final Tuple input) throws IOException { 19 | if (null == input || input.size() == 0) { 20 | return null; 21 | } 22 | Object oid = input.get(0); 23 | if (oid instanceof PigBoxedObjectId) { 24 | return ((PigBoxedObjectId) oid).getObject().getTimestamp(); 25 | } else if (oid instanceof String) { 26 | return new ObjectId((String) oid).getTimestamp(); 27 | } else if (oid instanceof DataByteArray) { 28 | return new ObjectId(((DataByteArray) oid).get()).getTimestamp(); 29 | } 30 | throw new IOException( 31 | "Not an ObjectId, so cannot convert to seconds: " + oid); 32 | } 33 | 34 | @Override 35 | public Schema outputSchema(final Schema input) { 36 | return new Schema(new Schema.FieldSchema("seconds", DataType.INTEGER)); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/ToBinary.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedBinary; 4 | import org.apache.pig.data.DataByteArray; 5 | import org.apache.pig.data.Tuple; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Pig UDF that transforms the incoming value into a BSON Binary object. 11 | */ 12 | public class ToBinary extends ByteArrayTypeEvalFunc { 13 | @Override 14 | public PigBoxedBinary exec(final Tuple input) throws IOException { 15 | if (null == input || input.size() == 0) { 16 | return null; 17 | } 18 | Object o = input.get(0); 19 | if (o instanceof String) { 20 | return new PigBoxedBinary(((String) o).getBytes()); 21 | } else if (o instanceof DataByteArray) { 22 | return new PigBoxedBinary(((DataByteArray) o).get()); 23 | } 24 | throw new IOException( 25 | "Need String or DataByteArray to build a Binary, not " + o); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/ToDBRef.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedDBRef; 4 | import org.apache.pig.data.Tuple; 5 | import org.bson.types.ObjectId; 6 | 7 | import java.io.IOException; 8 | import java.util.Map; 9 | 10 | /** 11 | * Pig UDF that transforms the incoming value into a MongoDB DBRef. 
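 *
 * The input's first field must be a Map containing "$ref" (the referenced
 * collection name) and "$id" (an ObjectId hex String). Typical Pig usage,
 * as exercised by the todbref.pig test script:
 * <pre>
 * GENERATE com.mongodb.hadoop.pig.udf.ToDBRef($0) AS dbref;
 * </pre>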
12 | */ 13 | public class ToDBRef extends ByteArrayTypeEvalFunc { 14 | @Override 15 | public PigBoxedDBRef exec(final Tuple input) throws IOException { 16 | if (null == input || input.size() == 0) { 17 | return null; 18 | } 19 | Object o = input.get(0); 20 | if (o instanceof Map) { 21 | Object collectionName = ((Map) o).get("$ref"); 22 | Object id = ((Map) o).get("$id"); 23 | if (null == collectionName || null == id) { 24 | throw new IOException( 25 | "Map must contain both $ref and $id fields: " + o); 26 | } 27 | byte[] collectionNameBytes = 28 | ((String) collectionName).getBytes(); 29 | byte[] dbrefBytes = 30 | new byte[12 + 1 + collectionNameBytes.length]; 31 | byte[] oidBytes = new ObjectId((String) id).toByteArray(); 32 | System.arraycopy( 33 | collectionNameBytes, 0, 34 | dbrefBytes, 0, collectionNameBytes.length); 35 | dbrefBytes[collectionNameBytes.length] = 0; 36 | System.arraycopy( 37 | oidBytes, 0, 38 | dbrefBytes, collectionNameBytes.length + 1, 12); 39 | return new PigBoxedDBRef(dbrefBytes); 40 | } 41 | throw new IOException("Need a Map to build a DBRef, not " + o); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/ToObjectId.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedObjectId; 4 | import org.apache.pig.data.DataByteArray; 5 | import org.apache.pig.data.Tuple; 6 | import org.bson.types.ObjectId; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * UDF that transforms the incoming value into a BSON ObjectId. 12 | */ 13 | public class ToObjectId extends ByteArrayTypeEvalFunc { 14 | public PigBoxedObjectId exec(final Tuple input) throws IOException { 15 | if (null == input || input.size() == 0) { 16 | return null; 17 | } 18 | Object o = input.get(0); 19 | if (o instanceof String) { 20 | return new PigBoxedObjectId( 21 | new ObjectId((String) o).toByteArray()); 22 | } else if (o instanceof DataByteArray) { 23 | return new PigBoxedObjectId(((DataByteArray) o).get()); 24 | } 25 | throw new IOException( 26 | "Need a String or DataByteArray to build an ObjectId, not " + o); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedBSONValue.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import org.apache.pig.data.DataByteArray; 4 | 5 | public abstract class PigBoxedBSONValue extends DataByteArray { 6 | public PigBoxedBSONValue() {} 7 | 8 | public PigBoxedBSONValue(final byte[] b) { 9 | super(b); 10 | } 11 | 12 | public abstract T getObject(); 13 | } 14 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedBinary.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import org.bson.types.Binary; 4 | 5 | public class PigBoxedBinary extends PigBoxedBSONValue { 6 | public PigBoxedBinary(final byte[] b) { 7 | super(b); 8 | } 9 | 10 | @Override 11 | public Binary getObject() { 12 | return new Binary(get()); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedDBRef.java: 
-------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import com.mongodb.DBRef; 4 | import org.bson.types.ObjectId; 5 | 6 | import java.util.Arrays; 7 | 8 | public class PigBoxedDBRef extends PigBoxedBSONValue { 9 | public PigBoxedDBRef(final byte[] b) { 10 | super(b); 11 | } 12 | 13 | @Override 14 | public DBRef getObject() { 15 | byte[] bytes = get(); 16 | ObjectId id = new ObjectId( 17 | Arrays.copyOfRange(bytes, bytes.length - 12, bytes.length)); 18 | String collectionName = new String( 19 | Arrays.copyOfRange(bytes, 0, bytes.length - 13)); 20 | 21 | return new DBRef(collectionName, id); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedMaxKey.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import org.bson.types.MaxKey; 4 | 5 | public class PigBoxedMaxKey extends PigBoxedBSONValue { 6 | @Override 7 | public MaxKey getObject() { 8 | return new MaxKey(); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedMinKey.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import org.bson.types.MinKey; 4 | 5 | public class PigBoxedMinKey extends PigBoxedBSONValue { 6 | @Override 7 | public MinKey getObject() { 8 | return new MinKey(); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedObjectId.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import org.bson.types.ObjectId; 4 | 5 | public class PigBoxedObjectId extends PigBoxedBSONValue { 6 | public PigBoxedObjectId(final byte[] b) { 7 | super(b); 8 | } 9 | 10 | @Override 11 | public ObjectId getObject() { 12 | return new ObjectId(get()); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /pig/src/test/java/com/mongodb/hadoop/pig/BSONStorageTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig; 2 | 3 | import org.apache.pig.ResourceSchema; 4 | import org.apache.pig.impl.util.Utils; 5 | import org.junit.Test; 6 | 7 | import static org.junit.Assert.assertNull; 8 | 9 | 10 | public class BSONStorageTest { 11 | @Test 12 | public void testNullMap() throws Exception { 13 | ResourceSchema schema = new ResourceSchema(Utils.getSchemaFromString("m:map[]")); 14 | 15 | assertNull(BSONStorage.getTypeForBSON(null, schema.getFields()[0], null)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /pig/src/test/java/com/mongodb/hadoop/pig/MongoStorageTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig; 2 | 3 | import com.mongodb.BasicDBObjectBuilder; 4 | import com.mongodb.DBObject; 5 | import org.apache.pig.ResourceSchema; 6 | import org.apache.pig.impl.util.Utils; 7 | import org.junit.Test; 8 | 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | import java.util.Set; 12 | 13 | import static org.junit.Assert.assertEquals; 14 | 15 | public class 
MongoStorageTest { 16 | @Test 17 | public void testMap() throws Exception { 18 | MongoStorage ms = new MongoStorage(); 19 | BasicDBObjectBuilder builder = BasicDBObjectBuilder.start(); 20 | ResourceSchema schema = new ResourceSchema(Utils.getSchemaFromString("m:map[]")); 21 | 22 | Map val = new HashMap(); 23 | val.put("f1", 1); 24 | val.put("f2", "2"); 25 | 26 | ms.writeField(builder, schema.getFields()[0], val); 27 | 28 | DBObject out = builder.get(); 29 | 30 | Set outKeySet = out.keySet(); 31 | 32 | assertEquals(2, outKeySet.size()); 33 | assertEquals(1, out.get("f1")); 34 | assertEquals("2", out.get("f2")); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /pig/src/test/java/helpers/TOBAG.java: -------------------------------------------------------------------------------- 1 | package helpers; 2 | 3 | import org.apache.pig.EvalFunc; 4 | import org.apache.pig.data.BagFactory; 5 | import org.apache.pig.data.DataBag; 6 | import org.apache.pig.data.Tuple; 7 | import org.apache.pig.data.TupleFactory; 8 | 9 | import java.io.IOException; 10 | 11 | /* 12 | * TOBAG : converts a tuple to a bag of one-item tuples 13 | */ 14 | public class TOBAG extends EvalFunc { 15 | private TupleFactory mTupleFactory = TupleFactory.getInstance(); 16 | private BagFactory mBagFactory = BagFactory.getInstance(); 17 | 18 | public DataBag exec(final Tuple input) throws IOException { 19 | if (input == null || input.size() == 0) { 20 | return null; 21 | } 22 | 23 | try { 24 | DataBag output = mBagFactory.newDefaultBag(); 25 | Tuple nested = (Tuple) input.get(0); 26 | for (Object o : nested.getAll()) { 27 | output.add(mTupleFactory.newTuple(o)); 28 | } 29 | 30 | return output; 31 | } catch (Exception e) { 32 | return null; 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pig/src/test/resources/dump/test/persons_info.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/pig/src/test/resources/dump/test/persons_info.bson -------------------------------------------------------------------------------- /pig/src/test/resources/dump/test/persons_info.metadata.json: -------------------------------------------------------------------------------- 1 | { "indexes" : [ { "v" : 1, "key" : { "_id" : 1 }, "ns" : "test.persons_info", "name" : "_id_" } ] } -------------------------------------------------------------------------------- /pig/src/test/resources/pig/bson_schemaless.pig: -------------------------------------------------------------------------------- 1 | REGISTER @PROJECT_HOME@/core/build/libs/mongo-hadoop-core-@PROJECT_VERSION@.jar 2 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@.jar 3 | 4 | -- Load data from BSON, providing no schema. 5 | persons_info = 6 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 7 | USING com.mongodb.hadoop.pig.BSONLoader; 8 | 9 | -- Insert into MongoDB. 10 | STORE persons_info 11 | INTO 'mongodb://localhost:27017/mongo_hadoop.bson_schemaless' 12 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 13 | 14 | -- Get the results back from mongo. 
15 | results = LOAD 'mongodb://localhost:27017/mongo_hadoop.bson_schemaless' 16 | USING com.mongodb.hadoop.pig.MongoLoader('first, last, age'); 17 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/bson_test.pig: -------------------------------------------------------------------------------- 1 | REGISTER @PROJECT_HOME@/core/build/libs/mongo-hadoop-core-@PROJECT_VERSION@.jar 2 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@.jar 3 | 4 | -- Load data from BSON. 5 | persons_info = 6 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 7 | USING com.mongodb.hadoop.pig.BSONLoader; 8 | 9 | -- Make sure the BSON doesn't already exist. 10 | rmf file://@PIG_RESOURCES@/pig/test_output 11 | 12 | STORE persons_info 13 | INTO 'file://@PIG_RESOURCES@/pig/test_output' 14 | USING com.mongodb.hadoop.pig.BSONStorage; 15 | 16 | persons_read = 17 | LOAD 'file://@PIG_RESOURCES@/pig/test_output' 18 | USING com.mongodb.hadoop.pig.BSONLoader( 19 | 'id', 'first: chararray, last: chararray, age: double') 20 | AS (first: chararray, last: chararray, age: double); 21 | 22 | DUMP persons_read; 23 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/datestest.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.pigtests' 3 | USING com.mongodb.hadoop.pig.MongoLoader('today:datetime'); 4 | 5 | STORE data 6 | INTO 'mongodb://localhost:27017/mongo_hadoop.datetest' 7 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 8 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/ensure_index.pig: -------------------------------------------------------------------------------- 1 | -- Load data from BSON. 2 | persons_info = 3 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 4 | USING com.mongodb.hadoop.pig.BSONLoader( 5 | 'id', 'first: chararray, last: chararray, age: double') 6 | AS (first: chararray, last: chararray, age: double); 7 | 8 | -- Dump into mongo, ensure index on last name. 9 | STORE persons_info 10 | INTO 'mongodb://localhost:27017/mongo_hadoop.ensure_indexes' 11 | USING com.mongodb.hadoop.pig.MongoStorage( 12 | '{last: 1}, {}' 13 | ); 14 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/ensure_index_2.pig: -------------------------------------------------------------------------------- 1 | -- Load data from BSON. 2 | persons_info = 3 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 4 | USING com.mongodb.hadoop.pig.BSONLoader( 5 | 'id', 'first: chararray, last: chararray, age: double') 6 | AS (first: chararray, last: chararray, age: double); 7 | 8 | -- Dump into mongo, ensure index on first name. 
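-- As used in these test scripts, the MongoStorage argument pairs an index key
-- specification with an index options document, so '{first: 1}, {}' requests an
-- ascending index on "first" with default options.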
9 | STORE persons_info 10 | INTO 'mongodb://localhost:27017/mongo_hadoop.ensure_indexes' 11 | USING com.mongodb.hadoop.pig.MongoStorage( 12 | '{first: 1}, {}' 13 | ); -------------------------------------------------------------------------------- /pig/src/test/resources/pig/genminmaxkeys.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader; 4 | 5 | create_min_max_keys = 6 | FOREACH data 7 | GENERATE com.mongodb.hadoop.pig.udf.GenMaxKey() AS newMax, 8 | com.mongodb.hadoop.pig.udf.GenMinKey() AS newMin; 9 | 10 | STORE create_min_max_keys 11 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 12 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 13 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/oidtoseconds.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader; 4 | 5 | calc_seconds = 6 | FOREACH data 7 | GENERATE com.mongodb.hadoop.pig.udf.ToObjectId($0#'_id') AS id, 8 | com.mongodb.hadoop.pig.udf.ObjectIdToSeconds($0#'_id') AS seconds, 9 | -- Make sure we can nest UDFs. 10 | com.mongodb.hadoop.pig.udf.ObjectIdToSeconds( 11 | com.mongodb.hadoop.pig.udf.ToObjectId($0#'_id')) AS seconds2; 12 | 13 | STORE calc_seconds 14 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 15 | USING com.mongodb.hadoop.pig.MongoInsertStorage('id'); 16 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/pig_uuid.pig: -------------------------------------------------------------------------------- 1 | REGISTER @PROJECT_HOME@/core/build/libs/mongo-hadoop-core-@PROJECT_VERSION@.jar 2 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@.jar 3 | 4 | uuids = 5 | LOAD 'mongodb://localhost:27017/mongo_hadoop.uuid_test' 6 | USING com.mongodb.hadoop.pig.MongoLoader('uuid'); 7 | 8 | STORE uuids INTO 'test_results'; 9 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/projection.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.projection_test' 3 | USING com.mongodb.hadoop.pig.MongoLoader('id:chararray,i:int,d:[]', 'id'); 4 | 5 | -- Pig only pushes projections with subfields when the outer field is a map (d). 
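-- With the schema above ('id:chararray,i:int,d:[]'), the FOREACH below lets
-- MongoLoader push a projection down to MongoDB covering roughly
-- {"i": 1, "d.s": 1, "d.k": 1}, so only those fields need to be fetched
-- (the exact projection document shown here is an assumption).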
6 | projected = 7 | FOREACH data 8 | GENERATE $1 AS age, d#'s' AS name, d#'k' AS ssn; 9 | 10 | STORE projected INTO 'test_results'; 11 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/replace_mus.pig: -------------------------------------------------------------------------------- 1 | documents = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.replace_test' 3 | USING com.mongodb.hadoop.pig.MongoLoader('id:chararray,i:int', 'id'); 4 | 5 | increment_number = 6 | FOREACH documents 7 | GENERATE com.mongodb.hadoop.pig.udf.ToObjectId(id) AS id, 8 | i + 1 AS i; 9 | 10 | STORE increment_number 11 | INTO 'mongodb://localhost:27017/mongo_hadoop.replace_test' 12 | USING com.mongodb.hadoop.pig.MongoUpdateStorage( 13 | '{_id:"\$id"}', -- query 14 | '{i:"\$i"}', -- replacement 15 | 'id:bytearray,i:int', -- schema 16 | '', -- toIgnore (none) 17 | '{replace:true}' -- update options 18 | ); 19 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/schemaless.pig: -------------------------------------------------------------------------------- 1 | -- no schema provided 2 | data = LOAD 'mongodb://localhost:27017/mongo_hadoop.pig.schemaless' 3 | USING com.mongodb.hadoop.pig.MongoLoader; 4 | 5 | -- no schema or id provided 6 | STORE data INTO 'mongodb://localhost:27017/mongo_hadoop.pig.schemaless.out' 7 | USING com.mongodb.hadoop.pig.MongoInsertStorage; -------------------------------------------------------------------------------- /pig/src/test/resources/pig/tobinary.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader('binary:bytearray'); 4 | 5 | create_bson_binary = 6 | FOREACH data 7 | GENERATE com.mongodb.hadoop.pig.udf.ToBinary(binary) AS binary; 8 | 9 | STORE create_bson_binary 10 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 11 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 12 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/todbref.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader('dbref:[]'); 4 | 5 | create_dbref = 6 | FOREACH data 7 | GENERATE com.mongodb.hadoop.pig.udf.ToDBRef($0) AS dbref; 8 | 9 | STORE create_dbref 10 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 11 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 12 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/toobjectid.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader( 4 | 'id:chararray,oidBytes:bytearray', 'id'); 5 | 6 | create_objids = 7 | FOREACH data 8 | GENERATE com.mongodb.hadoop.pig.udf.ToObjectId(id) AS id, 9 | com.mongodb.hadoop.pig.udf.ToObjectId(oidBytes) AS otherid; 10 | 11 | STORE create_objids 12 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 13 | USING com.mongodb.hadoop.pig.MongoInsertStorage('id'); 14 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/udfschemaless.pig: 
-------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader; 4 | 5 | create_objids = 6 | FOREACH data 7 | GENERATE com.mongodb.hadoop.pig.udf.ToObjectId($0#'_id'); 8 | 9 | STORE create_objids 10 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 11 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 12 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/update_age_alabis_mus.pig: -------------------------------------------------------------------------------- 1 | REGISTER @PROJECT_HOME@/core/build/libs/mongo-hadoop-core-@PROJECT_VERSION@.jar 2 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@.jar 3 | 4 | -- Load data from BSON. 5 | persons_info = 6 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 7 | USING com.mongodb.hadoop.pig.BSONLoader( 8 | 'id', 'first: chararray, last: chararray, age: double') 9 | AS (first: chararray, last: chararray, age: double); 10 | 11 | -- Insert into MongoDB. 12 | STORE persons_info 13 | INTO 'mongodb://localhost:27017/mongo_hadoop.update_mus' 14 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 15 | 16 | -- Perform the update (everyone gets a little older). 17 | STORE persons_info INTO 'mongodb://localhost:27017/mongo_hadoop.update_mus' 18 | USING com.mongodb.hadoop.pig.MongoUpdateStorage( 19 | '{}', 20 | '{\$inc:{age:1}}', 21 | 'first, last, age', '', 22 | '{multi : true}'); 23 | 24 | -- Get the results back from mongo. 25 | results = 26 | LOAD 'mongodb://localhost:27017/mongo_hadoop.update_mus' 27 | USING com.mongodb.hadoop.pig.MongoLoader('first, last, age'); 28 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/update_simple_mus.pig: -------------------------------------------------------------------------------- 1 | REGISTER @PROJECT_HOME@/core/build/libs/mongo-hadoop-core-@PROJECT_VERSION@.jar 2 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@.jar 3 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@-tests.jar 4 | 5 | -- Load data from BSON. 6 | persons_info = 7 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 8 | USING com.mongodb.hadoop.pig.BSONLoader; 9 | 10 | -- Parse data from BSON into tuples so we can address fields when doing an 11 | -- update. Explicitly define the schema for the 'cars' bag so we can write it 12 | -- out later with MongoInsertStorage. 13 | to_store = 14 | FOREACH persons_info 15 | GENERATE 16 | $0#'first' as first, 17 | $0#'last' as last, 18 | helpers.TOBAG($0#'cars') as cars: bag{t: tuple(car: chararray)}; 19 | 20 | -- Insert into MongoDB. 21 | STORE to_store 22 | INTO 'mongodb://localhost:27017/mongo_hadoop.update_mus' 23 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 24 | 25 | -- Perform the update (everyone gets 2x their cars). 26 | STORE to_store 27 | INTO 'mongodb://localhost:27017/mongo_hadoop.update_mus' 28 | USING com.mongodb.hadoop.pig.MongoUpdateStorage( 29 | '{first:"\$first", last:"\$last"}', 30 | '{\$pushAll:{cars:"\$cars"}}'); 31 | 32 | -- Get the results back from mongo. 
33 | results = LOAD 'mongodb://localhost:27017/mongo_hadoop.update_mus' 34 | USING com.mongodb.hadoop.pig.MongoLoader('first, last, cars'); 35 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | include 'core', 'hive', 'pig', 'streaming', 'flume', 2 | 'spark', 'examples/treasury_yield', 'examples/enron', 3 | 'examples/enron/spark', 'examples/sensors', 4 | 'examples/shakespeare' 5 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/PySparkBSONFileInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark; 2 | 3 | import com.mongodb.hadoop.BSONFileInputFormat; 4 | import com.mongodb.spark.pickle.RegisterConstructors; 5 | import com.mongodb.spark.pickle.RegisterPickles; 6 | 7 | public class PySparkBSONFileInputFormat extends BSONFileInputFormat { 8 | private static final RegisterPickles PICKLES = new RegisterPickles(); 9 | private static final RegisterConstructors CONSTRUCTORS = 10 | new RegisterConstructors(); 11 | 12 | static { 13 | PICKLES.register(); 14 | CONSTRUCTORS.register(); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/PySparkBSONFileOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark; 2 | 3 | import com.mongodb.hadoop.BSONFileOutputFormat; 4 | import com.mongodb.spark.pickle.RegisterConstructors; 5 | import com.mongodb.spark.pickle.RegisterPickles; 6 | 7 | public class PySparkBSONFileOutputFormat 8 | extends BSONFileOutputFormat { 9 | private static final RegisterPickles PICKLES = new RegisterPickles(); 10 | private static final RegisterConstructors CONSTRUCTORS = 11 | new RegisterConstructors(); 12 | 13 | static { 14 | PICKLES.register(); 15 | CONSTRUCTORS.register(); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/PySparkMongoInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark; 2 | 3 | import com.mongodb.hadoop.MongoInputFormat; 4 | import com.mongodb.spark.pickle.RegisterConstructors; 5 | import com.mongodb.spark.pickle.RegisterPickles; 6 | 7 | /** 8 | * InputFormat that attaches custom Picklers and IObjectConstructors for 9 | * reading and writing BSON types with PyMongo. 
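 *
 * A rough PySpark usage sketch; the pymongo_spark helper module shipped under
 * spark/src/main/python is assumed to be on the Python path:
 * <pre>
 * import pymongo_spark
 * pymongo_spark.activate()
 * rdd = sc.mongoRDD('mongodb://localhost:27017/db.collection')
 * </pre>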
10 | */ 11 | public class PySparkMongoInputFormat extends MongoInputFormat { 12 | private static final RegisterPickles PICKLES = new RegisterPickles(); 13 | private static final RegisterConstructors CONSTRUCTORS = 14 | new RegisterConstructors(); 15 | 16 | static { 17 | PICKLES.register(); 18 | CONSTRUCTORS.register(); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/PySparkMongoOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark; 2 | 3 | import com.mongodb.hadoop.MongoOutputFormat; 4 | import com.mongodb.spark.pickle.RegisterConstructors; 5 | import com.mongodb.spark.pickle.RegisterPickles; 6 | 7 | public class PySparkMongoOutputFormat 8 | extends MongoOutputFormat { 9 | private static final RegisterPickles PICKLES = new RegisterPickles(); 10 | private static final RegisterConstructors CONSTRUCTORS = 11 | new RegisterConstructors(); 12 | 13 | static { 14 | PICKLES.register(); 15 | CONSTRUCTORS.register(); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/BSONValueBox.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.Writable; 5 | import org.bson.BasicBSONObject; 6 | import org.bson.Transformer; 7 | 8 | import java.io.DataInput; 9 | import java.io.DataOutput; 10 | import java.io.IOException; 11 | import java.io.Serializable; 12 | 13 | /** 14 | * Base class for containers that hold BSON values. 15 | * These containers are used when unpickling objects from Python. Generally, 16 | * these objects implement a "__setstate__" method that allows their internal 17 | * state to be set after they are created. 18 | * 19 | * @param the type of BSON value to be held. 20 | */ 21 | abstract class BSONValueBox implements Writable, Serializable { 22 | 23 | private static final Transformer TRANSFORMER = new Transformer() { 24 | @Override 25 | public Object transform(final Object objectToTransform) { 26 | if (!(objectToTransform instanceof BSONValueBox)) { 27 | throw new IllegalArgumentException( 28 | "Can only transform instances of BSONValueBox, not " 29 | + objectToTransform); 30 | } 31 | return ((BSONValueBox) objectToTransform).get(); 32 | } 33 | }; 34 | 35 | public abstract T get(); 36 | 37 | static Transformer getTransformer() { 38 | return TRANSFORMER; 39 | } 40 | 41 | /** 42 | * Inflate a BSONValueBox from a DataInput. 43 | * This method is here so that BSONValueBox implements Hadoop's Writable 44 | * interface, which is a requirement to use this type with Spark Hadoop 45 | * RDDs. However, you should never call this method directly. 46 | * 47 | * @param in the DataInput. 48 | * @throws IOException is always thrown when this method is called. 49 | */ 50 | @Override 51 | public void readFields(final DataInput in) throws IOException { 52 | throw new IOException("Cannot read fields into a BSONValueBox."); 53 | } 54 | 55 | /** 56 | * Write a BSONValueBox type to a DataOutput. 57 | * This method is here so that BSONValueBox implements Hadoop's Writable 58 | * interface, which is a requirement to use this type with Spark's Hadoop 59 | * RDDs. 
Calling this method will write into the output a document of the 60 | * form: 61 | * 62 | * {"value": (boxed value)} 63 | * 64 | * @param out the DataOutput 65 | * @throws IOException when there is an error writing to the DataOutput 66 | */ 67 | @Override 68 | public void write(final DataOutput out) throws IOException { 69 | (new BSONWritable(new BasicBSONObject("value", get()))).write(out); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/BinaryConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.types.Binary; 7 | 8 | import java.util.HashMap; 9 | 10 | public class BinaryConstructor implements IObjectConstructor { 11 | 12 | public static class BinaryBox extends BSONValueBox { 13 | private Binary value = null; 14 | static { 15 | BSON.addEncodingHook(BinaryBox.class, getTransformer()); 16 | } 17 | 18 | public BinaryBox(final String data, final int type) { 19 | byte[] byteData = new byte[data.length()]; 20 | for (int i = 0; i < byteData.length; ++i) { 21 | byteData[i] = (byte) data.charAt(i); 22 | } 23 | this.value = new Binary((byte) type, byteData); 24 | } 25 | 26 | // CHECKSTYLE:OFF 27 | public void __setstate__(final HashMap hm) { 28 | // State has already been set from constructor. 29 | } 30 | // CHECKSTYLE:ON 31 | 32 | @Override 33 | public Binary get() { 34 | return value; 35 | } 36 | } 37 | 38 | @Override 39 | public Object construct(final Object[] args) { 40 | if (args.length != 2) { 41 | throw new PickleException( 42 | "Binary constructor requires 2 arguments, not " + args.length); 43 | } 44 | if (!((args[0] instanceof String) && (args[1] instanceof Integer))) { 45 | throw new PickleException( 46 | "Binary constructor takes a String and an Integer, " 47 | + "not a " + args[0].getClass().getName() 48 | + " and a " + args[1].getClass().getName()); 49 | } 50 | return new BinaryBox((String) args[0], (Integer) args[1]); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/CalendarTransformer.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import org.bson.Transformer; 4 | 5 | import java.util.Calendar; 6 | import java.util.TimeZone; 7 | 8 | /** 9 | * Transformer that turns java.util.Calendar objects into java.util.Date 10 | * objects. 11 | * 12 | * This class is needed because Spark constructs pickled Python 13 | * datetime.datetime objects into java.util.GregorianCalendar instances instead 14 | * of java.util.Date objects. 
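 *
 * RegisterConstructors installs this transformer as a BSON encoding hook:
 * <pre>
 * BSON.addEncodingHook(java.util.GregorianCalendar.class, new CalendarTransformer());
 * </pre>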
15 | */ 16 | public class CalendarTransformer implements Transformer { 17 | @Override 18 | public Object transform(final Object objectToTransform) { 19 | Calendar calendar = (Calendar) objectToTransform; 20 | calendar.setTimeZone(TimeZone.getTimeZone("UTC")); 21 | return calendar.getTime(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/CodeConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.BasicBSONObject; 7 | import org.bson.types.Code; 8 | import org.bson.types.CodeWScope; 9 | 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | 13 | public class CodeConstructor implements IObjectConstructor { 14 | 15 | public static class CodeBox extends BSONValueBox { 16 | private String code; 17 | private Code value; 18 | static { 19 | BSON.addEncodingHook(CodeBox.class, getTransformer()); 20 | } 21 | 22 | public CodeBox(final String code) { 23 | this.code = code; 24 | } 25 | 26 | // CHECKSTYLE:OFF 27 | public void __setstate__(final HashMap state) { 28 | // CHECKSTYLE:ON 29 | Object scope = state.get("_Code__scope"); 30 | if (!(scope instanceof Map)) { 31 | throw new PickleException( 32 | "Expected a Map for key \"_Code__scope\", not a " 33 | + scope.getClass().getName()); 34 | } 35 | Map scopeMap = (Map) scope; 36 | if (!scopeMap.isEmpty()) { 37 | this.value = new CodeWScope(this.code, 38 | new BasicBSONObject(scopeMap)); 39 | } else { 40 | this.value = new Code(this.code); 41 | } 42 | } 43 | 44 | @Override 45 | public Code get() { 46 | return value; 47 | } 48 | } 49 | 50 | @Override 51 | public Object construct(final Object[] args) { 52 | if (args.length != 1) { 53 | throw new PickleException( 54 | "Code constructor requires 1 argument, not " + args.length); 55 | } 56 | if (!(args[0] instanceof String)) { 57 | throw new PickleException( 58 | "Code constructor requries a String, not a " 59 | + args[0].getClass().getName()); 60 | } 61 | return new CodeBox((String) args[0]); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/DBRefConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import com.mongodb.DBRef; 4 | import net.razorvine.pickle.IObjectConstructor; 5 | import net.razorvine.pickle.PickleException; 6 | import org.bson.BSON; 7 | 8 | import java.util.HashMap; 9 | 10 | public class DBRefConstructor implements IObjectConstructor { 11 | 12 | public static class DBRefBox extends BSONValueBox { 13 | private DBRef value; 14 | static { 15 | BSON.addEncodingHook(DBRefBox.class, getTransformer()); 16 | } 17 | 18 | // CHECKSTYLE:OFF 19 | public void __setstate__(final HashMap state) { 20 | // CHECKSTYLE:ON 21 | Object collection = state.get("_DBRef__collection"); 22 | if (!(collection instanceof String)) { 23 | throw new PickleException( 24 | "Expected a String for key \"_DBRef__colledction\", not a " 25 | + collection.getClass().getName()); 26 | } 27 | this.value = new DBRef( 28 | (String) collection, state.get("_DBRef__id")); 29 | } 30 | 31 | @Override 32 | public DBRef get() { 33 | return value; 34 | } 35 | } 36 | 37 | @Override 38 | public Object construct(final Object[] args) 
{ 39 | if (args.length != 0) { 40 | throw new PickleException( 41 | "DBRef constructor requires 0 arguments, not " + args.length); 42 | } 43 | return new DBRefBox(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/Int64Constructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | 7 | import java.util.HashMap; 8 | 9 | public class Int64Constructor implements IObjectConstructor { 10 | 11 | public static class Int64Box extends BSONValueBox { 12 | private Long value; 13 | static { 14 | BSON.addEncodingHook(Int64Box.class, getTransformer()); 15 | } 16 | 17 | public Int64Box(final Long value) { 18 | this.value = value; 19 | } 20 | 21 | // CHECKSTYLE:OFF 22 | public void __setstate__(HashMap state) { 23 | // No state to set. 24 | } 25 | // CHECKSTYLE:ON 26 | 27 | @Override 28 | public Long get() { 29 | return this.value; 30 | } 31 | } 32 | 33 | @Override 34 | public Object construct(final Object[] args) { 35 | if (args.length != 1) { 36 | throw new PickleException( 37 | "Int64 constructor requires 1 argument, not " + args.length); 38 | } 39 | if (!((args[0] instanceof Integer) || (args[0] instanceof Long))) { 40 | throw new PickleException( 41 | "Int64 constructor requires an Integer or Long, not a " 42 | + args[0].getClass().getName()); 43 | } 44 | return new Int64Box((Long) args[0]); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/MaxKeyConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.types.MaxKey; 7 | 8 | import java.util.HashMap; 9 | 10 | public class MaxKeyConstructor implements IObjectConstructor { 11 | 12 | public static class MaxKeyBox extends BSONValueBox { 13 | private static final MaxKey MAX_KEY = new MaxKey(); 14 | static { 15 | BSON.addEncodingHook(MaxKeyBox.class, getTransformer()); 16 | } 17 | 18 | // CHECKSTYLE:OFF 19 | public void __setstate__(final HashMap state) { 20 | // no state to set here. 
21 | } 22 | // CHECKSTYLE:ON 23 | 24 | @Override 25 | public MaxKey get() { 26 | return MAX_KEY; 27 | } 28 | } 29 | 30 | @Override 31 | public Object construct(final Object[] args) { 32 | if (args.length != 0) { 33 | throw new PickleException( 34 | "MaxKey constructor requires 0 arguments, not " + args.length); 35 | } 36 | return new MaxKeyBox(); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/MinKeyConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.types.MinKey; 7 | 8 | import java.util.HashMap; 9 | 10 | public class MinKeyConstructor implements IObjectConstructor { 11 | 12 | public static class MinKeyBox extends BSONValueBox { 13 | private static final MinKey MIN_KEY = new MinKey(); 14 | static { 15 | BSON.addEncodingHook(MinKeyBox.class, getTransformer()); 16 | } 17 | 18 | // CHECKSTYLE:OFF 19 | public void __setstate__(final HashMap state) { 20 | // no state to set here. 21 | } 22 | // CHECKSTYLE:ON 23 | 24 | @Override 25 | public MinKey get() { 26 | return MIN_KEY; 27 | } 28 | } 29 | 30 | @Override 31 | public Object construct(final Object[] args) { 32 | if (args.length != 0) { 33 | throw new PickleException( 34 | "MinKey constructor requires 0 arguments, not " + args.length); 35 | } 36 | return new MinKeyBox(); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/ObjectIdConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.types.ObjectId; 7 | 8 | public class ObjectIdConstructor implements IObjectConstructor { 9 | 10 | public static class ObjectIdBox extends BSONValueBox { 11 | private ObjectId oid; 12 | static { 13 | BSON.addEncodingHook(ObjectIdBox.class, getTransformer()); 14 | } 15 | 16 | // CHECKSTYLE:OFF 17 | public void __setstate__(final String state) { 18 | // CHECKSTYLE:ON 19 | byte[] oidBytes = new byte[state.length()]; 20 | for (int i = 0; i < state.length(); ++i) { 21 | oidBytes[i] = (byte) state.charAt(i); 22 | } 23 | this.oid = new ObjectId(oidBytes); 24 | } 25 | 26 | @Override 27 | public ObjectId get() { 28 | return this.oid; 29 | } 30 | } 31 | 32 | @Override 33 | public Object construct(final Object[] args) { 34 | if (args.length != 0) { 35 | throw new PickleException( 36 | "ObjectId constructor requires 0 arguments, not " + args.length); 37 | } 38 | return new ObjectIdBox(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/RegexConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | 7 | import java.util.HashMap; 8 | import java.util.regex.Pattern; 9 | 10 | public class RegexConstructor implements IObjectConstructor { 11 | 12 | public static class RegexBox extends BSONValueBox { 13 | private Pattern value; 14 
| static { 15 | BSON.addEncodingHook(RegexBox.class, getTransformer()); 16 | } 17 | 18 | private static int pythonFlagsToJavaFlags(final int pythonFlags) { 19 | int javaFlags = 0; 20 | if ((pythonFlags & 2) > 0) { 21 | javaFlags |= Pattern.CASE_INSENSITIVE; 22 | } 23 | if ((pythonFlags & 64) > 0) { 24 | javaFlags |= Pattern.COMMENTS; 25 | } 26 | if ((pythonFlags & 16) > 0) { 27 | javaFlags |= Pattern.DOTALL; 28 | } 29 | if ((pythonFlags & 8) > 0) { 30 | javaFlags |= Pattern.MULTILINE; 31 | } 32 | if ((pythonFlags & 32) > 0) { 33 | // 0x100 == Pattern.UNICODE_CHARACTER_CLASS in Java >= 7. 34 | javaFlags |= (Pattern.UNICODE_CASE | 0x100); 35 | } 36 | return javaFlags; 37 | } 38 | 39 | @SuppressWarnings("MagicConstant") 40 | // CHECKSTYLE:OFF 41 | public void __setstate__(final HashMap state) { 42 | // CHECKSTYLE:ON 43 | Object pattern = state.get("pattern"); 44 | Object flags = state.get("flags"); 45 | if (!((pattern instanceof String) && (flags instanceof Integer))) { 46 | throw new PickleException( 47 | "Expected a String for key \"pattern\" and an Integer for " 48 | + "key \"flags\", not a " + pattern.getClass().getName() 49 | + " and a " + flags.getClass().getName()); 50 | } 51 | value = Pattern.compile( 52 | (String) pattern, pythonFlagsToJavaFlags((Integer) flags)); 53 | } 54 | 55 | @Override 56 | public Pattern get() { 57 | return value; 58 | } 59 | } 60 | 61 | @Override 62 | public Object construct(final Object[] args) { 63 | if (args.length != 0) { 64 | throw new PickleException( 65 | "Regex constructor requires 0 arguments, not " + args.length); 66 | } 67 | return new RegexBox(); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/RegisterConstructors.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.Unpickler; 4 | import org.bson.BSON; 5 | 6 | public class RegisterConstructors { 7 | public void register() { 8 | Unpickler.registerConstructor("bson.binary", "Binary", 9 | new com.mongodb.spark.pickle.BinaryConstructor()); 10 | Unpickler.registerConstructor("bson.code", "Code", 11 | new com.mongodb.spark.pickle.CodeConstructor()); 12 | Unpickler.registerConstructor("bson.dbref", "DBRef", 13 | new com.mongodb.spark.pickle.DBRefConstructor()); 14 | Unpickler.registerConstructor("bson.int64", "Int64", 15 | new com.mongodb.spark.pickle.Int64Constructor()); 16 | Unpickler.registerConstructor("bson.max_key", "MaxKey", 17 | new com.mongodb.spark.pickle.MaxKeyConstructor()); 18 | Unpickler.registerConstructor("bson.min_key", "MinKey", 19 | new com.mongodb.spark.pickle.MinKeyConstructor()); 20 | Unpickler.registerConstructor("bson.timestamp", "Timestamp", 21 | new com.mongodb.spark.pickle.TimestampConstructor()); 22 | Unpickler.registerConstructor("bson.regex", "Regex", 23 | new com.mongodb.spark.pickle.RegexConstructor()); 24 | Unpickler.registerConstructor("bson.objectid", "ObjectId", 25 | new com.mongodb.spark.pickle.ObjectIdConstructor()); 26 | 27 | BSON.addEncodingHook( 28 | java.util.GregorianCalendar.class, 29 | new CalendarTransformer()); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/RegisterPickles.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.Pickler; 4 | 5 | public class 
RegisterPickles { 6 | private static final BSONPickler PICKLER = new BSONPickler(); 7 | 8 | public void register() { 9 | Pickler.registerCustomPickler(org.bson.types.ObjectId.class, PICKLER); 10 | Pickler.registerCustomPickler(org.bson.types.Binary.class, PICKLER); 11 | Pickler.registerCustomPickler(org.bson.types.Code.class, PICKLER); 12 | Pickler.registerCustomPickler(org.bson.types.CodeWScope.class, PICKLER); 13 | Pickler.registerCustomPickler( 14 | org.bson.types.CodeWithScope.class, PICKLER); 15 | Pickler.registerCustomPickler(org.bson.types.MaxKey.class, PICKLER); 16 | Pickler.registerCustomPickler(org.bson.types.MinKey.class, PICKLER); 17 | Pickler.registerCustomPickler( 18 | org.bson.types.BSONTimestamp.class, PICKLER); 19 | Pickler.registerCustomPickler(com.mongodb.DBRef.class, PICKLER); 20 | Pickler.registerCustomPickler(java.util.regex.Pattern.class, PICKLER); 21 | Pickler.registerCustomPickler(java.util.Date.class, PICKLER); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/TimestampConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.types.BSONTimestamp; 7 | 8 | import java.util.HashMap; 9 | 10 | public class TimestampConstructor implements IObjectConstructor { 11 | 12 | public static class TimestampBox extends BSONValueBox { 13 | private BSONTimestamp value; 14 | static { 15 | BSON.addEncodingHook(TimestampBox.class, getTransformer()); 16 | } 17 | 18 | // CHECKSTYLE:OFF 19 | public void __setstate__(final HashMap state) { 20 | // CHECKSTYLE:ON 21 | Object time = state.get("_Timestamp__time"); 22 | Object inc = state.get("_Timestamp__inc"); 23 | if (!((time instanceof Integer) && (inc instanceof Integer))) { 24 | throw new PickleException( 25 | "Excpected Integer for keys \"_Timestamp__time\" and " 26 | + "\"Timestamp__inc\", not a " 27 | + time.getClass().getName() + " and a " 28 | + inc.getClass().getName()); 29 | } 30 | value = new BSONTimestamp((Integer) time, (Integer) inc); 31 | } 32 | 33 | public BSONTimestamp get() { 34 | return value; 35 | } 36 | } 37 | 38 | @Override 39 | public Object construct(final Object[] args) { 40 | if (args.length != 0) { 41 | throw new PickleException( 42 | "Timestamp constructor requires 0 arguments, not " + args.length); 43 | } 44 | return new TimestampBox(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark/src/main/python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Copyright 2015 MongoDB, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
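# Typical usage (standard setuptools workflow; a sketch, not project-specific docs):
#   python setup.py install      # install pymongo-spark locally
#   python setup.py bdist_egg    # build an egg to ship via spark-submit --py-files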
15 | 16 | _classifiers = """ 17 | Development Status :: 4 - Beta 18 | Intended Audience :: Developers 19 | License :: OSI Approved :: Apache Software License 20 | Operating System :: OS Independent 21 | Programming Language :: Python :: 2.6 22 | Programming Language :: Python :: 2.7 23 | Topic :: Database :: Front-Ends 24 | Topic :: Scientfic/Engineering :: Interface Engine/Protocol Translator 25 | """ 26 | 27 | try: 28 | from setuptools import setup, find_packages 29 | except ImportError: 30 | from ez_setup import use_setuptools 31 | use_setuptools() 32 | from setuptools import setup, find_packages 33 | 34 | extra_opts = {} 35 | try: 36 | with open('README.rst', 'r') as fd: 37 | extra_opts['long_description'] = fd.read() 38 | except IOError: 39 | pass # Install without README.rst 40 | 41 | setup( 42 | name='pymongo-spark', 43 | version='0.1.dev0', 44 | author='MongoDB, Inc.', 45 | author_email='mongodb-user@googlegroups.com', 46 | description='Utilities for using Spark with PyMongo', 47 | keywords=['spark', 'mongodb', 'mongo', 'hadoop', 'pymongo'], 48 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 49 | platforms=['any'], 50 | url='https://github.com/mongodb/mongo-hadoop', 51 | install_requires=['pymongo>=3.0.3'], 52 | packages=find_packages(exclude=('test',)), 53 | classifiers=_classifiers.splitlines(), 54 | test_suite='test', 55 | **extra_opts 56 | ) 57 | -------------------------------------------------------------------------------- /spark/src/main/scala/com/mongodb/spark/pickle/NoopConverter.scala: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle 2 | 3 | import org.apache.spark.api.python.Converter 4 | 5 | 6 | class NoopConverter extends Converter[Any, Any] { 7 | override def convert(obj: Any): Any = { obj } 8 | } 9 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_map.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | var node_mongo_hadoop = require('node_mongo_hadoop') 4 | 5 | 6 | var trimString = function(str){ 7 | return String(str).replace(/^\s+|\s+$/g, ''); 8 | } 9 | 10 | function mapFunc(doc, callback){ 11 | if(doc.headers && doc.headers.From && doc.headers.To){ 12 | var from_field = doc['headers']['From'] 13 | var to_field = doc['headers']['To'] 14 | var recips = [] 15 | to_field.split(',').forEach(function(to){ 16 | callback( {'_id': {'f':from_field, 't':trimString(to)}, 'count': 1} ) 17 | }); 18 | } 19 | } 20 | 21 | node_mongo_hadoop.MapBSONStream(mapFunc); 22 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONMapper 7 | 8 | def mapper(documents): 9 | i = 0 10 | for doc in documents: 11 | i = i + 1 12 | if 'headers' in doc and 'To' in doc['headers'] and 'From' in doc['headers']: 13 | from_field = doc['headers']['From'] 14 | to_field = doc['headers']['To'] 15 | recips = [x.strip() for x in to_field.split(',')] 16 | for r in recips: 17 | yield {'_id': {'f':from_field, 't':r}, 'count': 1} 18 | 19 | BSONMapper(mapper) 20 | print >> sys.stderr, "Done Mapping." 
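# For example, a message whose headers include
#   {'From': 'a@example.com', 'To': 'b@example.com, c@example.com'}
# yields two intermediate documents (addresses are illustrative):
#   {'_id': {'f': 'a@example.com', 't': 'b@example.com'}, 'count': 1}
#   {'_id': {'f': 'a@example.com', 't': 'c@example.com'}, 'count': 1}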
21 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_map.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.map do |document| 5 | if document.has_key?('headers') 6 | headers = document['headers'] 7 | if ['To', 'From'].all? { |header| headers.has_key? (header) } 8 | to_field = headers['To'] 9 | from_field = headers['From'] 10 | recipients = to_field.split(',').map { |recipient| recipient.strip } 11 | recipients.map { |recipient| {:_id => {:f => from_field, :t => recipient}, :count => 1} } 12 | end 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_reduce.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | var node_mongo_hadoop = require('node_mongo_hadoop') 4 | 5 | function reduceFunc(key, values, callback){ 6 | var count = 0; 7 | values.forEach(function(v){ 8 | count += v.count 9 | }); 10 | callback( {'_id':key, 'count':count } ); 11 | } 12 | 13 | node_mongo_hadoop.ReduceBSONStream(reduceFunc); 14 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONReducer 7 | 8 | def reducer(key, values): 9 | print >> sys.stderr, "Processing from/to %s" % str(key) 10 | _count = 0 11 | for v in values: 12 | _count += v['count'] 13 | return {'_id': key, 'count': _count} 14 | 15 | BSONReducer(reducer) 16 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_reduce.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.reduce do |key, values| 5 | count = values.reduce { |sum, current| sum += current['count'] } 6 | 7 | { :_id => key, :count => count } 8 | end 9 | -------------------------------------------------------------------------------- /streaming/examples/enron/run_enron.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/enron/enron_map.py -reducer examples/enron/enron_reduce.py -inputURI mongodb://127.0.0.1/enron_mail.messages -outputURI mongodb://127.0.0.1/enron_mail.output -file examples/enron/enron_map.py -file examples/enron/enron_reduce.py 2 | -------------------------------------------------------------------------------- /streaming/examples/enron/run_enron_js.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/enron/enron_map.js -reducer examples/enron/enron_reduce.js -inputURI mongodb://127.0.0.1/enron_mail.messages -outputURI mongodb://127.0.0.1/enron_mail.output -file examples/enron/enron_map.js -file examples/enron/enron_reduce.js 2 | -------------------------------------------------------------------------------- /streaming/examples/enron/run_enron_rb.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/enron/enron_map.rb -reducer 
examples/enron/enron_reduce.rb -inputURI mongodb://127.0.0.1/enron_mail.messages -outputURI mongodb://127.0.0.1/enron_mail.output -file examples/enron/enron_map.rb -file examples/enron/enron_reduce.rb 2 | -------------------------------------------------------------------------------- /streaming/examples/treasury/mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | sys.path.append(".") 6 | 7 | try: 8 | from pymongo_hadoop import BSONMapper 9 | import pymongo_hadoop 10 | except: 11 | print >> sys.stderr, "pymongo_hadoop is not installed or in path - will try to import from source tree." 12 | here = os.path.abspath(__file__) 13 | module_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(here))), 14 | 'language_support', 15 | 'python') 16 | sys.path.append(module_dir) 17 | print >> sys.stderr, sys.path 18 | from pymongo_hadoop import BSONMapper 19 | 20 | def mapper(documents): 21 | print >> sys.stderr, "Running python mapper." 22 | 23 | for doc in documents: 24 | yield {'_id': doc['_id'].year, 'bc10Year': doc['bc10Year']} 25 | 26 | print >> sys.stderr, "Python mapper finished." 27 | 28 | BSONMapper(mapper) 29 | -------------------------------------------------------------------------------- /streaming/examples/treasury/mapper.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.map do |document| 5 | { :_id => document['_id'].year, :bc10Year => document['bc10Year'] } 6 | end 7 | -------------------------------------------------------------------------------- /streaming/examples/treasury/mapper_kv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import KeyValueBSONMapper 7 | 8 | def mapper(entries): 9 | for (k, v) in entries: 10 | yield (k.year, v['bc10Year']) 11 | 12 | KeyValueBSONMapper(mapper) 13 | print >> sys.stderr, "Done Mapping."
14 | -------------------------------------------------------------------------------- /streaming/examples/treasury/mapper_kv.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.kvmap do |key, value| 5 | [key.year, value['bc10Year']] 6 | end 7 | -------------------------------------------------------------------------------- /streaming/examples/treasury/reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | sys.path.append(".") 6 | 7 | try: 8 | from pymongo_hadoop import BSONReducer 9 | import pymongo_hadoop 10 | except: 11 | here = os.path.abspath(__file__) 12 | module_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(here))), 13 | 'language_support', 14 | 'python') 15 | sys.path.append(module_dir) 16 | from pymongo_hadoop import BSONReducer 17 | 18 | def reducer(key, values): 19 | print >> sys.stderr, "Processing Key: %s" % key 20 | _count = _sum = 0 21 | for v in values: 22 | _count += 1 23 | _sum += v['bc10Year'] 24 | return {'_id': key, 'avg': _sum / _count, 25 | 'count': _count, 'sum': _sum } 26 | 27 | BSONReducer(reducer) 28 | -------------------------------------------------------------------------------- /streaming/examples/treasury/reducer.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.reduce do |key, values| 5 | count = sum = 0 6 | 7 | values.each do |value| 8 | count += 1 9 | sum += value['bc10Year'] 10 | end 11 | 12 | { :_id => key, :average => sum / count } 13 | end 14 | -------------------------------------------------------------------------------- /streaming/examples/treasury/reducer_kv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import KeyValueBSONReducer, KeyValueBSONInput 7 | 8 | def reducer(key, values): 9 | print >> sys.stderr, "Processing Key: %s" % key 10 | _count = _sum = 0 11 | for v in values: 12 | _count += 1 13 | _sum += v['value'] 14 | return (key, _sum / _count) 15 | 16 | 17 | KeyValueBSONReducer(reducer, input_fh=KeyValueBSONInput()) 18 | -------------------------------------------------------------------------------- /streaming/examples/treasury/reducer_kv.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.kvreduce do |key, values| 5 | count = sum = 0 6 | 7 | values.each do |value| 8 | count += 1 9 | sum += value['value'] 10 | end 11 | 12 | [key, sum / count] 13 | end 14 | -------------------------------------------------------------------------------- /streaming/examples/treasury/run_treas_kv_py.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/treasury/mapper_kv.py -reducer examples/treasury/reducer_kv.py -inputformat com.mongodb.hadoop.mapred.MongoInputFormat -outputformat com.mongodb.hadoop.mapred.MongoOutputFormat -inputURI mongodb://127.0.0.1/demo.yield_historical.in -outputURI mongodb://127.0.0.1/demo.yield_historical.streaming.kv.out -------------------------------------------------------------------------------- /streaming/examples/treasury/run_treas_kv_rb.sh: 
-------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/treasury/mapper_kv.rb -reducer examples/treasury/reducer_kv.rb -inputformat com.mongodb.hadoop.mapred.MongoInputFormat -outputformat com.mongodb.hadoop.mapred.MongoOutputFormat -inputURI mongodb://127.0.0.1/demo.yield_historical.in -outputURI mongodb://127.0.0.1/demo.yield_historical.streaming.kv.out 2 | -------------------------------------------------------------------------------- /streaming/examples/treasury/run_treas_py.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/treasury/mapper.py -reducer examples/treasury/reducer.py -inputformat com.mongodb.hadoop.mapred.MongoInputFormat -outputformat com.mongodb.hadoop.mapred.MongoOutputFormat -inputURI mongodb://127.0.0.1/demo.yield_historical.in -outputURI mongodb://127.0.0.1/demo.yield_historical.streaming.out 2 | -------------------------------------------------------------------------------- /streaming/examples/treasury/run_treas_rb.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/treasury/mapper.rb -reducer examples/treasury/reducer.rb -inputURI mongodb://127.0.0.1/demo.yield_historical.in -outputURI mongodb://127.0.0.1/demo.yield_historical.streaming.out 2 | -------------------------------------------------------------------------------- /streaming/examples/twitter/README.md: -------------------------------------------------------------------------------- 1 | Importing Live Twitter Data, you'll need a twitter login and password: 2 | 3 | curl https://stream.twitter.com/1/statuses/sample.json -u: | mongoimport -d test -c live 4 | 5 | This will continue streaming until you ^C it. 6 | -------------------------------------------------------------------------------- /streaming/examples/twitter/run_twit_py.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/twitter/twit_map.py -reducer examples/twitter/twit_reduce.py -inputURI mongodb://127.0.0.1/test.live -outputURI mongodb://127.0.0.1/test.twit_reduction -file examples/twitter/twit_map.py -file examples/twitter/twit_reduce.py 2 | -------------------------------------------------------------------------------- /streaming/examples/twitter/run_twit_rb.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/twitter/twit_map.rb -reducer examples/twitter/twit_reduce.rb -inputURI mongodb://127.0.0.1/test.live -outputURI mongodb://127.0.0.1/test.twit_reduction 2 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_hashtag_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONMapper 7 | 8 | def mapper(documents): 9 | for doc in documents: 10 | for hashtag in doc['entities']['hashtags']: 11 | yield {'_id': hashtag['text'], 'count': 1} 12 | 13 | BSONMapper(mapper) 14 | print >> sys.stderr, "Done Mapping." 
15 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_hashtag_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONReducer 7 | 8 | def reducer(key, values): 9 | print >> sys.stderr, "Processing Hashtag %s" % key.encode('utf8') 10 | _count = 0 11 | for v in values: 12 | _count += v['count'] 13 | return {'_id': key.encode('utf8'), 'count': _count} 14 | 15 | BSONReducer(reducer) 16 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONMapper 7 | 8 | def mapper(documents): 9 | for doc in documents: 10 | if 'user' in doc: 11 | yield {'_id': doc['user']['time_zone'], 'count': 1} 12 | 13 | BSONMapper(mapper) 14 | print >> sys.stderr, "Done Mapping." 15 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_map.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.map do |document| 5 | { :_id => document['user']['time_zone'], :count => 1 } 6 | end 7 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONReducer 7 | 8 | def reducer(key, values): 9 | print >> sys.stderr, "Processing Timezone %s" % key 10 | _count = 0 11 | for v in values: 12 | _count += v['count'] 13 | return {'_id': key, 'count': _count} 14 | 15 | BSONReducer(reducer) 16 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_reduce.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | # Function that takes key and array of values, iterates over all of the values, 5 | # and returns a single document with the reduced data (summary) for that key. 
6 | 7 | MongoHadoop.reduce do |key, values| 8 | count = 0 9 | 10 | values.each do |value| 11 | count += value['count'] 12 | end 13 | 14 | { :_id => key, :count => count } 15 | end 16 | -------------------------------------------------------------------------------- /streaming/language_support/js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "author": "Mike O'Brien (http://mpobrien.net)", 3 | "name": "node_mongo_hadoop", 4 | "description": "Bindings to connect to the MongoDB adapter for writing Map/Reduce jobs in Javascript with Hadoop Streaming.", 5 | "version": "0.0.2", 6 | "homepage": "api.mongodb.org/hadoop", 7 | "repository": { 8 | "type": "git", 9 | "url": "git@github.com:mpobrien/node_mongo_hadoop.git" 10 | }, 11 | "main": "./node_mongo_hadoop", 12 | "dependencies": { 13 | "mongodb": "*", 14 | "buffers": "*", 15 | "underscore": "*" 16 | }, 17 | "devDependencies": {}, 18 | "optionalDependencies": {}, 19 | "engines": { 20 | "node": "*" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /streaming/language_support/python/README.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | pymongo_hadoop 3 | ============== 4 | :Info: See `documentation `_ for more information. See `github `_ for the latest source. 5 | :Author: Brendan McAdams 6 | :Maintainer: Mike O'Brien 7 | 8 | About 9 | ===== 10 | 11 | The pymongo_hadoop module contains basic classes for using python 12 | scripts for Hadoop Streaming jobs with the mongo-hadoop adapter. 13 | 14 | Issues / Questions / Feedback 15 | ============================= 16 | 17 | Any issues with, questions about, or feedback for PyMongo should be 18 | sent to the mongodb-user list on Google Groups. For confirmed issues 19 | or feature requests, open a case on `jira 20 | `_. Please do not e-mail any of the 21 | developers directly with issues or questions - you're more likely to 22 | get an answer on the list. 23 | 24 | Installation 25 | ============ 26 | 27 | If you have `setuptools 28 | `_ installed you 29 | should be able to do **easy_install pymongo_hadoop** to install 30 | the module. Otherwise you can download the project source and do **python 31 | setup.py install** to install. 
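Example
=======

A minimal mapper sketch to show the shape of a streaming script (the
``status`` field below is hypothetical; substitute any field from your
own collection)::

    #!/usr/bin/env python
    import sys

    from pymongo_hadoop import BSONMapper

    def mapper(documents):
        # Emit one counting document per input document; the streaming
        # framework groups records on the '_id' field.
        for doc in documents:
            yield {'_id': doc.get('status', 'unknown'), 'count': 1}

    BSONMapper(mapper)
    print >> sys.stderr, "Done Mapping."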
32 | 33 | -------------------------------------------------------------------------------- /streaming/language_support/python/pymongo_hadoop/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from input import BSONInput, KeyValueBSONInput 4 | from output import BSONOutput, KeyValueBSONOutput 5 | from reducer import BSONReducer, BSONReducerInput 6 | from reducer import KeyValueBSONReducer, KeyValueBSONReducerInput 7 | from mapper import BSONMapper, KeyValueBSONMapper 8 | 9 | __all__ = ['BSONInput', 'BSONOutput', 10 | 'KeyValueBSONOutput', 'KeyValueBSONInput', 11 | 'BSONReducerInput', 'BSONReducer', 12 | 'KeyValueBSONReducer', 'KeyValueBSONReducerInput'] 13 | 14 | def dump_bits(bits): 15 | for bit in bits: 16 | print >> sys.stderr, "\t * Bit: %s Ord: %d" % (hex(ord(bit)), ord(bit)) 17 | 18 | -------------------------------------------------------------------------------- /streaming/language_support/python/pymongo_hadoop/input.py: -------------------------------------------------------------------------------- 1 | from bson import InvalidBSON, BSON 2 | from bson.codec_options import CodecOptions 3 | 4 | import sys 5 | import struct 6 | 7 | STREAMING_CODEC_OPTIONS = CodecOptions(tz_aware=True) 8 | 9 | 10 | class BSONInput(object): 11 | """Custom file class for decoding streaming BSON, 12 | based upon the Dumbo & "typedbytes" modules at 13 | https://github.com/klbostee/dumbo & 14 | https://github.com/klbostee/typedbytes 15 | """ 16 | 17 | def __init__(self, fh=sys.stdin, unicode_errors='strict'): 18 | self.fh = fh 19 | self.unicode_errors = unicode_errors 20 | self.eof = False 21 | 22 | def _read(self): 23 | try: 24 | size_bits = self.fh.read(4) 25 | size = struct.unpack("<i", size_bits)[0] 26 | data = size_bits + self.fh.read(size - 4) 27 | if len(data) != size: 28 | raise InvalidBSON("Expected %d bytes, read %d " 29 | "(truncated BSON stream?)" % (size, len(data))) 30 | doc = BSON(data).decode(codec_options=STREAMING_CODEC_OPTIONS) 31 | return doc 32 | except (struct.error, InvalidBSON), e: 33 | # A bad length header or truncated document ends the stream. 34 | print >> sys.stderr, "Parsing Length record failed: %s" % e 35 | self.eof = True 36 | raise StopIteration(e) 37 | 38 | def read(self): 39 | try: 40 | return self._read() 41 | except StopIteration, e: 42 | print >> sys.stderr, "Iteration Failure: %s" % e 43 | return None 44 | 45 | def _reads(self): 46 | r = self._read 47 | while 1: 48 | yield r() 49 | 50 | def close(self): 51 | self.fh.close() 52 | 53 | __iter__ = reads = _reads 54 | 55 | class KeyValueBSONInput(BSONInput): 56 | def read(self): 57 | try: 58 | doc = self._read() 59 | except StopIteration, e: 60 | print >> sys.stderr, "Key/Value Input iteration failed/stopped: %s" % e 61 | return None 62 | if '_id' in doc: 63 | return doc['_id'], doc 64 | else: 65 | raise struct.error("Cannot read Key '_id' from Input Doc '%s'" % doc) 66 | 67 | def reads(self): 68 | it = self._reads() 69 | n = it.next 70 | while 1: 71 | doc = n() 72 | if '_id' in doc: 73 | yield doc['_id'], doc 74 | else: 75 | raise struct.error("Cannot read Key '_id' from Input Doc '%s'" % doc) 76 | 77 | __iter__ = reads 78 | -------------------------------------------------------------------------------- /streaming/language_support/python/pymongo_hadoop/mapper.py: -------------------------------------------------------------------------------- 1 | from input import BSONInput, KeyValueBSONInput 2 | from output import BSONOutput, KeyValueBSONOutput 3 | 4 | class BSONMapper(object): 5 | """Wraps BSONInput to allow writing mapper functions 6 | as generators. 7 | """ 8 | 9 | def __init__(self, target, **kwargs): 10 | """`target` should be a generator function that accepts a 11 | single argument which will be an instance of :class:`BSONInput`, 12 | and which yields dictionaries to be emitted.
The yielded 13 | dictionaries should conform to the format expected by 14 | :class:`BSONInput` (i.e. they should have the key defined 15 | in a field named `_id`). 16 | 17 | Keyword arguments are passed directly to the underlying 18 | :class:`BSONInput`. 19 | """ 20 | 21 | output = BSONOutput() 22 | input = BSONInput(**kwargs) 23 | 24 | generator = target(input) 25 | for mapped in generator: 26 | output.write(mapped) 27 | 28 | class KeyValueBSONMapper(object): 29 | """Wraps KeyValueBSONInput to allow writing mapper functions 30 | as generators. 31 | """ 32 | 33 | def __init__(self, target, **kwargs): 34 | """`target` should be a generator function that accepts a 35 | single argument which will be an instance of 36 | :class:`KeyValueBSONInput`, and which yields tuples of 37 | (key, value) to be emitted. 38 | 39 | Keyword arguments are passed directly to the underlying 40 | :class:`KeyValueBSONInput`. 41 | """ 42 | 43 | output = KeyValueBSONOutput() 44 | input = KeyValueBSONInput(**kwargs) 45 | 46 | generator = target(input) 47 | for key_and_value in generator: 48 | output.write(key_and_value) 49 | 50 | -------------------------------------------------------------------------------- /streaming/language_support/python/setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup, Feature 3 | except ImportError: 4 | from distribute_setup import use_setuptools 5 | use_setuptools() 6 | from setuptools import setup, Feature 7 | 8 | f = open("README.rst") 9 | try: 10 | try: 11 | readme_content = f.read() 12 | except: 13 | readme_content = "" 14 | finally: 15 | f.close() 16 | 17 | 18 | setup( 19 | name='pymongo_hadoop', 20 | version='1.1.0', 21 | maintainer="Michael O'Brien", 22 | maintainer_email='mikeo@10gen.com', 23 | long_description=readme_content, 24 | packages=['pymongo_hadoop'], 25 | url='https://github.com/mongodb/mongo-hadoop', 26 | keywords=["mongo", "mongodb", "hadoop", "hdfs", "streaming"], 27 | install_requires=[ 28 | 'pymongo' 29 | ], 30 | ) 31 | -------------------------------------------------------------------------------- /streaming/language_support/python/test_install.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | try: 4 | import pymongo 5 | from bson import _elements_to_dict, InvalidBSON 6 | except: 7 | raise Exception("Cannot find a valid pymongo installation.") 8 | 9 | try: 10 | from pymongo_hadoop import BSONInput 11 | except: 12 | raise Exception("Cannot find a valid pymongo_hadoop installation.") 13 | 14 | print "*** Everything looks OK. All required modules were found." 
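# Usage sketch for the two mapper wrappers documented above (illustrative
# only; the 'category' field is a hypothetical example, not something this
# package requires). BSONMapper drives a generator that yields whole
# documents keyed by '_id'; KeyValueBSONMapper drives one that yields
# (key, value) tuples.
from pymongo_hadoop import BSONMapper, KeyValueBSONMapper

def doc_mapper(documents):
    # BSONMapper target: one dict per emitted record, with the key in '_id'.
    for doc in documents:
        yield {'_id': doc.get('category'), 'count': 1}

def kv_mapper(entries):
    # KeyValueBSONMapper target: (key, value) tuples instead of dicts.
    for key, value in entries:
        yield (key, value.get('count', 1))

# In a real job script, hand exactly one target to its matching wrapper:
#   BSONMapper(doc_mapper)   or   KeyValueBSONMapper(kv_mapper)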
15 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/bin/mongo-hadoop: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require "thor" 3 | 4 | class MongoHadoop < Thor 5 | include Thor::Actions 6 | 7 | def self.source_root 8 | File.dirname(__FILE__) 9 | end 10 | 11 | desc "create PROJECT_NAME", "Create a new Mongo Hadoop project" 12 | method_option :assembly, :type => :string, :default => "mongo-hadoop-streaming-assembly*.jar" 13 | method_option :uri, :type => :string, :aliases => "-h", :default => "mongodb://127.0.0.1" 14 | method_option :database, :type => :string, :aliases => "-d", :default => "mongo_hadoop" 15 | method_option :in, :type => :string, :aliases => "-i", :default => "project.in" 16 | method_option :out, :type => :string, :aliases => "-o", :default => "project.out" 17 | 18 | def create(name) 19 | @name = name 20 | @streaming_assembly = options[:assembly] 21 | 22 | base_uri = options[:uri] 23 | db = options[:database] 24 | @input_uri = "#{base_uri}/#{db}.#{options[:in]}" 25 | @output_uri = "#{base_uri}/#{db}.#{options[:out]}" 26 | 27 | create_mapper 28 | create_reducer 29 | create_runner 30 | end 31 | 32 | private 33 | 34 | def create_mapper 35 | template '../templates/mapper.tt', "#{@name}/mapper.rb" 36 | chmod "#{@name}/mapper.rb", 0766, :verbose => false 37 | end 38 | 39 | def create_reducer 40 | template '../templates/reducer.tt', "#{@name}/reducer.rb" 41 | chmod "#{@name}/reducer.rb", 0766, :verbose => false 42 | end 43 | 44 | def create_runner 45 | template '../templates/runner.tt', "#{@name}/run.sh" 46 | chmod "#{@name}/run.sh", 0766, :verbose => false 47 | end 48 | end 49 | 50 | MongoHadoop.start -------------------------------------------------------------------------------- /streaming/language_support/ruby/lib/mongo-hadoop.rb: -------------------------------------------------------------------------------- 1 | require 'mongo-hadoop/mapper' 2 | require 'mongo-hadoop/reducer' -------------------------------------------------------------------------------- /streaming/language_support/ruby/lib/mongo-hadoop/input.rb: -------------------------------------------------------------------------------- 1 | require 'bson' 2 | 3 | class BSONInput 4 | include Enumerable 5 | 6 | def initialize(stream=nil) 7 | @stream = stream || $stdin 8 | end 9 | 10 | def read 11 | begin 12 | BSON.read_bson_document(@stream) 13 | rescue NoMethodError 14 | nil 15 | end 16 | end 17 | 18 | def each 19 | while(doc = read) 20 | yield doc 21 | end 22 | end 23 | end 24 | 25 | class BSONKeyValueInput < BSONInput 26 | def each 27 | while(doc = read) 28 | yield doc['_id'], doc 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/lib/mongo-hadoop/mapper.rb: -------------------------------------------------------------------------------- 1 | require 'mongo-hadoop/input' 2 | require 'mongo-hadoop/output' 3 | 4 | module MongoHadoop 5 | def map 6 | input = BSONInput.new 7 | output = BSONOutput.new 8 | 9 | input.each do |doc| 10 | mapped = yield doc 11 | mapped = [mapped] unless mapped.is_a?(Array) 12 | 13 | mapped.each do |mapped| 14 | output.write mapped if mapped 15 | end 16 | end 17 | end 18 | 19 | def kvmap 20 | kvinput = BSONKeyValueInput.new 21 | kvoutput = BSONKeyValueOutput.new 22 | 23 | kvinput.each do |key, value| 24 | mapped = yield key, value 25 | mapped = [mapped] unless 
mapped.is_a(Array) 26 | 27 | mapped.each do |mapped| 28 | kvoutput.write mapped if mapped 29 | end 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/lib/mongo-hadoop/output.rb: -------------------------------------------------------------------------------- 1 | require 'bson' 2 | 3 | class BSONOutput 4 | def initialize(stream=nil) 5 | @stream = stream || $stdout 6 | end 7 | 8 | def write(doc) 9 | bson_doc = BSON.serialize(doc) 10 | @stream.write(bson_doc) 11 | @stream.flush 12 | end 13 | end 14 | 15 | class BSONKeyValueOutput < BSONOutput 16 | def write(pair) 17 | key, value = *pair 18 | 19 | doc = value.is_a?(Hash) ? value : { :value => value } 20 | 21 | doc['_id'] = key 22 | super(doc) 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/lib/mongo-hadoop/reducer.rb: -------------------------------------------------------------------------------- 1 | require 'mongo-hadoop/input' 2 | require 'mongo-hadoop/output' 3 | 4 | module MongoHadoop 5 | def reduce 6 | input = BSONInput.new 7 | output = BSONOutput.new 8 | 9 | grouped = input.group_by { |doc| doc['_id'] } 10 | 11 | grouped.each do |key, values| 12 | output.write yield key, values 13 | end 14 | end 15 | 16 | def kvreduce 17 | kvinput = BSONKeyValueInput.new 18 | kvoutput = BSONKeyValueOutput.new 19 | 20 | grouped = kvinput.inject(Hash.new) do |hash, pair| 21 | key, value = *pair 22 | hash[key] ||= [] 23 | hash[key] << value 24 | hash 25 | end 26 | 27 | grouped.each do |key, values| 28 | kvoutput.write yield key, values 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/mongo-hadoop.gemspec: -------------------------------------------------------------------------------- 1 | Gem::Specification.new do |s| 2 | s.name = 'mongo-hadoop' 3 | s.version = '1.0.0' 4 | s.date = '2012-05-20' 5 | s.summary = "MongoDB Hadoop streaming support" 6 | s.description = "Ruby MongoDB Hadoop streaming support" 7 | s.authors = ["Tyler Brock"] 8 | s.email = 'tyler.brock@gmail.com' 9 | s.files = [ 10 | "bin/mongo-hadoop", 11 | "lib/mongo-hadoop/input.rb", 12 | "lib/mongo-hadoop/output.rb", 13 | "lib/mongo-hadoop/mapper.rb", 14 | "lib/mongo-hadoop/reducer.rb" 15 | ] 16 | s.executables = ['mongo-hadoop'] 17 | s.homepage = 'http://github.com/mongodb/mongo-hadoop' 18 | s.add_dependency 'bson' 19 | s.add_dependency 'thor' 20 | end 21 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/templates/mapper.tt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop::map do |document| 5 | { :_id => document['_id'] } 6 | end 7 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/templates/reducer.tt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop::reduce do |key, values| 5 | { :_id => key, :count => values.size } 6 | end 7 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/templates/runner.tt: -------------------------------------------------------------------------------- 1 | hadoop jar <%= @streaming_assembly %> \ 
2 | -mapper ./mapper.rb \ 3 | -reducer ./reducer.rb \ 4 | -inputURI <%= @input_uri %> \ 5 | -outputURI <%= @output_uri %> \ 6 | -inputformat com.mongodb.hadoop.mapred.MongoInputFormat \ 7 | -outputformat com.mongodb.hadoop.mapred.MongoOutputFormat \ 8 | -------------------------------------------------------------------------------- /streaming/src/main/java/com/mongodb/hadoop/streaming/MongoOutput.java: -------------------------------------------------------------------------------- 1 | // MongoOutput.java 2 | /* 3 | * Copyright 2010 10gen Inc. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.mongodb.hadoop.streaming; 19 | 20 | import com.mongodb.DBObject; 21 | 22 | public interface MongoOutput { 23 | void appendAsKey(DBObject o); 24 | 25 | void appendAsValue(DBObject o); 26 | } 27 | -------------------------------------------------------------------------------- /streaming/src/main/java/com/mongodb/hadoop/streaming/io/MongoIdentifierResolver.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.streaming.io; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import com.mongodb.hadoop.io.MongoUpdateWritable; 5 | import org.apache.hadoop.streaming.io.IdentifierResolver; 6 | 7 | public class MongoIdentifierResolver extends IdentifierResolver { 8 | public static final String MONGODB_ID = "mongodb"; 9 | public static final String MONGO_ID = "mongo"; 10 | public static final String BSON_ID = "bson"; 11 | public static final String MONGODB_UPDATE = "mongoUpdate"; 12 | 13 | @Override 14 | public void resolve(final String identifier) { 15 | if (identifier.equalsIgnoreCase(MONGODB_ID) 16 | || identifier.equalsIgnoreCase(MONGO_ID) 17 | || identifier.equalsIgnoreCase(BSON_ID)) { 18 | setInputWriterClass(MongoInputWriter.class); 19 | setOutputReaderClass(MongoOutputReader.class); 20 | setOutputKeyClass(BSONWritable.class); 21 | setOutputValueClass(BSONWritable.class); 22 | } else if (identifier.equalsIgnoreCase(MONGODB_UPDATE)) { 23 | setInputWriterClass(MongoUpdateInputWriter.class); 24 | setOutputReaderClass(MongoUpdateOutputReader.class); 25 | setOutputKeyClass(BSONWritable.class); 26 | setOutputValueClass(MongoUpdateWritable.class); 27 | } else { 28 | super.resolve(identifier); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /streaming/src/main/java/com/mongodb/hadoop/streaming/io/MongoInputWriter.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.streaming.io; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.streaming.PipeMapRed; 5 | import org.apache.hadoop.streaming.io.InputWriter; 6 | 7 | import java.io.DataOutput; 8 | import java.io.IOException; 9 | 10 | public class MongoInputWriter extends InputWriter { 11 | 12 | private DataOutput out; 13 | 14 | @Override 15 | public void initialize(final PipeMapRed 
pipeMapRed) throws IOException { 16 | super.initialize(pipeMapRed); 17 | out = pipeMapRed.getClientOutput(); 18 | } 19 | 20 | @Override 21 | public void writeKey(final Object key) throws IOException { 22 | // We skip the key COMPLETELY as it's just a copy of _id 23 | // and readable by the BSON implementation 24 | } 25 | 26 | @Override 27 | public void writeValue(final BSONWritable value) throws IOException { 28 | value.write(out); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /streaming/src/main/java/com/mongodb/hadoop/streaming/io/MongoOutputReader.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.streaming.io; 2 | 3 | import com.mongodb.BasicDBObject; 4 | import com.mongodb.hadoop.io.BSONWritable; 5 | import org.apache.commons.logging.Log; 6 | import org.apache.commons.logging.LogFactory; 7 | import org.apache.hadoop.streaming.PipeMapRed; 8 | import org.apache.hadoop.streaming.io.OutputReader; 9 | 10 | import java.io.DataInput; 11 | import java.io.IOException; 12 | 13 | public class MongoOutputReader extends OutputReader { 14 | 15 | private DataInput in; 16 | private static final Log LOG = LogFactory.getLog(MongoOutputReader.class); 17 | private BSONWritable currentKey; 18 | private BSONWritable currentValue; 19 | 20 | @Override 21 | public void initialize(final PipeMapRed pipeMapRed) throws IOException { 22 | super.initialize(pipeMapRed); 23 | in = pipeMapRed.getClientInput(); 24 | this.currentKey = new BSONWritable(); 25 | this.currentValue = new BSONWritable(); 26 | } 27 | 28 | @Override 29 | public boolean readKeyValue() throws IOException { 30 | // Actually, just read the value as the key is embedded. 31 | try { 32 | currentValue.readFields(in); 33 | Object id = currentValue.getDoc().get("_id"); 34 | currentKey.setDoc(new BasicDBObject("_id", id)); 35 | // If successful we'll have an _id field 36 | return id != null; 37 | } catch (IndexOutOfBoundsException e) { 38 | // No more data 39 | LOG.info("No more data; no key/value pair read."); 40 | return false; 41 | } 42 | } 43 | 44 | @Override 45 | public BSONWritable getCurrentKey() throws IOException { 46 | return currentKey; 47 | } 48 | 49 | @Override 50 | public BSONWritable getCurrentValue() throws IOException { 51 | return currentValue; 52 | } 53 | 54 | @Override 55 | public String getLastOutput() { 56 | return currentValue.toString(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /streaming/src/main/java/com/mongodb/hadoop/streaming/io/MongoUpdateInputWriter.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.streaming.io; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import com.mongodb.hadoop.io.MongoUpdateWritable; 5 | import org.apache.hadoop.io.Writable; 6 | import org.apache.hadoop.streaming.PipeMapRed; 7 | import org.apache.hadoop.streaming.io.InputWriter; 8 | 9 | import java.io.DataOutput; 10 | import java.io.IOException; 11 | 12 | /** 13 | * InputWriter capable of handling both BSONWritable and MongoUpdateWritable 14 | * as value types. 
15 | */ 16 | public class MongoUpdateInputWriter extends InputWriter { 17 | 18 | private DataOutput output; 19 | private final BSONWritable bsonWritable = new BSONWritable(); 20 | 21 | @Override 22 | public void initialize(final PipeMapRed pipeMapRed) throws IOException { 23 | super.initialize(pipeMapRed); 24 | output = pipeMapRed.getClientOutput(); 25 | } 26 | 27 | @Override 28 | public void writeKey(final Object key) throws IOException { 29 | // Nothing to do. 30 | } 31 | 32 | @Override 33 | public void writeValue(final Writable value) throws IOException { 34 | if (value instanceof MongoUpdateWritable) { 35 | // If we're writing to the input of a streaming script, just send 36 | // back the "query" portion of the MongoUpdateWritable, so that 37 | // mapper and reducer scripts can operate on a single document. 38 | bsonWritable.setDoc(((MongoUpdateWritable) value).getQuery()); 39 | bsonWritable.write(output); 40 | } else if (value instanceof BSONWritable) { 41 | value.write(output); 42 | } else { 43 | throw new IOException("Unexpected Writable type :" + value); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | alias g="./gradlew --daemon" 4 | 5 | OPTS=test 6 | 7 | while [ "$1" ] 8 | do 9 | case $1 in 10 | "examples") 11 | OPTS="historicalYield sensorData enronEmails" 12 | ;; 13 | "all") 14 | HV="all" 15 | ;; 16 | esac 17 | shift 18 | done 19 | 20 | echo Running \"$OPTS\" 21 | 22 | function browser() { 23 | while [ "$1" ] 24 | do 25 | [ -f $1 ] && open $1 26 | shift 27 | done 28 | } 29 | 30 | function run() { 31 | g clean jar testJar $OPTS --stacktrace 2>&1 | tee -a build/test.out 32 | 33 | 34 | for i in "*/build/reports/tests/index.html" 35 | do 36 | if [ "`grep -i failed $i 2> /dev/null`" ] 37 | then 38 | echo "********** Found failing tests. Exiting." 39 | browser $i 40 | FAILED=true 41 | fi 42 | 43 | if [ $FAILED ] 44 | then 45 | exit 46 | fi 47 | done 48 | } 49 | 50 | run 51 | --------------------------------------------------------------------------------