├── .evergreen
│   ├── compile.sh
│   ├── config.yml
│   └── run-tests.sh
├── .gitignore
├── CONTRIBUTORS.md
├── History.md
├── README.md
├── build.gradle
├── clusterConfigs
│   ├── core-site.xml
│   ├── hdfs-site.xml
│   ├── hive-site.xml
│   └── mapred-site.xml
├── config
│   ├── checkstyle-lite.xml
│   ├── checkstyle.xml
│   └── findbugs-exclude.xml
├── core
│   └── src
│       ├── main
│       │   └── java
│       │       └── com
│       │           └── mongodb
│       │               └── hadoop
│       │                   ├── BSONFileInputFormat.java
│       │                   ├── BSONFileOutputFormat.java
│       │                   ├── BSONPathFilter.java
│       │                   ├── GridFSInputFormat.java
│       │                   ├── MongoConfig.java
│       │                   ├── MongoInputFormat.java
│       │                   ├── MongoOutput.java
│       │                   ├── MongoOutputFormat.java
│       │                   ├── input
│       │                   │   ├── BSONFileRecordReader.java
│       │                   │   ├── BSONFileSplit.java
│       │                   │   ├── GridFSSplit.java
│       │                   │   ├── MongoInputSplit.java
│       │                   │   └── MongoRecordReader.java
│       │                   ├── io
│       │                   │   ├── BSONWritable.java
│       │                   │   ├── BSONWritableComparator.java
│       │                   │   ├── DataOutputOutputStreamAdapter.java
│       │                   │   ├── MongoUpdateWritable.java
│       │                   │   └── MongoWritableTypes.java
│       │                   ├── mapred
│       │                   │   ├── BSONFileInputFormat.java
│       │                   │   ├── BSONFileOutputFormat.java
│       │                   │   ├── MongoInputFormat.java
│       │                   │   ├── MongoOutputFormat.java
│       │                   │   ├── input
│       │                   │   │   ├── BSONFileRecordReader.java
│       │                   │   │   ├── BSONFileSplit.java
│       │                   │   │   └── MongoRecordReader.java
│       │                   │   └── output
│       │                   │       ├── BSONFileRecordWriter.java
│       │                   │       ├── MongoOutputCommitter.java
│       │                   │       └── MongoRecordWriter.java
│       │                   ├── output
│       │                   │   ├── BSONFileRecordWriter.java
│       │                   │   ├── MongoOutputCommitter.java
│       │                   │   └── MongoRecordWriter.java
│       │                   ├── splitter
│       │                   │   ├── BSONSplitter.java
│       │                   │   ├── MongoCollectionSplitter.java
│       │                   │   ├── MongoPaginatingSplitter.java
│       │                   │   ├── MongoSplitter.java
│       │                   │   ├── MongoSplitterFactory.java
│       │                   │   ├── MultiCollectionSplitBuilder.java
│       │                   │   ├── MultiMongoCollectionSplitter.java
│       │                   │   ├── SampleSplitter.java
│       │                   │   ├── ShardChunkMongoSplitter.java
│       │                   │   ├── ShardMongoSplitter.java
│       │                   │   ├── SingleMongoSplitter.java
│       │                   │   ├── SplitFailedException.java
│       │                   │   └── StandaloneMongoSplitter.java
│       │                   └── util
│       │                       ├── BSONComparator.java
│       │                       ├── BSONLoader.java
│       │                       ├── CompatUtils.java
│       │                       ├── MapredMongoConfigUtil.java
│       │                       ├── MongoClientURIBuilder.java
│       │                       ├── MongoConfigUtil.java
│       │                       ├── MongoPathRetriever.java
│       │                       ├── MongoTool.java
│       │                       └── SplitFriendlyDBCallback.java
│       └── test
│           ├── java
│           │   └── com
│           │       └── mongodb
│           │           └── hadoop
│           │               ├── BSONFileInputFormatTest.java
│           │               ├── GridFSInputFormatTest.java
│           │               ├── HadoopVersionFilter.java
│           │               ├── MongoConfigUnitTests.java
│           │               ├── MongoOutputCommitterTest.java
│           │               ├── bookstore
│           │               │   ├── BookstoreConfig.java
│           │               │   ├── BookstoreTest.java
│           │               │   ├── TagsMapper.java
│           │               │   └── TagsReducer.java
│           │               ├── io
│           │               │   ├── BSONWritableTest.java
│           │               │   ├── MongoInputSplitTest.java
│           │               │   └── MongoUpdateWritableTest.java
│           │               ├── mapred
│           │               │   └── BSONFileInputFormatTest.java
│           │               ├── splitter
│           │               │   ├── BSONFileRecordReaderTest.java
│           │               │   ├── BSONSplitterTest.java
│           │               │   ├── MongoPaginatingSplitterTest.java
│           │               │   ├── MongoRecordReaderTest.java
│           │               │   ├── MongoSplitterFactoryTest.java
│           │               │   ├── MongoSplitterTestUtils.java
│           │               │   ├── SampleSplitterTest.java
│           │               │   ├── ShardChunkMongoSplitterTest.java
│           │               │   └── StandaloneMongoSplitterTest.java
│           │               ├── testutils
│           │               │   ├── BaseHadoopTest.java
│           │               │   └── MapReduceJob.java
│           │               └── util
│           │                   └── MongoConfigUtilTest.java
│           └── resources
│               └── bookstore-dump
│                   ├── inventory.bson
│                   ├── orders.bson
│                   ├── publishers.bson
│                   └── system.indexes.bson
├── examples
│   ├── elastic-mapreduce
│   │   ├── emr-bootstrap.sh
│   │   ├── run_emr_job.sh
│   │   └── update_s3.sh
│   ├── enron
│   │   ├── hive
│   │   │   └── hive_enron.q
│   │   ├── pig
│   │   │   └── pig_enron.pig
│   │   ├── run_job.sh
│   │   ├── spark
│   │   │   └── src
│   │   │       └── main
│   │   │           └── java
│   │   │               └── com
│   │   │                   └── mongodb
│   │   │                       └── spark
│   │   │                           └── examples
│   │   │                               └── enron
│   │   │                                   ├── DataframeExample.java
│   │   │                                   ├── Enron.java
│   │   │                                   └── Message.java
│   │   └── src
│   │       └── main
│   │           └── java
│   │               └── com
│   │                   └── mongodb
│   │                       └── hadoop
│   │                           └── examples
│   │                               └── enron
│   │                                   ├── EnronMail.java
│   │                                   ├── EnronMailMapper.java
│   │                                   ├── EnronMailReducer.java
│   │                                   └── MailPair.java
│   ├── sensors
│   │   ├── run_job.sh
│   │   ├── src
│   │   │   └── main
│   │   │       └── java
│   │   │           └── com
│   │   │               └── mongodb
│   │   │                   └── hadoop
│   │   │                       └── examples
│   │   │                           └── sensors
│   │   │                               ├── DeviceMapper.java
│   │   │                               ├── DeviceReducer.java
│   │   │                               ├── Devices.java
│   │   │                               ├── LogCombiner.java
│   │   │                               ├── LogMapper.java
│   │   │                               ├── LogReducer.java
│   │   │                               ├── Logs.java
│   │   │                               └── SensorDataGenerator.java
│   │   └── testdata_generator.js
│   ├── shakespeare
│   │   └── src
│   │       └── main
│   │           └── java
│   │               └── com
│   │                   └── mongodb
│   │                       └── hadoop
│   │                           └── examples
│   │                               └── shakespeare
│   │                                   ├── PrepareShakespeare.java
│   │                                   └── Shakespeare.java
│   └── treasury_yield
│       ├── pig
│       │   └── pig_mongo_test.pig
│       ├── run_job.sh
│       └── src
│           ├── main
│           │   ├── java
│           │   │   └── com
│           │   │       └── mongodb
│           │   │           └── hadoop
│           │   │               └── examples
│           │   │                   └── treasury
│           │   │                       ├── TreasuryYieldMapper.java
│           │   │                       ├── TreasuryYieldMulti.java
│           │   │                       ├── TreasuryYieldReducer.java
│           │   │                       ├── TreasuryYieldUpdateReducer.java
│           │   │                       └── TreasuryYieldXMLConfig.java
│           │   └── resources
│           │       ├── commons-logging.properties
│           │       ├── mongo-defaults.xml
│           │       ├── parse_yield_historical.py
│           │       ├── yield_historical_Jan90_Sep10.xml
│           │       └── yield_historical_in.json
│           └── test
│               ├── java
│               │   └── com
│               │       └── mongodb
│               │           └── hadoop
│               │               ├── BaseShardedTest.java
│               │               ├── JarFinder.java
│               │               ├── StreamingJob.java
│               │               ├── TestSharded.java
│               │               ├── TestStandalone.java
│               │               ├── TestStreaming.java
│               │               └── TreasuryTest.java
│               └── resources
│                   ├── commons-logging.properties
│                   ├── log4j.properties
│                   └── yarn-site.xml
├── flume
│   └── src
│       └── main
│           └── java
│               └── com
│                   └── mongodb
│                       └── flume
│                           ├── BucketedMongoDBSink.java
│                           └── MongoDBSink.java
├── gradle
│   ├── functions.gradle
│   ├── hadoop.gradle
│   ├── maven-deployment.gradle
│   └── wrapper
│       ├── gradle-wrapper.jar
│       └── gradle-wrapper.properties
├── gradlew
├── gradlew.bat
├── hive
│   └── src
│       ├── main
│       │   └── java
│       │       └── com
│       │           └── mongodb
│       │               └── hadoop
│       │                   └── hive
│       │                       ├── BSONSerDe.java
│       │                       ├── MongoStorageHandler.java
│       │                       ├── input
│       │                       │   └── HiveMongoInputFormat.java
│       │                       └── output
│       │                           ├── HiveBSONFileOutputFormat.java
│       │                           └── HiveMongoOutputFormat.java
│       └── test
│           ├── java
│           │   └── com
│           │       └── mongodb
│           │           └── hadoop
│           │               └── hive
│           │                   ├── BSONSerDeTest.java
│           │                   ├── HiveMappingTest.java
│           │                   ├── HiveQueryTest.java
│           │                   ├── HiveTest.java
│           │                   ├── MongoStorageHandlerTest.java
│           │                   ├── Results.java
│           │                   ├── TablePropertiesTest.java
│           │                   ├── TestBsonToHive.java
│           │                   ├── TestHDFSToMongoDB.java
│           │                   ├── TestHDFSToMongoDBWithOptions.java
│           │                   └── input
│           │                       └── HiveMongoInputFormatTest.java
│           └── resources
│               ├── core-site.xml
│               ├── hivetable.properties
│               ├── log4j.properties
│               ├── test_data.txt
│               ├── users.bson
│               └── yarn-site.xml
├── mongo-defaults.xml
├── pig
│   └── src
│       ├── main
│       │   └── java
│       │       └── com
│       │           └── mongodb
│       │               └── hadoop
│       │                   └── pig
│       │                       ├── BSONLoader.java
│       │                       ├── BSONStorage.java
│       │                       ├── JSONPigReplace.java
│       │                       ├── MongoInsertStorage.java
│       │                       ├── MongoLoader.java
│       │                       ├── MongoStorage.java
│       │                       ├── MongoStorageOptions.java
│       │                       ├── MongoUpdateStorage.java
│       │                       └── udf
│       │                           ├── ByteArrayTypeEvalFunc.java
│       │                           ├── GenMaxKey.java
│       │                           ├── GenMinKey.java
│       │                           ├── ObjectIdToSeconds.java
│       │                           ├── ToBinary.java
│       │                           ├── ToDBRef.java
│       │                           ├── ToObjectId.java
│       │                           └── types
│       │                               ├── PigBoxedBSONValue.java
│       │                               ├── PigBoxedBinary.java
│       │                               ├── PigBoxedDBRef.java
│       │                               ├── PigBoxedMaxKey.java
│       │                               ├── PigBoxedMinKey.java
│       │                               └── PigBoxedObjectId.java
│       └── test
│           ├── java
│           │   ├── com
│           │   │   └── mongodb
│           │   │       └── hadoop
│           │   │           └── pig
│           │   │               ├── BSONStorageTest.java
│           │   │               ├── JSONPigReplaceTest.java
│           │   │               ├── MongoLoaderTest.java
│           │   │               ├── MongoStorageOptionsTest.java
│           │   │               ├── MongoStorageTest.java
│           │   │               ├── PigTest.java
│           │   │               └── UDFTest.java
│           │   └── helpers
│           │       └── TOBAG.java
│           └── resources
│               ├── dump
│               │   └── test
│               │       ├── persons_info.bson
│               │       └── persons_info.metadata.json
│               └── pig
│                   ├── bson_schemaless.pig
│                   ├── bson_test.pig
│                   ├── datestest.pig
│                   ├── ensure_index.pig
│                   ├── ensure_index_2.pig
│                   ├── genminmaxkeys.pig
│                   ├── oidtoseconds.pig
│                   ├── pig_uuid.pig
│                   ├── projection.pig
│                   ├── replace_mus.pig
│                   ├── schemaless.pig
│                   ├── tobinary.pig
│                   ├── todbref.pig
│                   ├── toobjectid.pig
│                   ├── udfschemaless.pig
│                   ├── update_age_alabis_mus.pig
│                   └── update_simple_mus.pig
├── settings.gradle
├── spark
│   └── src
│       └── main
│           ├── java
│           │   └── com
│           │       └── mongodb
│           │           └── spark
│           │               ├── PySparkBSONFileInputFormat.java
│           │               ├── PySparkBSONFileOutputFormat.java
│           │               ├── PySparkMongoInputFormat.java
│           │               ├── PySparkMongoOutputFormat.java
│           │               └── pickle
│           │                   ├── BSONPickler.java
│           │                   ├── BSONValueBox.java
│           │                   ├── BinaryConstructor.java
│           │                   ├── CalendarTransformer.java
│           │                   ├── CodeConstructor.java
│           │                   ├── DBRefConstructor.java
│           │                   ├── Int64Constructor.java
│           │                   ├── MaxKeyConstructor.java
│           │                   ├── MinKeyConstructor.java
│           │                   ├── ObjectIdConstructor.java
│           │                   ├── RegexConstructor.java
│           │                   ├── RegisterConstructors.java
│           │                   ├── RegisterPickles.java
│           │                   └── TimestampConstructor.java
│           ├── python
│           │   ├── README.rst
│           │   ├── pymongo_spark.py
│           │   ├── setup.py
│           │   └── test
│           │       ├── __init__.py
│           │       └── test_pymongo_spark.py
│           └── scala
│               └── com
│                   └── mongodb
│                       └── spark
│                           └── pickle
│                               └── NoopConverter.scala
├── streaming
│   ├── examples
│   │   ├── enron
│   │   │   ├── enron_map.js
│   │   │   ├── enron_map.py
│   │   │   ├── enron_map.rb
│   │   │   ├── enron_reduce.js
│   │   │   ├── enron_reduce.py
│   │   │   ├── enron_reduce.rb
│   │   │   ├── run_enron.sh
│   │   │   ├── run_enron_js.sh
│   │   │   └── run_enron_rb.sh
│   │   ├── treasury
│   │   │   ├── mapper.py
│   │   │   ├── mapper.rb
│   │   │   ├── mapper_kv.py
│   │   │   ├── mapper_kv.rb
│   │   │   ├── reducer.py
│   │   │   ├── reducer.rb
│   │   │   ├── reducer_kv.py
│   │   │   ├── reducer_kv.rb
│   │   │   ├── run_treas_kv_py.sh
│   │   │   ├── run_treas_kv_rb.sh
│   │   │   ├── run_treas_py.sh
│   │   │   └── run_treas_rb.sh
│   │   └── twitter
│   │       ├── README.md
│   │       ├── run_twit_py.sh
│   │       ├── run_twit_rb.sh
│   │       ├── twit_hashtag_map.py
│   │       ├── twit_hashtag_reduce.py
│   │       ├── twit_map.py
│   │       ├── twit_map.rb
│   │       ├── twit_reduce.py
│   │       └── twit_reduce.rb
│   ├── language_support
│   │   ├── js
│   │   │   ├── node_mongo_hadoop.js
│   │   │   └── package.json
│   │   ├── python
│   │   │   ├── README.rst
│   │   │   ├── distribute_setup.py
│   │   │   ├── pymongo_hadoop
│   │   │   │   ├── __init__.py
│   │   │   │   ├── input.py
│   │   │   │   ├── mapper.py
│   │   │   │   ├── output.py
│   │   │   │   └── reducer.py
│   │   │   ├── setup.py
│   │   │   └── test_install.py
│   │   └── ruby
│   │       ├── README.md
│   │       ├── bin
│   │       │   └── mongo-hadoop
│   │       ├── lib
│   │       │   ├── mongo-hadoop.rb
│   │       │   └── mongo-hadoop
│   │       │       ├── input.rb
│   │       │       ├── mapper.rb
│   │       │       ├── output.rb
│   │       │       └── reducer.rb
│   │       ├── mongo-hadoop.gemspec
│   │       └── templates
│   │           ├── mapper.tt
│   │           ├── reducer.tt
│   │           └── runner.tt
│   └── src
│       ├── main
│       │   └── java
│       │       └── com
│       │           └── mongodb
│       │               └── hadoop
│       │                   └── streaming
│       │                       ├── MongoOutput.java
│       │                       └── io
│       │                           ├── MongoIdentifierResolver.java
│       │                           ├── MongoInputWriter.java
│       │                           ├── MongoOutputReader.java
│       │                           ├── MongoUpdateInputWriter.java
│       │                           └── MongoUpdateOutputReader.java
│       └── test
│           └── java
│               └── com
│                   └── mongodb
│                       └── hadoop
│                           └── streaming
│                               └── io
│                                   └── MongoUpdateOutputReaderTest.java
└── test.sh
/.evergreen/compile.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -o xtrace # Write all commands first to stderr
4 | set -o errexit # Exit the script with error if any of the commands fail
5 |
6 | ############################################
7 | # Main Program #
8 | ############################################
9 |
10 | # We always compile with the latest version of java
11 | export JAVA_HOME="/opt/java/jdk8"
12 | ./gradlew -version
13 | ./gradlew -PxmlReports.enabled=true --info -x test clean check jar testClasses javadoc
14 |
--------------------------------------------------------------------------------
/.evergreen/run-tests.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | set -o xtrace # Write all commands first to stderr
4 | set -o errexit # Exit the script with error if any of the commands fail
5 |
6 | # Supported/used environment variables:
7 | # MONGODB_BINARIES The location of the MongoDB binaries, e.g. /usr/local/bin
8 | # HADOOP_VERSION Sets the version of Hadoop to be used.
9 | # AUTH Set to enable authentication. Values are: "auth" / "noauth" (default)
10 | # JDK Set the version of java to be used. Java versions can be set from the java toolchain /opt/java
11 | # "jdk5", "jdk6", "jdk7", "jdk8"
12 |
13 | MONGODB_BINARIES=${MONGODB_BINARIES:-}
14 | AUTH=${AUTH:-noauth}
15 | JDK=${JDK:-jdk}
16 | PROJECT_DIRECTORY=${PROJECT_DIRECTORY:-}
17 |
18 | export HADOOP_VERSION=${HADOOP_VERSION:-2.7.2}
19 | export HADOOP_PREFIX=$PROJECT_DIRECTORY/hadoop-binaries/hadoop-$HADOOP_VERSION
20 | export HADOOP_HOME=$HADOOP_PREFIX
21 | export HADOOP_USER_CLASSPATH_FIRST=true
22 | export HIVE_HOME=$PROJECT_DIRECTORY/hadoop-binaries/apache-hive-1.2.1-bin
23 |
24 | export JAVA_HOME="/opt/java/${JDK}"
25 |
26 | ./gradlew -version
27 | ./gradlew -Dmongodb_bin_dir=${MONGODB_BINARIES} -Dmongodb_option=${AUTH} -DHADOOP_VERSION=${HADOOP_VERSION} --stacktrace jar testsJar test cleanHadoop
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *#*
2 | *.crc
3 | *.gem
4 | *.iml
5 | *.ipr
6 | *.iws
7 | *.log
8 | *.out
9 | *.pyc
10 | *.splits
11 | *.swp
12 | *~
13 | .DS*
14 | .classpath
15 | .gradle
16 | .idea
17 | .project
18 | TempStatsStore/
19 | WDI_GDF_Data.csv
20 | bin/hadoop-all.sh
21 | build
22 | examples/data
23 | logs
24 | out
25 | metastore_db/
26 | streaming/language_support/python/dist/
27 | streaming/language_support/python/pymongo_hadoop.egg-info/
28 | tags
29 | target
30 | test-*.out
31 | hadoop-binaries
32 |
--------------------------------------------------------------------------------
/CONTRIBUTORS.md:
--------------------------------------------------------------------------------
1 | * Mike O'Brien (mikeo@10gen.com)
2 | * Brendan McAdams brendan@10gen.com
3 | * Eliot Horowitz erh@10gen.com
4 | * Ryan Nitz ryan@10gen.com
5 | * Russell Jurney (@rjurney) (Lots of significant Pig improvements)
6 | * Sarthak Dudhara sarthak.83@gmail.com (BSONWritable comparable interface)
7 | * Priya Manda priyakanth024@gmail.com (Test Harness Code)
8 | * Rushin Shah rushin10@gmail.com (Test Harness Code)
9 | * Joseph Shraibman jks@iname.com (Sharded Input Splits)
10 | * Sumin Xia xiasumin1984@gmail.com (Sharded Input Splits)
11 | * Jeremy Karn
12 | * bpfoster
13 | * Ross Lawley
14 | * Carsten Hufe
15 | * Asya Kamsky
16 | * Thomas Millar
17 | * Justin Lee
18 | * Luke Lovett
19 | * Mariano Semelman
20 | * Jordan Gwyn
21 | * Powerrr
22 |
--------------------------------------------------------------------------------
/clusterConfigs/core-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <configuration>
3 |   <property>
4 |     <name>fs.default.name</name>
5 |     <value>hdfs://localhost:8020</value>
6 |   </property>
7 |   <property>
8 |     <name>hadoop.tmp.dir</name>
9 |     <value>@HADOOP_BINARIES@/hadoop-tmpdir</value>
10 |   </property>
11 | </configuration>
--------------------------------------------------------------------------------
/clusterConfigs/hdfs-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <configuration>
3 |   <property>
4 |     <name>mapred.job.tracker</name>
5 |     <value>localhost:8021</value>
6 |   </property>
7 |   <property>
8 |     <name>hadoop.tmp.dir</name>
9 |     <value>@HADOOP_BINARIES@/hadoop-tmpdir</value>
10 |   </property>
11 | </configuration>
--------------------------------------------------------------------------------
/clusterConfigs/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <configuration>
3 |   <property>
4 |     <name>javax.jdo.option.ConnectionURL</name>
5 |     <value>jdbc:derby:;databaseName=@HIVE_HOME@/metastore_db;create=true</value>
6 |   </property>
7 |   <property>
8 |     <name>hive.metastore.warehouse.dir</name>
9 |     <value>hdfs://localhost:8020/user/hive/warehouse</value>
10 |   </property>
11 |   <property>
12 |     <name>dfs.datanode.address</name>
13 |     <value>50010</value>
14 |   </property>
15 |   <property>
16 |     <name>hive.aux.jars.path</name>
17 |     <value>@HIVE_HOME@/lib/mongo-hadoop-hive.jar</value>
18 |   </property>
19 | </configuration>
--------------------------------------------------------------------------------
/clusterConfigs/mapred-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <configuration>
3 |   <property>
4 |     <name>mapred.job.tracker</name>
5 |     <value>localhost:8021</value>
6 |   </property>
7 | </configuration>
--------------------------------------------------------------------------------
/config/findbugs-exclude.xml:
--------------------------------------------------------------------------------
(XML exclusion rules stripped during extraction; content not recoverable)
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/BSONFileOutputFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.mongodb.hadoop;
18 |
19 | import com.mongodb.hadoop.output.BSONFileRecordWriter;
20 | import com.mongodb.hadoop.splitter.BSONSplitter;
21 | import com.mongodb.hadoop.util.MongoConfigUtil;
22 | import org.apache.commons.logging.Log;
23 | import org.apache.commons.logging.LogFactory;
24 | import org.apache.hadoop.fs.FSDataOutputStream;
25 | import org.apache.hadoop.fs.FileSystem;
26 | import org.apache.hadoop.fs.Path;
27 | import org.apache.hadoop.mapreduce.RecordWriter;
28 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
29 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
30 |
31 | import java.io.IOException;
32 |
33 | public class BSONFileOutputFormat<K, V> extends FileOutputFormat<K, V> {
34 |
35 | @Override
36 | public RecordWriter<K, V> getRecordWriter(final TaskAttemptContext context) throws IOException {
37 | // Open data output stream
38 |
39 | Path outPath = getDefaultWorkFile(context, ".bson");
40 | LOG.info("output going into " + outPath);
41 |
42 | FileSystem fs = outPath.getFileSystem(context.getConfiguration());
43 | FSDataOutputStream outFile = fs.create(outPath);
44 |
45 | FSDataOutputStream splitFile = null;
46 | if (MongoConfigUtil.getBSONOutputBuildSplits(context.getConfiguration())) {
47 | Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits");
48 | splitFile = fs.create(splitPath);
49 | }
50 |
51 | long splitSize = BSONSplitter.getSplitSize(context.getConfiguration(), null);
52 | return new BSONFileRecordWriter(outFile, splitFile, splitSize);
53 | }
54 |
55 | private static final Log LOG = LogFactory.getLog(BSONFileOutputFormat.class);
56 | }
57 |
58 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/BSONPathFilter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 | package com.mongodb.hadoop;
17 |
18 | import org.apache.commons.logging.Log;
19 | import org.apache.commons.logging.LogFactory;
20 | import org.apache.hadoop.fs.Path;
21 | import org.apache.hadoop.fs.PathFilter;
22 |
23 | public class BSONPathFilter implements PathFilter {
24 |
25 | private static final Log LOG = LogFactory.getLog(BSONPathFilter.class);
26 |
27 | public BSONPathFilter() {
28 | LOG.info("path filter constructed.");
29 | }
30 |
31 | public boolean accept(final Path path) {
32 | String pathName = path.getName().toLowerCase();
33 | boolean acceptable = pathName.endsWith(".bson") && !pathName.startsWith(".");
34 | LOG.info(path.toString() + " returning " + acceptable);
35 | return acceptable;
36 | }
37 | }
38 |
39 |
--------------------------------------------------------------------------------
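Usage sketch (not part of the repository): BSONPathFilter keeps visible *.bson paths and rejects hidden files, so split manifests such as ".messages.bson.splits" never become job input. The "/data/dump" directory below is a made-up example.

import com.mongodb.hadoop.BSONPathFilter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BSONPathFilterSketch {
    public static void main(final String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // listStatus applies the filter: "messages.bson" is kept, while
        // ".messages.bson.splits" and non-.bson files are skipped.
        FileStatus[] matches =
            fs.listStatus(new Path("/data/dump"), new BSONPathFilter());
        for (FileStatus status : matches) {
            System.out.println(status.getPath());
        }
    }
}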
/core/src/main/java/com/mongodb/hadoop/MongoOutput.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.mongodb.hadoop;
18 |
19 | // Mongo
20 |
21 | import com.mongodb.DBObject;
22 |
23 | public interface MongoOutput {
24 | void appendAsKey(DBObject o);
25 |
26 | void appendAsValue(DBObject o);
27 | }
28 |
29 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/MongoOutputFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.mongodb.hadoop;
18 |
19 | import com.mongodb.hadoop.output.MongoOutputCommitter;
20 | import com.mongodb.hadoop.output.MongoRecordWriter;
21 | import com.mongodb.hadoop.util.MongoConfigUtil;
22 | import org.apache.hadoop.mapreduce.JobContext;
23 | import org.apache.hadoop.mapreduce.OutputCommitter;
24 | import org.apache.hadoop.mapreduce.OutputFormat;
25 | import org.apache.hadoop.mapreduce.RecordWriter;
26 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
27 |
28 | import java.io.IOException;
29 |
30 | public class MongoOutputFormat<K, V> extends OutputFormat<K, V> {
31 | public void checkOutputSpecs(final JobContext context) throws IOException {
32 | if (MongoConfigUtil.getOutputURIs(context.getConfiguration()).isEmpty()) {
33 | throw new IOException("No output URI is specified. You must set mongo.output.uri.");
34 | }
35 | }
36 |
37 | public OutputCommitter getOutputCommitter(final TaskAttemptContext context) {
38 | return new MongoOutputCommitter();
39 | }
40 |
41 | /**
42 | * Get the record writer that points to the output collection.
43 | */
44 | public RecordWriter<K, V> getRecordWriter(final TaskAttemptContext context) {
45 | return new MongoRecordWriter(
46 | MongoConfigUtil.getOutputCollection(context.getConfiguration()),
47 | context);
48 | }
49 |
50 | public MongoOutputFormat() {}
51 |
52 | /**
53 | * @param updateKeys ignored
54 | * @param multiUpdate ignored
55 | * @deprecated this constructor is no longer useful.
56 | */
57 | @Deprecated
58 | public MongoOutputFormat(final String[] updateKeys, final boolean multiUpdate) {
59 | this();
60 | }
61 | }
--------------------------------------------------------------------------------
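Wiring sketch (not part of the repository): checkOutputSpecs above throws unless mongo.output.uri is set, so any job writing through MongoOutputFormat must carry that property. The URI, job name, and key/value classes below are hypothetical.

import com.mongodb.hadoop.MongoOutputFormat;
import com.mongodb.hadoop.io.BSONWritable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class MongoOutputJobSketch {
    public static void main(final String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Without this property, checkOutputSpecs() throws IOException.
        conf.set("mongo.output.uri", "mongodb://localhost:27017/demo.results");
        Job job = Job.getInstance(conf, "mongo-output-demo");
        job.setOutputFormatClass(MongoOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BSONWritable.class);
        // Mapper/reducer classes omitted; this only shows the output wiring.
    }
}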
/core/src/main/java/com/mongodb/hadoop/input/BSONFileSplit.java:
--------------------------------------------------------------------------------
1 | package com.mongodb.hadoop.input;
2 |
3 | import org.apache.hadoop.fs.Path;
4 | import org.apache.hadoop.io.Text;
5 | import org.apache.hadoop.mapreduce.lib.input.FileSplit;
6 |
7 | import java.io.DataInput;
8 | import java.io.DataOutput;
9 | import java.io.IOException;
10 |
11 | public class BSONFileSplit extends FileSplit {
12 |
13 | // CHECKSTYLE:OFF
14 | protected String keyField = "_id";
15 | // CHECKSTYLE:ON
16 |
17 | public BSONFileSplit(final Path file, final long start, final long length,
18 | final String[] hosts) {
19 | super(file, start, length, hosts);
20 | }
21 |
22 | public BSONFileSplit() { this(null, 0, 0, null); }
23 |
24 | public String getKeyField() {
25 | return keyField;
26 | }
27 |
28 | public void setKeyField(final String keyField) {
29 | this.keyField = keyField;
30 | }
31 |
32 | @Override
33 | public void write(final DataOutput out) throws IOException {
34 | super.write(out);
35 | Text.writeString(out, getKeyField());
36 | }
37 |
38 | @Override
39 | public void readFields(final DataInput in) throws IOException {
40 | super.readFields(in);
41 | setKeyField(Text.readString(in));
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
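Round-trip sketch (not part of the repository): because write/readFields append the key field after the inherited FileSplit fields, the custom keyField survives Writable serialization. The path and field name below are invented.

import com.mongodb.hadoop.input.BSONFileSplit;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class BSONFileSplitRoundTrip {
    public static void main(final String[] args) throws Exception {
        BSONFileSplit split = new BSONFileSplit(
            new Path("/data/messages.bson"), 0L, 1024L, new String[0]);
        split.setKeyField("messageId");

        // Serialize: FileSplit fields first, then the key field string.
        DataOutputBuffer out = new DataOutputBuffer();
        split.write(out);

        // Deserialize into a fresh split and check the custom field.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        BSONFileSplit copy = new BSONFileSplit();
        copy.readFields(in);
        System.out.println(copy.getKeyField()); // prints "messageId"
    }
}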
/core/src/main/java/com/mongodb/hadoop/io/BSONWritableComparator.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.mongodb.hadoop.io;
18 |
19 | import com.mongodb.hadoop.util.BSONComparator;
20 | import org.apache.commons.logging.Log;
21 | import org.apache.commons.logging.LogFactory;
22 | import org.apache.hadoop.io.WritableComparable;
23 | import org.apache.hadoop.io.WritableComparator;
24 |
25 | public class BSONWritableComparator extends WritableComparator {
26 |
27 | private static final Log LOG = LogFactory.getLog(BSONWritableComparator.class);
28 |
29 | public BSONWritableComparator() {
30 | super(BSONWritable.class, true);
31 | }
32 |
33 | protected BSONWritableComparator(final Class<? extends WritableComparable> keyClass) {
34 | super(keyClass, true);
35 | }
36 |
37 | protected BSONWritableComparator(final Class<? extends WritableComparable> keyClass, final boolean createInstances) {
38 | super(keyClass, createInstances);
39 | }
40 |
41 | public int compare(final WritableComparable a, final WritableComparable b) {
42 | if (a instanceof BSONWritable && b instanceof BSONWritable) {
43 | return BSONComparator.getInstance().compare(((BSONWritable) a).getDoc(), ((BSONWritable) b).getDoc());
44 | } else {
45 | //return super.compare( a, b );
46 | return -1;
47 | }
48 | }
49 |
50 | public int compare(final byte[] b1, final int s1, final int l1, final byte[] b2, final int s2, final int l2) {
51 | //return BSONComparator.getInstance().compare(b1, s1, l1, b2, s2, l2);
52 | return super.compare(b1, s1, l1, b2, s2, l2);
53 | }
54 |
55 | public int compare(final Object a, final Object b) {
56 | return BSONComparator.getInstance().compare(((BSONWritable) a).getDoc(), ((BSONWritable) b).getDoc());
57 | //return super.compare( a, b );
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
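Wiring sketch (not part of the repository): the comparator above orders deserialized documents via BSONComparator, while the byte-level overload falls back to WritableComparator. A job that sorts BSONWritable map-output keys would typically register it as below; the job setup around it is hypothetical.

import com.mongodb.hadoop.io.BSONWritable;
import com.mongodb.hadoop.io.BSONWritableComparator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ComparatorWiringSketch {
    public static void main(final String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "bson-sort-demo");
        job.setMapOutputKeyClass(BSONWritable.class);
        // Sort BSONWritable keys with the BSON-aware comparator instead of
        // raw byte comparison.
        job.setSortComparatorClass(BSONWritableComparator.class);
    }
}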
/core/src/main/java/com/mongodb/hadoop/io/DataOutputOutputStreamAdapter.java:
--------------------------------------------------------------------------------
1 | package com.mongodb.hadoop.io;
2 |
3 | import java.io.DataOutput;
4 | import java.io.IOException;
5 | import java.io.OutputStream;
6 |
7 | class DataOutputOutputStreamAdapter extends OutputStream {
8 | private final DataOutput dataOutput;
9 |
10 | DataOutputOutputStreamAdapter(final DataOutput dataOutput) {
11 | this.dataOutput = dataOutput;
12 | }
13 |
14 | @Override
15 | public void write(final int b) throws IOException {
16 | dataOutput.write(b);
17 | }
18 |
19 | @Override
20 | public void write(final byte[] b) throws IOException {
21 | dataOutput.write(b);
22 | }
23 |
24 | @Override
25 | public void write(final byte[] b, final int off, final int len) throws IOException {
26 | dataOutput.write(b, off, len);
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
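Usage sketch (not part of the repository): the adapter lets stream-oriented code write into a Hadoop DataOutput. Since the class is package-private, a caller must live in com.mongodb.hadoop.io; the example below only illustrates the pass-through.

package com.mongodb.hadoop.io;

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.OutputStream;

public class AdapterSketch {
    public static void main(final String[] args) throws Exception {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(bytes);
        // Wrap the DataOutput so OutputStream-based code can write to it.
        OutputStream adapted = new DataOutputOutputStreamAdapter(dataOut);
        adapted.write(new byte[]{1, 2, 3});
        adapted.write(4);
        System.out.println(bytes.size()); // 4 bytes passed through
    }
}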
/core/src/main/java/com/mongodb/hadoop/io/MongoWritableTypes.java:
--------------------------------------------------------------------------------
1 | package com.mongodb.hadoop.io;
2 |
3 | // CHECKSTYLE:OFF
4 | public interface MongoWritableTypes {
5 | int BSON_WRITABLE = 0;
6 | int MONGO_UPDATE_WRITABLE = 1;
7 | }
8 | // CHECKSTYLE:ON
9 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/mapred/BSONFileOutputFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.mongodb.hadoop.mapred;
18 |
19 | import com.mongodb.hadoop.mapred.output.BSONFileRecordWriter;
20 | import com.mongodb.hadoop.splitter.BSONSplitter;
21 | import com.mongodb.hadoop.util.MongoConfigUtil;
22 | import org.apache.commons.logging.Log;
23 | import org.apache.commons.logging.LogFactory;
24 | import org.apache.hadoop.fs.FSDataOutputStream;
25 | import org.apache.hadoop.fs.FileSystem;
26 | import org.apache.hadoop.fs.Path;
27 | import org.apache.hadoop.mapred.FileOutputFormat;
28 | import org.apache.hadoop.mapred.JobConf;
29 | import org.apache.hadoop.mapred.RecordWriter;
30 | import org.apache.hadoop.util.Progressable;
31 |
32 | import java.io.IOException;
33 |
34 | public class BSONFileOutputFormat<K, V> extends FileOutputFormat<K, V> {
35 |
36 | public RecordWriter<K, V> getRecordWriter(final FileSystem ignored, final JobConf job, final String name,
37 | final Progressable progress) throws IOException {
38 | Path outPath = getDefaultWorkFile(job, name, ".bson");
39 | LOG.info("output going into " + outPath);
40 |
41 | FileSystem fs = outPath.getFileSystem(job);
42 | FSDataOutputStream outFile = fs.create(outPath);
43 |
44 | FSDataOutputStream splitFile = null;
45 | if (MongoConfigUtil.getBSONOutputBuildSplits(job)) {
46 | Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits");
47 | splitFile = fs.create(splitPath);
48 | }
49 |
50 | long splitSize = BSONSplitter.getSplitSize(job, null);
51 |
52 | return new BSONFileRecordWriter(outFile, splitFile, splitSize);
53 | }
54 |
55 | public static Path getDefaultWorkFile(final JobConf conf, final String name, final String extension) {
56 | return new Path(getWorkOutputPath(conf), getUniqueName(conf, name) + extension);
57 | }
58 |
59 | private static final Log LOG = LogFactory.getLog(BSONFileOutputFormat.class);
60 | }
61 |
62 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/mapred/MongoOutputFormat.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.mongodb.hadoop.mapred;
18 |
19 | import com.mongodb.hadoop.mapred.output.MongoRecordWriter;
20 | import com.mongodb.hadoop.util.MongoConfigUtil;
21 | import org.apache.hadoop.fs.FileSystem;
22 | import org.apache.hadoop.mapred.JobConf;
23 | import org.apache.hadoop.mapred.OutputFormat;
24 | import org.apache.hadoop.mapred.RecordWriter;
25 | import org.apache.hadoop.util.Progressable;
26 |
27 | import java.io.IOException;
28 |
29 | @SuppressWarnings("deprecation")
30 | public class MongoOutputFormat<K, V> implements OutputFormat<K, V> {
31 | public MongoOutputFormat() {
32 | }
33 |
34 | @Override
35 | public void checkOutputSpecs(final FileSystem ignored, final JobConf job) throws IOException {
36 | if (MongoConfigUtil.getOutputURIs(job).isEmpty()) {
37 | throw new IOException("No output URI is specified. You must set mongo.output.uri.");
38 | }
39 | }
40 |
41 | @Override
42 | public RecordWriter<K, V> getRecordWriter(
43 | final FileSystem ignored, final JobConf job, final String name,
44 | final Progressable progress) {
45 | return new MongoRecordWriter(job);
46 | }
47 |
48 | }
49 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/mapred/input/BSONFileSplit.java:
--------------------------------------------------------------------------------
1 | package com.mongodb.hadoop.mapred.input;
2 |
3 | import org.apache.hadoop.fs.Path;
4 | import org.apache.hadoop.io.Text;
5 | import org.apache.hadoop.mapred.FileSplit;
6 |
7 | import java.io.DataInput;
8 | import java.io.DataOutput;
9 | import java.io.IOException;
10 |
11 | public class BSONFileSplit extends FileSplit {
12 |
13 | // CHECKSTYLE:OFF
14 | protected String keyField = "_id";
15 | // CHECKSTYLE:ON
16 |
17 |
18 | public BSONFileSplit(final Path file, final long start, final long
19 | length, final String[] hosts) {
20 | super(file, start, length, hosts);
21 | }
22 |
23 | public BSONFileSplit() { this(null, 0, 0, null); }
24 |
25 | public String getKeyField() { return keyField; }
26 |
27 | public void setKeyField(final String keyField) {
28 | this.keyField = keyField;
29 | }
30 |
31 | @Override
32 | public void write(final DataOutput out) throws IOException {
33 | super.write(out);
34 | Text.writeString(out, getKeyField());
35 | }
36 |
37 | @Override
38 | public void readFields(final DataInput in) throws IOException {
39 | super.readFields(in);
40 | setKeyField(Text.readString(in));
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/mapred/output/BSONFileRecordWriter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.mongodb.hadoop.mapred.output;
18 |
19 | import org.apache.hadoop.fs.FSDataOutputStream;
20 | import org.apache.hadoop.mapred.RecordWriter;
21 | import org.apache.hadoop.mapred.Reporter;
22 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
23 |
24 | import java.io.IOException;
25 |
26 |
27 | public class BSONFileRecordWriter<K, V> extends com.mongodb.hadoop.output.BSONFileRecordWriter<K, V> implements RecordWriter<K, V> {
28 |
29 | public BSONFileRecordWriter(final FSDataOutputStream outFile, final FSDataOutputStream splitFile, final long splitSize) {
30 | super(outFile, splitFile, splitSize);
31 | }
32 |
33 | public void close(final Reporter reporter) throws IOException {
34 | this.close((TaskAttemptContext) null);
35 | }
36 |
37 | }
38 |
39 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/mapred/output/MongoOutputCommitter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 |
18 | package com.mongodb.hadoop.mapred.output;
19 |
20 | import org.apache.hadoop.mapred.JobContext;
21 | import org.apache.hadoop.mapred.OutputCommitter;
22 | import org.apache.hadoop.mapred.TaskAttemptContext;
23 |
24 | import java.io.IOException;
25 |
26 | public class MongoOutputCommitter extends OutputCommitter {
27 | private final com.mongodb.hadoop.output.MongoOutputCommitter delegate;
28 |
29 | public MongoOutputCommitter() {
30 | delegate = new com.mongodb.hadoop.output.MongoOutputCommitter();
31 | }
32 |
33 | @Override
34 | public void abortTask(final TaskAttemptContext taskContext)
35 | throws IOException {
36 | delegate.abortTask(taskContext);
37 | }
38 |
39 | @Override
40 | public void commitTask(final TaskAttemptContext taskContext)
41 | throws IOException {
42 | delegate.commitTask(taskContext);
43 | }
44 |
45 | @Override
46 | public boolean needsTaskCommit(final TaskAttemptContext taskContext)
47 | throws IOException {
48 | return delegate.needsTaskCommit(taskContext);
49 | }
50 |
51 | @Override
52 | public void setupJob(final JobContext jobContext) {
53 | delegate.setupJob(jobContext);
54 | }
55 |
56 | @Override
57 | public void setupTask(final TaskAttemptContext taskContext) {
58 | delegate.setupTask(taskContext);
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/mapred/output/MongoRecordWriter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 |
18 | package com.mongodb.hadoop.mapred.output;
19 |
20 | import com.mongodb.hadoop.util.CompatUtils;
21 | import com.mongodb.hadoop.util.MongoConfigUtil;
22 | import org.apache.hadoop.mapred.JobConf;
23 | import org.apache.hadoop.mapred.RecordWriter;
24 | import org.apache.hadoop.mapred.Reporter;
25 |
26 | public class MongoRecordWriter<K, V>
27 | extends com.mongodb.hadoop.output.MongoRecordWriter<K, V>
28 | implements RecordWriter<K, V> {
29 |
30 | /**
31 | * Create a new MongoRecordWriter.
32 | * @param conf the job configuration
33 | */
34 | public MongoRecordWriter(final JobConf conf) {
35 | super(
36 | MongoConfigUtil.getOutputCollection(conf),
37 | CompatUtils.getTaskAttemptContext(conf, conf.get("mapred.task.id")));
38 | }
39 |
40 | @Override
41 | public void close(final Reporter reporter) {
42 | super.close(null);
43 | }
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/splitter/MongoSplitter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.mongodb.hadoop.splitter;
18 |
19 | import com.mongodb.hadoop.input.MongoInputSplit;
20 | import com.mongodb.hadoop.util.MongoConfigUtil;
21 | import org.apache.hadoop.conf.Configuration;
22 | import org.apache.hadoop.mapreduce.InputSplit;
23 |
24 | import java.util.ArrayList;
25 | import java.util.List;
26 |
27 | public abstract class MongoSplitter {
28 |
29 | private Configuration configuration;
30 |
31 | public MongoSplitter() {
32 | }
33 |
34 | public MongoSplitter(final Configuration configuration) {
35 | setConfiguration(configuration);
36 | }
37 |
38 | public void setConfiguration(final Configuration conf) {
39 | configuration = conf;
40 | }
41 |
42 | public abstract List<InputSplit> calculateSplits() throws SplitFailedException;
43 |
44 | public Configuration getConfiguration() {
45 | return configuration;
46 | }
47 |
48 | /**
49 | * Get a list of nonempty input splits only.
50 | *
51 | * @param splits a list of input splits
52 | * @return a new list of nonempty input splits
53 | */
54 | public static List<InputSplit> filterEmptySplits(
55 | final List<InputSplit> splits) {
56 | List<InputSplit> results = new ArrayList<InputSplit>(splits.size());
57 | for (InputSplit split : splits) {
58 | MongoInputSplit mis = (MongoInputSplit) split;
59 | if (mis.getCursor().hasNext()) {
60 | results.add(mis);
61 | } else {
62 | MongoConfigUtil.close(
63 | mis.getCursor().getCollection().getDB().getMongo());
64 | }
65 | }
66 | return results;
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
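Shape reference (illustrative only, not in the repository): concrete splitters extend MongoSplitter and implement calculateSplits(). The skeleton below mirrors what SingleMongoSplitter, further down, actually does; real splitters return many splits.

import com.mongodb.hadoop.input.MongoInputSplit;
import com.mongodb.hadoop.splitter.MongoSplitter;
import com.mongodb.hadoop.splitter.SplitFailedException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;

import java.util.Collections;
import java.util.List;

public class WholeCollectionSplitter extends MongoSplitter {
    public WholeCollectionSplitter(final Configuration conf) {
        super(conf);
    }

    @Override
    public List<InputSplit> calculateSplits() throws SplitFailedException {
        // One split covering the entire collection.
        return Collections.singletonList(
            (InputSplit) new MongoInputSplit(getConfiguration()));
    }
}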
/core/src/main/java/com/mongodb/hadoop/splitter/SingleMongoSplitter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2010-2013 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.mongodb.hadoop.splitter;
18 |
19 | import com.mongodb.MongoClientURI;
20 | import com.mongodb.hadoop.input.MongoInputSplit;
21 | import com.mongodb.hadoop.util.MongoConfigUtil;
22 | import org.apache.commons.logging.Log;
23 | import org.apache.commons.logging.LogFactory;
24 | import org.apache.hadoop.conf.Configuration;
25 | import org.apache.hadoop.mapreduce.InputSplit;
26 |
27 | import java.util.Collections;
28 | import java.util.List;
29 |
30 | import static java.lang.String.format;
31 |
32 | /* This implementation of MongoSplitter does not actually
33 | * do any splitting, it will just create a single input split
34 | * which represents the entire data set within a collection.
35 | */
36 | public class SingleMongoSplitter extends MongoCollectionSplitter {
37 |
38 | private static final Log LOG = LogFactory.getLog(SingleMongoSplitter.class);
39 |
40 | //Create a single split which consists of a single
41 | //query over the entire collection.
42 |
43 |
44 | public SingleMongoSplitter() {
45 | }
46 |
47 | public SingleMongoSplitter(final Configuration conf) {
48 | super(conf);
49 | }
50 |
51 | @Override
52 | public List<InputSplit> calculateSplits() {
53 | if (LOG.isDebugEnabled()) {
54 | MongoClientURI inputURI =
55 | MongoConfigUtil.getInputURI(getConfiguration());
56 | LOG.debug(format("SingleMongoSplitter calculating splits for namespace: %s.%s; hosts: %s",
57 | inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts()));
58 | }
59 | return Collections.singletonList(
60 | (InputSplit) new MongoInputSplit(getConfiguration()));
61 | }
62 |
63 | }
64 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/splitter/SplitFailedException.java:
--------------------------------------------------------------------------------
1 | package com.mongodb.hadoop.splitter;
2 |
3 | public class SplitFailedException extends Exception {
4 |
5 | public SplitFailedException(final String message) {
6 | super(message);
7 | }
8 |
9 | public SplitFailedException(final String message, final Throwable cause) {
10 | super(message, cause);
11 | }
12 | }
13 |
14 |
--------------------------------------------------------------------------------
/core/src/main/java/com/mongodb/hadoop/util/MongoPathRetriever.java:
--------------------------------------------------------------------------------
1 | package com.mongodb.hadoop.util;
2 |
3 | import org.bson.BSONObject;
4 |
5 | import java.util.List;
6 |
7 | /**
8 | * Utility class providing a mechanism for retrieving data nested within
9 | * a MongoDB document.
10 | */
11 | public final class MongoPathRetriever {
12 |
13 | private MongoPathRetriever() {}
14 |
15 | /**
16 | * Returns the Object stored at a given path within a MongoDB
17 | * document. Returns null if the path is not found.
18 | *
19 | * @param document MongoDB document in which to search.
20 | * @param path Dot-separated path to look up.
21 | * @return the Object stored at the path within the document.
22 | */
23 | public static Object get(final BSONObject document, final String path) {
24 | String[] parts = path.split("\\.");
25 | Object o = document;
26 | for (String part : parts) {
27 | if (null == o) {
28 | return null;
29 | } else if (o instanceof List) {
30 | try {
31 | int index = Integer.parseInt(part);
32 | if (((List) o).size() > index && index >= 0) {
33 | o = ((List) o).get(index);
34 | } else {
35 | return null;
36 | }
37 | } catch (NumberFormatException e) {
38 | return null;
39 | }
40 | } else if (o instanceof BSONObject) {
41 | o = ((BSONObject) o).get(part);
42 | } else {
43 | // Hit a leaf before finding the key we were looking for.
44 | return null;
45 | }
46 | }
47 | return o;
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
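Usage sketch (not part of the repository) with an invented document: numeric path segments index into lists, and traversing past a leaf yields null.

import com.mongodb.hadoop.util.MongoPathRetriever;
import org.bson.BasicBSONObject;
import org.bson.types.BasicBSONList;

public class MongoPathRetrieverExample {
    public static void main(final String[] args) {
        BasicBSONList tags = new BasicBSONList();
        tags.add("alpha");
        tags.add("beta");
        BasicBSONObject doc = new BasicBSONObject("name", "sample")
            .append("tags", tags);

        System.out.println(MongoPathRetriever.get(doc, "tags.1")); // beta
        System.out.println(MongoPathRetriever.get(doc, "tags.9")); // null (out of range)
        System.out.println(MongoPathRetriever.get(doc, "name.x")); // null (hit a leaf)
    }
}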
/core/src/main/java/com/mongodb/hadoop/util/SplitFriendlyDBCallback.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Copyright (c) 2010, 2011 10gen, Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
5 | * the License. You may obtain a copy of the License at
6 | *
7 | * http://www.apache.org/licenses/LICENSE-2.0
8 | *
9 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
10 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
11 | * specific language governing permissions and limitations under the License.
12 | */
13 |
14 | package com.mongodb.hadoop.util;
15 |
16 | import com.mongodb.DBCallback;
17 | import com.mongodb.DBCallbackFactory;
18 | import com.mongodb.DBCollection;
19 | import com.mongodb.DefaultDBCallback;
20 |
21 | public class SplitFriendlyDBCallback extends DefaultDBCallback {
22 |
23 | static final class MinKey {
24 | }
25 |
26 | static final class MaxKey {
27 | }
28 |
29 | static class SplitFriendlyFactory implements DBCallbackFactory {
30 | public DBCallback create(final DBCollection collection) {
31 | return new SplitFriendlyDBCallback(collection);
32 | }
33 | }
34 |
35 | public static final DBCallbackFactory FACTORY = new SplitFriendlyFactory();
36 | public static final MinKey MIN_KEY_TYPE = new MinKey();
37 | public static final MaxKey MAX_KEY_TYPE = new MaxKey();
38 |
39 | public SplitFriendlyDBCallback(final DBCollection coll) {
40 | super(coll);
41 | }
42 |
43 | @Override
44 | public void gotMinKey(final String name) {
45 | cur().put(name, MIN_KEY_TYPE);
46 | }
47 |
48 | @Override
49 | public void gotMaxKey(final String name) {
50 | cur().put(name, MAX_KEY_TYPE);
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/core/src/test/java/com/mongodb/hadoop/BSONFileInputFormatTest.java:
--------------------------------------------------------------------------------
1 | package com.mongodb.hadoop;
2 |
3 | import com.mongodb.hadoop.io.BSONWritable;
4 | import com.mongodb.hadoop.mapred.BSONFileInputFormat;
5 | import org.apache.hadoop.io.NullWritable;
6 | import org.apache.hadoop.mapred.FileSplit;
7 | import org.apache.hadoop.mapred.JobConf;
8 | import org.apache.hadoop.mapred.RecordReader;
9 | import org.junit.Test;
10 |
11 | import java.io.File;
12 | import java.io.IOException;
13 |
14 | import static com.mongodb.hadoop.testutils.BaseHadoopTest.EXAMPLE_DATA_HOME;
15 | import static org.junit.Assert.assertEquals;
16 |
17 | public class BSONFileInputFormatTest {
18 |
19 | @Test
20 | public void enronEmails() throws IOException {
21 | BSONFileInputFormat inputFormat = new BSONFileInputFormat();
22 | JobConf job = new JobConf();
23 | String inputDirectory =
24 | new File(EXAMPLE_DATA_HOME, "/dump/enron_mail/messages.bson")
25 | .getAbsoluteFile().toURI().toString();
26 | // Hadoop 2.X
27 | job.set("mapreduce.input.fileinputformat.inputdir", inputDirectory);
28 | // Hadoop 1.2.X
29 | job.set("mapred.input.dir", inputDirectory);
30 | FileSplit[] splits = inputFormat.getSplits(job, 5);
31 | int count = 0;
32 | BSONWritable writable = new BSONWritable();
33 | for (FileSplit split : splits) {
34 | RecordReader<NullWritable, BSONWritable> recordReader = inputFormat.getRecordReader(split, job, null);
35 | while (recordReader.next(null, writable)) {
36 | count++;
37 | }
38 | }
39 | assertEquals("There are 501513 messages in the enron corpus", 501513, count);
40 | }
41 | }
--------------------------------------------------------------------------------
/core/src/test/java/com/mongodb/hadoop/HadoopVersionFilter.java:
--------------------------------------------------------------------------------
1 | package com.mongodb.hadoop;
2 |
3 | import com.mongodb.hadoop.testutils.BaseHadoopTest;
4 |
5 | import java.io.File;
6 | import java.io.FileFilter;
7 |
8 | public class HadoopVersionFilter implements FileFilter {
9 | private final boolean findTestJar;
10 | private static final String PROD_FORMAT = String.format("-%s.jar", BaseHadoopTest.PROJECT_VERSION);
11 | private static final String TEST_FORMAT = String.format("%s-tests.jar", BaseHadoopTest.PROJECT_VERSION);
12 |
13 | public HadoopVersionFilter() {
14 | this(false);
15 | }
16 |
17 | public HadoopVersionFilter(final boolean findTestJar) {
18 | this.findTestJar = findTestJar;
19 | }
20 |
21 | @Override
22 | public boolean accept(final File pathname) {
23 | return findTestJar ? pathname.getName().endsWith(TEST_FORMAT) : pathname.getName().endsWith(PROD_FORMAT);
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/core/src/test/java/com/mongodb/hadoop/MongoConfigUnitTests.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2011 10gen Inc.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.mongodb.hadoop;
18 |
19 | // Hadoop
20 |
21 | import org.apache.hadoop.conf.Configuration;
22 | import org.junit.Test;
23 |
24 | import static org.junit.Assert.assertNotNull;
25 |
26 | // JUnit
27 |
28 | /**
29 | * The mongo config unit tests.
30 | */
31 | public final class MongoConfigUnitTests {
32 |
33 | @Test
34 | public void testConstructor() {
35 | assertNotNull(new MongoConfig(new Configuration(false)));
36 | }
37 | }
38 |
39 |
--------------------------------------------------------------------------------
/core/src/test/java/com/mongodb/hadoop/bookstore/BookstoreConfig.java:
--------------------------------------------------------------------------------
1 | package com.mongodb.hadoop.bookstore;
2 |
3 | import com.mongodb.hadoop.MongoConfig;
4 | import com.mongodb.hadoop.MongoInputFormat;
5 | import com.mongodb.hadoop.MongoOutputFormat;
6 | import com.mongodb.hadoop.io.BSONWritable;
7 | import com.mongodb.hadoop.io.MongoUpdateWritable;
8 | import com.mongodb.hadoop.util.MongoTool;
9 | import org.apache.hadoop.conf.Configuration;
10 | import org.apache.hadoop.io.Text;
11 | import org.apache.hadoop.util.ToolRunner;
12 |
13 | public class BookstoreConfig extends MongoTool {
14 | public BookstoreConfig() {
15 | this(new Configuration());
16 | }
17 |
18 | public BookstoreConfig(final Configuration configuration) {
19 | MongoConfig config = new MongoConfig(configuration);
20 | setConf(configuration);
21 |
22 | config.setInputFormat(MongoInputFormat.class);
23 |
24 | config.setMapper(TagsMapper.class);
25 | config.setMapperOutputKey(Text.class);
26 | config.setMapperOutputValue(BSONWritable.class);
27 |
28 | config.setReducer(TagsReducer.class);
29 | config.setOutputKey(Text.class);
30 | config.setOutputValue(MongoUpdateWritable.class);
31 | config.setOutputFormat(MongoOutputFormat.class);
32 | }
33 |
34 | public static void main(final String[] pArgs) throws Exception {
35 | System.exit(ToolRunner.run(new BookstoreConfig(), pArgs));
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/core/src/test/java/com/mongodb/hadoop/bookstore/TagsMapper.java:
--------------------------------------------------------------------------------
1 | package com.mongodb.hadoop.bookstore;
2 |
3 | import com.mongodb.hadoop.io.BSONWritable;
4 | import org.apache.hadoop.io.Text;
5 | import org.apache.hadoop.mapred.JobConf;
6 | import org.apache.hadoop.mapred.OutputCollector;
7 | import org.apache.hadoop.mapred.Reporter;
8 | import org.apache.hadoop.mapreduce.Mapper;
9 | import org.bson.BSONObject;
10 | import org.bson.types.BasicBSONList;
11 |
12 | import java.io.IOException;
13 |
14 | public class TagsMapper extends Mapper