├── .evergreen ├── compile.sh ├── config.yml └── run-tests.sh ├── .gitignore ├── CONTRIBUTORS.md ├── History.md ├── README.md ├── build.gradle ├── clusterConfigs ├── core-site.xml ├── hdfs-site.xml ├── hive-site.xml └── mapred-site.xml ├── config ├── checkstyle-lite.xml ├── checkstyle.xml └── findbugs-exclude.xml ├── core └── src │ ├── main │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ ├── BSONFileInputFormat.java │ │ ├── BSONFileOutputFormat.java │ │ ├── BSONPathFilter.java │ │ ├── GridFSInputFormat.java │ │ ├── MongoConfig.java │ │ ├── MongoInputFormat.java │ │ ├── MongoOutput.java │ │ ├── MongoOutputFormat.java │ │ ├── input │ │ ├── BSONFileRecordReader.java │ │ ├── BSONFileSplit.java │ │ ├── GridFSSplit.java │ │ ├── MongoInputSplit.java │ │ └── MongoRecordReader.java │ │ ├── io │ │ ├── BSONWritable.java │ │ ├── BSONWritableComparator.java │ │ ├── DataOutputOutputStreamAdapter.java │ │ ├── MongoUpdateWritable.java │ │ └── MongoWritableTypes.java │ │ ├── mapred │ │ ├── BSONFileInputFormat.java │ │ ├── BSONFileOutputFormat.java │ │ ├── MongoInputFormat.java │ │ ├── MongoOutputFormat.java │ │ ├── input │ │ │ ├── BSONFileRecordReader.java │ │ │ ├── BSONFileSplit.java │ │ │ └── MongoRecordReader.java │ │ └── output │ │ │ ├── BSONFileRecordWriter.java │ │ │ ├── MongoOutputCommitter.java │ │ │ └── MongoRecordWriter.java │ │ ├── output │ │ ├── BSONFileRecordWriter.java │ │ ├── MongoOutputCommitter.java │ │ └── MongoRecordWriter.java │ │ ├── splitter │ │ ├── BSONSplitter.java │ │ ├── MongoCollectionSplitter.java │ │ ├── MongoPaginatingSplitter.java │ │ ├── MongoSplitter.java │ │ ├── MongoSplitterFactory.java │ │ ├── MultiCollectionSplitBuilder.java │ │ ├── MultiMongoCollectionSplitter.java │ │ ├── SampleSplitter.java │ │ ├── ShardChunkMongoSplitter.java │ │ ├── ShardMongoSplitter.java │ │ ├── SingleMongoSplitter.java │ │ ├── SplitFailedException.java │ │ └── StandaloneMongoSplitter.java │ │ └── util │ │ ├── BSONComparator.java │ │ ├── BSONLoader.java │ │ ├── CompatUtils.java │ │ ├── MapredMongoConfigUtil.java │ │ ├── MongoClientURIBuilder.java │ │ ├── MongoConfigUtil.java │ │ ├── MongoPathRetriever.java │ │ ├── MongoTool.java │ │ └── SplitFriendlyDBCallback.java │ └── test │ ├── java │ └── com │ │ └── mongodb │ │ └── hadoop │ │ ├── BSONFileInputFormatTest.java │ │ ├── GridFSInputFormatTest.java │ │ ├── HadoopVersionFilter.java │ │ ├── MongoConfigUnitTests.java │ │ ├── MongoOutputCommitterTest.java │ │ ├── bookstore │ │ ├── BookstoreConfig.java │ │ ├── BookstoreTest.java │ │ ├── TagsMapper.java │ │ └── TagsReducer.java │ │ ├── io │ │ ├── BSONWritableTest.java │ │ ├── MongoInputSplitTest.java │ │ └── MongoUpdateWritableTest.java │ │ ├── mapred │ │ └── BSONFileInputFormatTest.java │ │ ├── splitter │ │ ├── BSONFileRecordReaderTest.java │ │ ├── BSONSplitterTest.java │ │ ├── MongoPaginatingSplitterTest.java │ │ ├── MongoRecordReaderTest.java │ │ ├── MongoSplitterFactoryTest.java │ │ ├── MongoSplitterTestUtils.java │ │ ├── SampleSplitterTest.java │ │ ├── ShardChunkMongoSplitterTest.java │ │ └── StandaloneMongoSplitterTest.java │ │ ├── testutils │ │ ├── BaseHadoopTest.java │ │ └── MapReduceJob.java │ │ └── util │ │ └── MongoConfigUtilTest.java │ └── resources │ └── bookstore-dump │ ├── inventory.bson │ ├── orders.bson │ ├── publishers.bson │ └── system.indexes.bson ├── examples ├── elastic-mapreduce │ ├── emr-bootstrap.sh │ ├── run_emr_job.sh │ └── update_s3.sh ├── enron │ ├── hive │ │ └── hive_enron.q │ ├── pig │ │ └── pig_enron.pig │ ├── run_job.sh │ ├── spark │ │ └── src │ │ │ └── main │ 
│ │ └── java │ │ │ └── com │ │ │ └── mongodb │ │ │ └── spark │ │ │ └── examples │ │ │ └── enron │ │ │ ├── DataframeExample.java │ │ │ ├── Enron.java │ │ │ └── Message.java │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── examples │ │ └── enron │ │ ├── EnronMail.java │ │ ├── EnronMailMapper.java │ │ ├── EnronMailReducer.java │ │ └── MailPair.java ├── sensors │ ├── run_job.sh │ ├── src │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── mongodb │ │ │ └── hadoop │ │ │ └── examples │ │ │ └── sensors │ │ │ ├── DeviceMapper.java │ │ │ ├── DeviceReducer.java │ │ │ ├── Devices.java │ │ │ ├── LogCombiner.java │ │ │ ├── LogMapper.java │ │ │ ├── LogReducer.java │ │ │ ├── Logs.java │ │ │ └── SensorDataGenerator.java │ └── testdata_generator.js ├── shakespeare │ └── src │ │ └── main │ │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── examples │ │ └── shakespeare │ │ ├── PrepareShakespeare.java │ │ └── Shakespeare.java └── treasury_yield │ ├── pig │ └── pig_mongo_test.pig │ ├── run_job.sh │ └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── mongodb │ │ │ └── hadoop │ │ │ └── examples │ │ │ └── treasury │ │ │ ├── TreasuryYieldMapper.java │ │ │ ├── TreasuryYieldMulti.java │ │ │ ├── TreasuryYieldReducer.java │ │ │ ├── TreasuryYieldUpdateReducer.java │ │ │ └── TreasuryYieldXMLConfig.java │ └── resources │ │ ├── commons-logging.properties │ │ ├── mongo-defaults.xml │ │ ├── parse_yield_historical.py │ │ ├── yield_historical_Jan90_Sep10.xml │ │ └── yield_historical_in.json │ └── test │ ├── java │ └── com │ │ └── mongodb │ │ └── hadoop │ │ ├── BaseShardedTest.java │ │ ├── JarFinder.java │ │ ├── StreamingJob.java │ │ ├── TestSharded.java │ │ ├── TestStandalone.java │ │ ├── TestStreaming.java │ │ └── TreasuryTest.java │ └── resources │ ├── commons-logging.properties │ ├── log4j.properties │ └── yarn-site.xml ├── flume └── src │ └── main │ └── java │ └── com │ └── mongodb │ └── flume │ ├── BucketedMongoDBSink.java │ └── MongoDBSink.java ├── gradle ├── functions.gradle ├── hadoop.gradle ├── maven-deployment.gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── hive └── src │ ├── main │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── hive │ │ ├── BSONSerDe.java │ │ ├── MongoStorageHandler.java │ │ ├── input │ │ └── HiveMongoInputFormat.java │ │ └── output │ │ ├── HiveBSONFileOutputFormat.java │ │ └── HiveMongoOutputFormat.java │ └── test │ ├── java │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── hive │ │ ├── BSONSerDeTest.java │ │ ├── HiveMappingTest.java │ │ ├── HiveQueryTest.java │ │ ├── HiveTest.java │ │ ├── MongoStorageHandlerTest.java │ │ ├── Results.java │ │ ├── TablePropertiesTest.java │ │ ├── TestBsonToHive.java │ │ ├── TestHDFSToMongoDB.java │ │ ├── TestHDFSToMongoDBWithOptions.java │ │ └── input │ │ └── HiveMongoInputFormatTest.java │ └── resources │ ├── core-site.xml │ ├── hivetable.properties │ ├── log4j.properties │ ├── test_data.txt │ ├── users.bson │ └── yarn-site.xml ├── mongo-defaults.xml ├── pig └── src │ ├── main │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── pig │ │ ├── BSONLoader.java │ │ ├── BSONStorage.java │ │ ├── JSONPigReplace.java │ │ ├── MongoInsertStorage.java │ │ ├── MongoLoader.java │ │ ├── MongoStorage.java │ │ ├── MongoStorageOptions.java │ │ ├── MongoUpdateStorage.java │ │ └── udf │ │ ├── ByteArrayTypeEvalFunc.java │ │ ├── GenMaxKey.java │ │ ├── GenMinKey.java │ │ ├── ObjectIdToSeconds.java │ │ ├── ToBinary.java │ │ ├── ToDBRef.java │ │ ├── 
ToObjectId.java │ │ └── types │ │ ├── PigBoxedBSONValue.java │ │ ├── PigBoxedBinary.java │ │ ├── PigBoxedDBRef.java │ │ ├── PigBoxedMaxKey.java │ │ ├── PigBoxedMinKey.java │ │ └── PigBoxedObjectId.java │ └── test │ ├── java │ ├── com │ │ └── mongodb │ │ │ └── hadoop │ │ │ └── pig │ │ │ ├── BSONStorageTest.java │ │ │ ├── JSONPigReplaceTest.java │ │ │ ├── MongoLoaderTest.java │ │ │ ├── MongoStorageOptionsTest.java │ │ │ ├── MongoStorageTest.java │ │ │ ├── PigTest.java │ │ │ └── UDFTest.java │ └── helpers │ │ └── TOBAG.java │ └── resources │ ├── dump │ └── test │ │ ├── persons_info.bson │ │ └── persons_info.metadata.json │ └── pig │ ├── bson_schemaless.pig │ ├── bson_test.pig │ ├── datestest.pig │ ├── ensure_index.pig │ ├── ensure_index_2.pig │ ├── genminmaxkeys.pig │ ├── oidtoseconds.pig │ ├── pig_uuid.pig │ ├── projection.pig │ ├── replace_mus.pig │ ├── schemaless.pig │ ├── tobinary.pig │ ├── todbref.pig │ ├── toobjectid.pig │ ├── udfschemaless.pig │ ├── update_age_alabis_mus.pig │ └── update_simple_mus.pig ├── settings.gradle ├── spark └── src │ └── main │ ├── java │ └── com │ │ └── mongodb │ │ └── spark │ │ ├── PySparkBSONFileInputFormat.java │ │ ├── PySparkBSONFileOutputFormat.java │ │ ├── PySparkMongoInputFormat.java │ │ ├── PySparkMongoOutputFormat.java │ │ └── pickle │ │ ├── BSONPickler.java │ │ ├── BSONValueBox.java │ │ ├── BinaryConstructor.java │ │ ├── CalendarTransformer.java │ │ ├── CodeConstructor.java │ │ ├── DBRefConstructor.java │ │ ├── Int64Constructor.java │ │ ├── MaxKeyConstructor.java │ │ ├── MinKeyConstructor.java │ │ ├── ObjectIdConstructor.java │ │ ├── RegexConstructor.java │ │ ├── RegisterConstructors.java │ │ ├── RegisterPickles.java │ │ └── TimestampConstructor.java │ ├── python │ ├── README.rst │ ├── pymongo_spark.py │ ├── setup.py │ └── test │ │ ├── __init__.py │ │ └── test_pymongo_spark.py │ └── scala │ └── com │ └── mongodb │ └── spark │ └── pickle │ └── NoopConverter.scala ├── streaming ├── examples │ ├── enron │ │ ├── enron_map.js │ │ ├── enron_map.py │ │ ├── enron_map.rb │ │ ├── enron_reduce.js │ │ ├── enron_reduce.py │ │ ├── enron_reduce.rb │ │ ├── run_enron.sh │ │ ├── run_enron_js.sh │ │ └── run_enron_rb.sh │ ├── treasury │ │ ├── mapper.py │ │ ├── mapper.rb │ │ ├── mapper_kv.py │ │ ├── mapper_kv.rb │ │ ├── reducer.py │ │ ├── reducer.rb │ │ ├── reducer_kv.py │ │ ├── reducer_kv.rb │ │ ├── run_treas_kv_py.sh │ │ ├── run_treas_kv_rb.sh │ │ ├── run_treas_py.sh │ │ └── run_treas_rb.sh │ └── twitter │ │ ├── README.md │ │ ├── run_twit_py.sh │ │ ├── run_twit_rb.sh │ │ ├── twit_hashtag_map.py │ │ ├── twit_hashtag_reduce.py │ │ ├── twit_map.py │ │ ├── twit_map.rb │ │ ├── twit_reduce.py │ │ └── twit_reduce.rb ├── language_support │ ├── js │ │ ├── node_mongo_hadoop.js │ │ └── package.json │ ├── python │ │ ├── README.rst │ │ ├── distribute_setup.py │ │ ├── pymongo_hadoop │ │ │ ├── __init__.py │ │ │ ├── input.py │ │ │ ├── mapper.py │ │ │ ├── output.py │ │ │ └── reducer.py │ │ ├── setup.py │ │ └── test_install.py │ └── ruby │ │ ├── README.md │ │ ├── bin │ │ └── mongo-hadoop │ │ ├── lib │ │ ├── mongo-hadoop.rb │ │ └── mongo-hadoop │ │ │ ├── input.rb │ │ │ ├── mapper.rb │ │ │ ├── output.rb │ │ │ └── reducer.rb │ │ ├── mongo-hadoop.gemspec │ │ └── templates │ │ ├── mapper.tt │ │ ├── reducer.tt │ │ └── runner.tt └── src │ ├── main │ └── java │ │ └── com │ │ └── mongodb │ │ └── hadoop │ │ └── streaming │ │ ├── MongoOutput.java │ │ └── io │ │ ├── MongoIdentifierResolver.java │ │ ├── MongoInputWriter.java │ │ ├── MongoOutputReader.java │ │ ├── MongoUpdateInputWriter.java │ │ └── 
MongoUpdateOutputReader.java │ └── test │ └── java │ └── com │ └── mongodb │ └── hadoop │ └── streaming │ └── io │ └── MongoUpdateOutputReaderTest.java └── test.sh /.evergreen/compile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o xtrace # Write all commands first to stderr 4 | set -o errexit # Exit the script with error if any of the commands fail 5 | 6 | ############################################ 7 | # Main Program # 8 | ############################################ 9 | 10 | # We always compile with the latest version of java 11 | export JAVA_HOME="/opt/java/jdk8" 12 | ./gradlew -version 13 | ./gradlew -PxmlReports.enabled=true --info -x test clean check jar testClasses javadoc 14 | -------------------------------------------------------------------------------- /.evergreen/run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o xtrace # Write all commands first to stderr 4 | set -o errexit # Exit the script with error if any of the commands fail 5 | 6 | # Supported/used environment variables: 7 | # MONGODB_BINARIES The location of the MongoDB binaries, e.g. /usr/local/bin 8 | # HADOOP_VERSION Sets the version of Hadoop to be used. 9 | # AUTH Set to enable authentication. Values are: "auth" / "noauth" (default) 10 | # JDK Set the version of java to be used. Java versions can be set from the java toolchain /opt/java 11 | # "jdk5", "jdk6", "jdk7", "jdk8" 12 | 13 | MONGODB_BINARIES=${MONGODB_BINARIES:-} 14 | AUTH=${AUTH:-noauth} 15 | JDK=${JDK:-jdk} 16 | PROJECT_DIRECTORY=${PROJECT_DIRECTORY:-} 17 | 18 | export HADOOP_VERSION=${HADOOP_VERSION:-2.7.2} 19 | export HADOOP_PREFIX=$PROJECT_DIRECTORY/hadoop-binaries/hadoop-$HADOOP_VERSION 20 | export HADOOP_HOME=$HADOOP_PREFIX 21 | export HADOOP_USER_CLASSPATH_FIRST=true 22 | export HIVE_HOME=$PROJECT_DIRECTORY/hadoop-binaries/apache-hive-1.2.1-bin 23 | 24 | export JAVA_HOME="/opt/java/${JDK}" 25 | 26 | ./gradlew -version 27 | ./gradlew -Dmongodb_bin_dir=${MONGODB_BINARIES} -Dmongodb_option=${AUTH} -DHADOOP_VERSION=${HADOOP_VERSION} --stacktrace jar testsJar test cleanHadoop -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *#* 2 | *.crc 3 | *.gem 4 | *.iml 5 | *.ipr 6 | *.iws 7 | *.log 8 | *.out 9 | *.pyc 10 | *.splits 11 | *.swp 12 | *~ 13 | .DS* 14 | .classpath 15 | .gradle 16 | .idea 17 | .project 18 | TempStatsStore/ 19 | WDI_GDF_Data.csv 20 | bin/hadoop-all.sh 21 | build 22 | examples/data 23 | logs 24 | out 25 | metastore_db/ 26 | streaming/language_support/python/dist/ 27 | streaming/language_support/python/pymongo_hadoop.egg-info/ 28 | tags 29 | target 30 | test-*.out 31 | hadoop-binaries 32 | -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | * Mike O'Brien (mikeo@10gen.com) 2 | * Brendan McAdams brendan@10gen.com 3 | * Eliot Horowitz erh@10gen.com 4 | * Ryan Nitz ryan@10gen.com 5 | * Russell Jurney (@rjurney) (Lots of significant Pig improvements) 6 | * Sarthak Dudhara sarthak.83@gmail.com (BSONWritable comparable interface) 7 | * Priya Manda priyakanth024@gmail.com (Test Harness Code) 8 | * Rushin Shah rushin10@gmail.com (Test Harness Code) 9 | * Joseph Shraibman jks@iname.com (Sharded Input Splits) 10 | * Sumin Xia xiasumin1984@gmail.com 
(Sharded Input Splits) 11 | * Jeremy Karn 12 | * bpfoster 13 | * Ross Lawley 14 | * Carsten Hufe 15 | * Asya Kamsky 16 | * Thomas Millar 17 | * Justin Lee 18 | * Luke Lovett 19 | * Mariano Semelman 20 | * Jordan Gwyn 21 | * Powerrr 22 | -------------------------------------------------------------------------------- /clusterConfigs/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 26 | 27 | fs.default.name 28 | hdfs://localhost:8020 29 | 30 | 31 | hadoop.tmp.dir 32 | @HADOOP_BINARIES@/hadoop-tmpdir 33 | 34 | 35 | -------------------------------------------------------------------------------- /clusterConfigs/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | mapred.job.tracker 22 | localhost:8021 23 | 24 | 25 | hadoop.tmp.dir 26 | @HADOOP_BINARIES@/hadoop-tmpdir 27 | 28 | 29 | -------------------------------------------------------------------------------- /clusterConfigs/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | javax.jdo.option.ConnectionURL 6 | jdbc:derby:;databaseName=@HIVE_HOME@/metastore_db;create=true 7 | 8 | 9 | hive.metastore.warehouse.dir 10 | hdfs://localhost:8020/user/hive/warehouse 11 | 12 | 13 | dfs.datanode.address 14 | 50010 15 | 16 | 17 | hive.aux.jars.path 18 | @HIVE_HOME@/lib/mongo-hadoop-hive.jar 19 | 20 | 21 | -------------------------------------------------------------------------------- /clusterConfigs/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | 17 | 18 | 19 | 20 | 21 | mapred.job.tracker 22 | localhost:8021 23 | 24 | 30 | -------------------------------------------------------------------------------- /config/findbugs-exclude.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/BSONFileOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mongodb.hadoop; 18 | 19 | import com.mongodb.hadoop.output.BSONFileRecordWriter; 20 | import com.mongodb.hadoop.splitter.BSONSplitter; 21 | import com.mongodb.hadoop.util.MongoConfigUtil; 22 | import org.apache.commons.logging.Log; 23 | import org.apache.commons.logging.LogFactory; 24 | import org.apache.hadoop.fs.FSDataOutputStream; 25 | import org.apache.hadoop.fs.FileSystem; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.mapreduce.RecordWriter; 28 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 29 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 30 | 31 | import java.io.IOException; 32 | 33 | public class BSONFileOutputFormat extends FileOutputFormat { 34 | 35 | @Override 36 | public RecordWriter getRecordWriter(final TaskAttemptContext context) throws IOException { 37 | // Open data output stream 38 | 39 | Path outPath = getDefaultWorkFile(context, ".bson"); 40 | LOG.info("output going into " + outPath); 41 | 42 | FileSystem fs = outPath.getFileSystem(context.getConfiguration()); 43 | FSDataOutputStream outFile = fs.create(outPath); 44 | 45 | FSDataOutputStream splitFile = null; 46 | if (MongoConfigUtil.getBSONOutputBuildSplits(context.getConfiguration())) { 47 | Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits"); 48 | splitFile = fs.create(splitPath); 49 | } 50 | 51 | long splitSize = BSONSplitter.getSplitSize(context.getConfiguration(), null); 52 | return new BSONFileRecordWriter(outFile, splitFile, splitSize); 53 | } 54 | 55 | private static final Log LOG = LogFactory.getLog(BSONFileOutputFormat.class); 56 | } 57 | 58 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/BSONPathFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mongodb.hadoop; 17 | 18 | import org.apache.commons.logging.Log; 19 | import org.apache.commons.logging.LogFactory; 20 | import org.apache.hadoop.fs.Path; 21 | import org.apache.hadoop.fs.PathFilter; 22 | 23 | public class BSONPathFilter implements PathFilter { 24 | 25 | private static final Log LOG = LogFactory.getLog(BSONPathFilter.class); 26 | 27 | public BSONPathFilter() { 28 | LOG.info("path filter constructed."); 29 | } 30 | 31 | public boolean accept(final Path path) { 32 | String pathName = path.getName().toLowerCase(); 33 | boolean acceptable = pathName.endsWith(".bson") && !pathName.startsWith("."); 34 | LOG.info(path.toString() + " returning " + acceptable); 35 | return acceptable; 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/MongoOutput.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop; 18 | 19 | // Mongo 20 | 21 | import com.mongodb.DBObject; 22 | 23 | public interface MongoOutput { 24 | void appendAsKey(DBObject o); 25 | 26 | void appendAsValue(DBObject o); 27 | } 28 | 29 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/MongoOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop; 18 | 19 | import com.mongodb.hadoop.output.MongoOutputCommitter; 20 | import com.mongodb.hadoop.output.MongoRecordWriter; 21 | import com.mongodb.hadoop.util.MongoConfigUtil; 22 | import org.apache.hadoop.mapreduce.JobContext; 23 | import org.apache.hadoop.mapreduce.OutputCommitter; 24 | import org.apache.hadoop.mapreduce.OutputFormat; 25 | import org.apache.hadoop.mapreduce.RecordWriter; 26 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 27 | 28 | import java.io.IOException; 29 | 30 | public class MongoOutputFormat extends OutputFormat { 31 | public void checkOutputSpecs(final JobContext context) throws IOException { 32 | if (MongoConfigUtil.getOutputURIs(context.getConfiguration()).isEmpty()) { 33 | throw new IOException("No output URI is specified. You must set mongo.output.uri."); 34 | } 35 | } 36 | 37 | public OutputCommitter getOutputCommitter(final TaskAttemptContext context) { 38 | return new MongoOutputCommitter(); 39 | } 40 | 41 | /** 42 | * Get the record writer that points to the output collection. 43 | */ 44 | public RecordWriter getRecordWriter(final TaskAttemptContext context) { 45 | return new MongoRecordWriter( 46 | MongoConfigUtil.getOutputCollection(context.getConfiguration()), 47 | context); 48 | } 49 | 50 | public MongoOutputFormat() {} 51 | 52 | /** 53 | * @param updateKeys ignored 54 | * @param multiUpdate ignored 55 | * @deprecated this constructor is no longer useful. 
56 | */ 57 | @Deprecated 58 | public MongoOutputFormat(final String[] updateKeys, final boolean multiUpdate) { 59 | this(); 60 | } 61 | } -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/input/BSONFileSplit.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.input; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapreduce.lib.input.FileSplit; 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.io.IOException; 10 | 11 | public class BSONFileSplit extends FileSplit { 12 | 13 | // CHECKSTYLE:OFF 14 | protected String keyField = "_id"; 15 | // CHECKSTYLE:ON 16 | 17 | public BSONFileSplit(final Path file, final long start, final long length, 18 | final String[] hosts) { 19 | super(file, start, length, hosts); 20 | } 21 | 22 | public BSONFileSplit() { this(null, 0, 0, null); } 23 | 24 | public String getKeyField() { 25 | return keyField; 26 | } 27 | 28 | public void setKeyField(final String keyField) { 29 | this.keyField = keyField; 30 | } 31 | 32 | @Override 33 | public void write(final DataOutput out) throws IOException { 34 | super.write(out); 35 | Text.writeString(out, getKeyField()); 36 | } 37 | 38 | @Override 39 | public void readFields(final DataInput in) throws IOException { 40 | super.readFields(in); 41 | setKeyField(Text.readString(in)); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/io/BSONWritableComparator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mongodb.hadoop.io; 18 | 19 | import com.mongodb.hadoop.util.BSONComparator; 20 | import org.apache.commons.logging.Log; 21 | import org.apache.commons.logging.LogFactory; 22 | import org.apache.hadoop.io.WritableComparable; 23 | import org.apache.hadoop.io.WritableComparator; 24 | 25 | public class BSONWritableComparator extends WritableComparator { 26 | 27 | private static final Log LOG = LogFactory.getLog(BSONWritableComparator.class); 28 | 29 | public BSONWritableComparator() { 30 | super(BSONWritable.class, true); 31 | } 32 | 33 | protected BSONWritableComparator(final Class keyClass) { 34 | super(keyClass, true); 35 | } 36 | 37 | protected BSONWritableComparator(final Class keyClass, final boolean createInstances) { 38 | super(keyClass, createInstances); 39 | } 40 | 41 | public int compare(final WritableComparable a, final WritableComparable b) { 42 | if (a instanceof BSONWritable && b instanceof BSONWritable) { 43 | return BSONComparator.getInstance().compare(((BSONWritable) a).getDoc(), ((BSONWritable) b).getDoc()); 44 | } else { 45 | //return super.compare( a, b ); 46 | return -1; 47 | } 48 | } 49 | 50 | public int compare(final byte[] b1, final int s1, final int l1, final byte[] b2, final int s2, final int l2) { 51 | //return BSONComparator.getInstance().compare(b1, s1, l1, b2, s2, l2); 52 | return super.compare(b1, s1, l1, b2, s2, l2); 53 | } 54 | 55 | public int compare(final Object a, final Object b) { 56 | return BSONComparator.getInstance().compare(((BSONWritable) a).getDoc(), ((BSONWritable) b).getDoc()); 57 | //return super.compare( a, b ); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/io/DataOutputOutputStreamAdapter.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.io; 2 | 3 | import java.io.DataOutput; 4 | import java.io.IOException; 5 | import java.io.OutputStream; 6 | 7 | class DataOutputOutputStreamAdapter extends OutputStream { 8 | private final DataOutput dataOutput; 9 | 10 | DataOutputOutputStreamAdapter(final DataOutput dataOutput) { 11 | this.dataOutput = dataOutput; 12 | } 13 | 14 | @Override 15 | public void write(final int b) throws IOException { 16 | dataOutput.write(b); 17 | } 18 | 19 | @Override 20 | public void write(final byte[] b) throws IOException { 21 | dataOutput.write(b); 22 | } 23 | 24 | @Override 25 | public void write(final byte[] b, final int off, final int len) throws IOException { 26 | dataOutput.write(b, off, len); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/io/MongoWritableTypes.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.io; 2 | 3 | // CHECKSTYLE:OFF 4 | public interface MongoWritableTypes { 5 | int BSON_WRITABLE = 0; 6 | int MONGO_UPDATE_WRITABLE = 1; 7 | } 8 | // CHECKSTYLE:ON 9 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/BSONFileOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop.mapred; 18 | 19 | import com.mongodb.hadoop.mapred.output.BSONFileRecordWriter; 20 | import com.mongodb.hadoop.splitter.BSONSplitter; 21 | import com.mongodb.hadoop.util.MongoConfigUtil; 22 | import org.apache.commons.logging.Log; 23 | import org.apache.commons.logging.LogFactory; 24 | import org.apache.hadoop.fs.FSDataOutputStream; 25 | import org.apache.hadoop.fs.FileSystem; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.mapred.FileOutputFormat; 28 | import org.apache.hadoop.mapred.JobConf; 29 | import org.apache.hadoop.mapred.RecordWriter; 30 | import org.apache.hadoop.util.Progressable; 31 | 32 | import java.io.IOException; 33 | 34 | public class BSONFileOutputFormat extends FileOutputFormat { 35 | 36 | public RecordWriter getRecordWriter(final FileSystem ignored, final JobConf job, final String name, 37 | final Progressable progress) throws IOException { 38 | Path outPath = getDefaultWorkFile(job, name, ".bson"); 39 | LOG.info("output going into " + outPath); 40 | 41 | FileSystem fs = outPath.getFileSystem(job); 42 | FSDataOutputStream outFile = fs.create(outPath); 43 | 44 | FSDataOutputStream splitFile = null; 45 | if (MongoConfigUtil.getBSONOutputBuildSplits(job)) { 46 | Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits"); 47 | splitFile = fs.create(splitPath); 48 | } 49 | 50 | long splitSize = BSONSplitter.getSplitSize(job, null); 51 | 52 | return new BSONFileRecordWriter(outFile, splitFile, splitSize); 53 | } 54 | 55 | public static Path getDefaultWorkFile(final JobConf conf, final String name, final String extension) { 56 | return new Path(getWorkOutputPath(conf), getUniqueName(conf, name) + extension); 57 | } 58 | 59 | private static final Log LOG = LogFactory.getLog(BSONFileOutputFormat.class); 60 | } 61 | 62 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/MongoOutputFormat.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mongodb.hadoop.mapred; 18 | 19 | import com.mongodb.hadoop.mapred.output.MongoRecordWriter; 20 | import com.mongodb.hadoop.util.MongoConfigUtil; 21 | import org.apache.hadoop.fs.FileSystem; 22 | import org.apache.hadoop.mapred.JobConf; 23 | import org.apache.hadoop.mapred.OutputFormat; 24 | import org.apache.hadoop.mapred.RecordWriter; 25 | import org.apache.hadoop.util.Progressable; 26 | 27 | import java.io.IOException; 28 | 29 | @SuppressWarnings("deprecation") 30 | public class MongoOutputFormat implements OutputFormat { 31 | public MongoOutputFormat() { 32 | } 33 | 34 | @Override 35 | public void checkOutputSpecs(final FileSystem ignored, final JobConf job) throws IOException { 36 | if (MongoConfigUtil.getOutputURIs(job).isEmpty()) { 37 | throw new IOException("No output URI is specified. You must set mongo.output.uri."); 38 | } 39 | } 40 | 41 | @Override 42 | public RecordWriter getRecordWriter( 43 | final FileSystem ignored, final JobConf job, final String name, 44 | final Progressable progress) { 45 | return new MongoRecordWriter(job); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/input/BSONFileSplit.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.mapred.input; 2 | 3 | import org.apache.hadoop.fs.Path; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapred.FileSplit; 6 | 7 | import java.io.DataInput; 8 | import java.io.DataOutput; 9 | import java.io.IOException; 10 | 11 | public class BSONFileSplit extends FileSplit { 12 | 13 | // CHECKSTYLE:OFF 14 | protected String keyField = "_id"; 15 | // CHECKSTYLE:ON 16 | 17 | 18 | public BSONFileSplit(final Path file, final long start, final long 19 | length, final String[] hosts) { 20 | super(file, start, length, hosts); 21 | } 22 | 23 | public BSONFileSplit() { this(null, 0, 0, null); } 24 | 25 | public String getKeyField() { return keyField; } 26 | 27 | public void setKeyField(final String keyField) { 28 | this.keyField = keyField; 29 | } 30 | 31 | @Override 32 | public void write(final DataOutput out) throws IOException { 33 | super.write(out); 34 | Text.writeString(out, getKeyField()); 35 | } 36 | 37 | @Override 38 | public void readFields(final DataInput in) throws IOException { 39 | super.readFields(in); 40 | setKeyField(Text.readString(in)); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/output/BSONFileRecordWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mongodb.hadoop.mapred.output; 18 | 19 | import org.apache.hadoop.fs.FSDataOutputStream; 20 | import org.apache.hadoop.mapred.RecordWriter; 21 | import org.apache.hadoop.mapred.Reporter; 22 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 23 | 24 | import java.io.IOException; 25 | 26 | 27 | public class BSONFileRecordWriter extends com.mongodb.hadoop.output.BSONFileRecordWriter implements RecordWriter { 28 | 29 | public BSONFileRecordWriter(final FSDataOutputStream outFile, final FSDataOutputStream splitFile, final long splitSize) { 30 | super(outFile, splitFile, splitSize); 31 | } 32 | 33 | public void close(final Reporter reporter) throws IOException { 34 | this.close((TaskAttemptContext) null); 35 | } 36 | 37 | } 38 | 39 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/output/MongoOutputCommitter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | 18 | package com.mongodb.hadoop.mapred.output; 19 | 20 | import org.apache.hadoop.mapred.JobContext; 21 | import org.apache.hadoop.mapred.OutputCommitter; 22 | import org.apache.hadoop.mapred.TaskAttemptContext; 23 | 24 | import java.io.IOException; 25 | 26 | public class MongoOutputCommitter extends OutputCommitter { 27 | private final com.mongodb.hadoop.output.MongoOutputCommitter delegate; 28 | 29 | public MongoOutputCommitter() { 30 | delegate = new com.mongodb.hadoop.output.MongoOutputCommitter(); 31 | } 32 | 33 | @Override 34 | public void abortTask(final TaskAttemptContext taskContext) 35 | throws IOException { 36 | delegate.abortTask(taskContext); 37 | } 38 | 39 | @Override 40 | public void commitTask(final TaskAttemptContext taskContext) 41 | throws IOException { 42 | delegate.commitTask(taskContext); 43 | } 44 | 45 | @Override 46 | public boolean needsTaskCommit(final TaskAttemptContext taskContext) 47 | throws IOException { 48 | return delegate.needsTaskCommit(taskContext); 49 | } 50 | 51 | @Override 52 | public void setupJob(final JobContext jobContext) { 53 | delegate.setupJob(jobContext); 54 | } 55 | 56 | @Override 57 | public void setupTask(final TaskAttemptContext taskContext) { 58 | delegate.setupTask(taskContext); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/mapred/output/MongoRecordWriter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | 18 | package com.mongodb.hadoop.mapred.output; 19 | 20 | import com.mongodb.hadoop.util.CompatUtils; 21 | import com.mongodb.hadoop.util.MongoConfigUtil; 22 | import org.apache.hadoop.mapred.JobConf; 23 | import org.apache.hadoop.mapred.RecordWriter; 24 | import org.apache.hadoop.mapred.Reporter; 25 | 26 | public class MongoRecordWriter 27 | extends com.mongodb.hadoop.output.MongoRecordWriter 28 | implements RecordWriter { 29 | 30 | /** 31 | * Create a new MongoRecordWriter. 32 | * @param conf the job configuration 33 | */ 34 | public MongoRecordWriter(final JobConf conf) { 35 | super( 36 | MongoConfigUtil.getOutputCollection(conf), 37 | CompatUtils.getTaskAttemptContext(conf, conf.get("mapred.task.id"))); 38 | } 39 | 40 | @Override 41 | public void close(final Reporter reporter) { 42 | super.close(null); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/splitter/MongoSplitter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop.splitter; 18 | 19 | import com.mongodb.hadoop.input.MongoInputSplit; 20 | import com.mongodb.hadoop.util.MongoConfigUtil; 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.mapreduce.InputSplit; 23 | 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | 27 | public abstract class MongoSplitter { 28 | 29 | private Configuration configuration; 30 | 31 | public MongoSplitter() { 32 | } 33 | 34 | public MongoSplitter(final Configuration configuration) { 35 | setConfiguration(configuration); 36 | } 37 | 38 | public void setConfiguration(final Configuration conf) { 39 | configuration = conf; 40 | } 41 | 42 | public abstract List calculateSplits() throws SplitFailedException; 43 | 44 | public Configuration getConfiguration() { 45 | return configuration; 46 | } 47 | 48 | /** 49 | * Get a list of nonempty input splits only. 
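     * Each split's cursor is probed with hasNext(): splits whose cursors
     * return no documents are dropped, and the MongoDB client backing each
     * empty cursor is closed via MongoConfigUtil.close().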
50 | * 51 | * @param splits a list of input splits 52 | * @return a new list of nonempty input splits 53 | */ 54 | public static List filterEmptySplits( 55 | final List splits) { 56 | List results = new ArrayList(splits.size()); 57 | for (InputSplit split : splits) { 58 | MongoInputSplit mis = (MongoInputSplit) split; 59 | if (mis.getCursor().hasNext()) { 60 | results.add(mis); 61 | } else { 62 | MongoConfigUtil.close( 63 | mis.getCursor().getCollection().getDB().getMongo()); 64 | } 65 | } 66 | return results; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/splitter/SingleMongoSplitter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010-2013 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop.splitter; 18 | 19 | import com.mongodb.MongoClientURI; 20 | import com.mongodb.hadoop.input.MongoInputSplit; 21 | import com.mongodb.hadoop.util.MongoConfigUtil; 22 | import org.apache.commons.logging.Log; 23 | import org.apache.commons.logging.LogFactory; 24 | import org.apache.hadoop.conf.Configuration; 25 | import org.apache.hadoop.mapreduce.InputSplit; 26 | 27 | import java.util.Collections; 28 | import java.util.List; 29 | 30 | import static java.lang.String.format; 31 | 32 | /* This implementation of MongoSplitter does not actually 33 | * do any splitting, it will just create a single input split 34 | * which represents the entire data set within a collection. 35 | */ 36 | public class SingleMongoSplitter extends MongoCollectionSplitter { 37 | 38 | private static final Log LOG = LogFactory.getLog(SingleMongoSplitter.class); 39 | 40 | //Create a single split which consists of a single 41 | //a query over the entire collection. 
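    // In practice a splitter implementation like this one is usually not
    // instantiated directly: MongoInputFormat obtains a splitter from
    // MongoSplitterFactory, driven by the job configuration (for example a
    // "mongo.splitter.class"-style property; the exact property name is an
    // assumption here), hands it the Configuration via setConfiguration(),
    // and then asks it to calculateSplits().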
42 | 43 | 44 | public SingleMongoSplitter() { 45 | } 46 | 47 | public SingleMongoSplitter(final Configuration conf) { 48 | super(conf); 49 | } 50 | 51 | @Override 52 | public List calculateSplits() { 53 | if (LOG.isDebugEnabled()) { 54 | MongoClientURI inputURI = 55 | MongoConfigUtil.getInputURI(getConfiguration()); 56 | LOG.debug(format("SingleMongoSplitter calculating splits for namespace: %s.%s; hosts: %s", 57 | inputURI.getDatabase(), inputURI.getCollection(), inputURI.getHosts())); 58 | } 59 | return Collections.singletonList( 60 | (InputSplit) new MongoInputSplit(getConfiguration())); 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/splitter/SplitFailedException.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.splitter; 2 | 3 | public class SplitFailedException extends Exception { 4 | 5 | public SplitFailedException(final String message) { 6 | super(message); 7 | } 8 | 9 | public SplitFailedException(final String message, final Throwable cause) { 10 | super(message, cause); 11 | } 12 | } 13 | 14 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/util/MongoPathRetriever.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.util; 2 | 3 | import org.bson.BSONObject; 4 | 5 | import java.util.List; 6 | 7 | /** 8 | * Utility class providing a mechanism for retrieving data nested within 9 | * a MongoDB document. 10 | */ 11 | public final class MongoPathRetriever { 12 | 13 | private MongoPathRetriever() {} 14 | 15 | /** 16 | * Returns the Object stored at a given path within a MongoDB 17 | * document. Returns null if the path is not found. 18 | * 19 | * @param document MongoDB document in which to search. 20 | * @param path Dot-separated path to look up. 21 | * @return the Object stored at the path within the document. 22 | */ 23 | public static Object get(final BSONObject document, final String path) { 24 | String[] parts = path.split("\\."); 25 | Object o = document; 26 | for (String part : parts) { 27 | if (null == o) { 28 | return null; 29 | } else if (o instanceof List) { 30 | try { 31 | int index = Integer.parseInt(part); 32 | if (((List) o).size() > index && index >= 0) { 33 | o = ((List) o).get(index); 34 | } else { 35 | return null; 36 | } 37 | } catch (NumberFormatException e) { 38 | return null; 39 | } 40 | } else if (o instanceof BSONObject) { 41 | o = ((BSONObject) o).get(part); 42 | } else { 43 | // Hit a leaf before finding the key we were looking for. 44 | return null; 45 | } 46 | } 47 | return o; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /core/src/main/java/com/mongodb/hadoop/util/SplitFriendlyDBCallback.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (c) 2010, 2011 10gen, Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 5 | * the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 10 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 11 | * specific language governing permissions and limitations under the License. 12 | */ 13 | 14 | package com.mongodb.hadoop.util; 15 | 16 | import com.mongodb.DBCallback; 17 | import com.mongodb.DBCallbackFactory; 18 | import com.mongodb.DBCollection; 19 | import com.mongodb.DefaultDBCallback; 20 | 21 | public class SplitFriendlyDBCallback extends DefaultDBCallback { 22 | 23 | static final class MinKey { 24 | } 25 | 26 | static final class MaxKey { 27 | } 28 | 29 | static class SplitFriendlyFactory implements DBCallbackFactory { 30 | public DBCallback create(final DBCollection collection) { 31 | return new DefaultDBCallback(collection); 32 | } 33 | } 34 | 35 | public static final DBCallbackFactory FACTORY = new SplitFriendlyFactory(); 36 | public static final MinKey MIN_KEY_TYPE = new MinKey(); 37 | public static final MaxKey MAX_KEY_TYPE = new MaxKey(); 38 | 39 | public SplitFriendlyDBCallback(final DBCollection coll) { 40 | super(coll); 41 | } 42 | 43 | @Override 44 | public void gotMinKey(final String name) { 45 | cur().put(name, MAX_KEY_TYPE); 46 | } 47 | 48 | @Override 49 | public void gotMaxKey(final String name) { 50 | cur().put(name, MAX_KEY_TYPE); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/BSONFileInputFormatTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import com.mongodb.hadoop.mapred.BSONFileInputFormat; 5 | import org.apache.hadoop.io.NullWritable; 6 | import org.apache.hadoop.mapred.FileSplit; 7 | import org.apache.hadoop.mapred.JobConf; 8 | import org.apache.hadoop.mapred.RecordReader; 9 | import org.junit.Test; 10 | 11 | import java.io.File; 12 | import java.io.IOException; 13 | 14 | import static com.mongodb.hadoop.testutils.BaseHadoopTest.EXAMPLE_DATA_HOME; 15 | import static org.junit.Assert.assertEquals; 16 | 17 | public class BSONFileInputFormatTest { 18 | 19 | @Test 20 | public void enronEmails() throws IOException { 21 | BSONFileInputFormat inputFormat = new BSONFileInputFormat(); 22 | JobConf job = new JobConf(); 23 | String inputDirectory = 24 | new File(EXAMPLE_DATA_HOME, "/dump/enron_mail/messages.bson") 25 | .getAbsoluteFile().toURI().toString(); 26 | // Hadoop 2.X 27 | job.set("mapreduce.input.fileinputformat.inputdir", inputDirectory); 28 | // Hadoop 1.2.X 29 | job.set("mapred.input.dir", inputDirectory); 30 | FileSplit[] splits = inputFormat.getSplits(job, 5); 31 | int count = 0; 32 | BSONWritable writable = new BSONWritable(); 33 | for (FileSplit split : splits) { 34 | RecordReader recordReader = inputFormat.getRecordReader(split, job, null); 35 | while (recordReader.next(null, writable)) { 36 | count++; 37 | } 38 | } 39 | assertEquals("There are 501513 messages in the enron corpus", 501513, count); 40 | } 41 | } -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/HadoopVersionFilter.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop; 2 | 3 | import com.mongodb.hadoop.testutils.BaseHadoopTest; 4 | 5 | import java.io.File; 6 | import java.io.FileFilter; 7 | 8 | public class HadoopVersionFilter implements FileFilter { 9 | private final boolean findTestJar; 10 | private static final String PROD_FORMAT = String.format("-%s.jar", BaseHadoopTest.PROJECT_VERSION); 11 | private 
static final String TEST_FORMAT = String.format("%s-tests.jar", BaseHadoopTest.PROJECT_VERSION); 12 | 13 | public HadoopVersionFilter() { 14 | this(false); 15 | } 16 | 17 | public HadoopVersionFilter(final boolean findTestJar) { 18 | this.findTestJar = findTestJar; 19 | } 20 | 21 | @Override 22 | public boolean accept(final File pathname) { 23 | return findTestJar ? pathname.getName().endsWith(TEST_FORMAT) : pathname.getName().endsWith(PROD_FORMAT); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/MongoConfigUnitTests.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mongodb.hadoop; 18 | 19 | // Hadoop 20 | 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.junit.Test; 23 | 24 | import static org.junit.Assert.assertNotNull; 25 | 26 | // JUnit 27 | 28 | /** 29 | * The mongo config unit tests. 30 | */ 31 | public final class MongoConfigUnitTests { 32 | 33 | @Test 34 | public void testConstructor() { 35 | assertNotNull(new MongoConfig(new Configuration(false))); 36 | } 37 | } 38 | 39 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/bookstore/BookstoreConfig.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.bookstore; 2 | 3 | import com.mongodb.hadoop.MongoConfig; 4 | import com.mongodb.hadoop.MongoInputFormat; 5 | import com.mongodb.hadoop.MongoOutputFormat; 6 | import com.mongodb.hadoop.io.BSONWritable; 7 | import com.mongodb.hadoop.io.MongoUpdateWritable; 8 | import com.mongodb.hadoop.util.MongoTool; 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.util.ToolRunner; 12 | 13 | public class BookstoreConfig extends MongoTool { 14 | public BookstoreConfig() { 15 | this(new Configuration()); 16 | } 17 | 18 | public BookstoreConfig(final Configuration configuration) { 19 | MongoConfig config = new MongoConfig(configuration); 20 | setConf(configuration); 21 | 22 | config.setInputFormat(MongoInputFormat.class); 23 | 24 | config.setMapper(TagsMapper.class); 25 | config.setMapperOutputKey(Text.class); 26 | config.setMapperOutputValue(BSONWritable.class); 27 | 28 | config.setReducer(TagsReducer.class); 29 | config.setOutputKey(Text.class); 30 | config.setOutputValue(MongoUpdateWritable.class); 31 | config.setOutputFormat(MongoOutputFormat.class); 32 | } 33 | 34 | public static void main(final String[] pArgs) throws Exception { 35 | System.exit(ToolRunner.run(new BookstoreConfig(), pArgs)); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/bookstore/TagsMapper.java: 
-------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.bookstore; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapred.JobConf; 6 | import org.apache.hadoop.mapred.OutputCollector; 7 | import org.apache.hadoop.mapred.Reporter; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.bson.BSONObject; 10 | import org.bson.types.BasicBSONList; 11 | 12 | import java.io.IOException; 13 | 14 | public class TagsMapper extends Mapper 15 | implements org.apache.hadoop.mapred.Mapper { 17 | 18 | private BSONWritable writable; 19 | 20 | public TagsMapper() { 21 | super(); 22 | writable = new BSONWritable(); 23 | } 24 | 25 | @Override 26 | protected void map(final Object key, final BSONObject value, final Context 27 | context) throws IOException, InterruptedException { 28 | BasicBSONList tags = (BasicBSONList) value.get("tags"); 29 | Text text = new Text(); 30 | value.removeField("tags"); 31 | for (Object tag : tags) { 32 | text.set((String) tag); 33 | writable.setDoc(value); 34 | context.write(text, writable); 35 | } 36 | } 37 | 38 | @Override 39 | public void map(final Object key, final BSONWritable value, final 40 | OutputCollector output, 41 | final Reporter reporter) throws IOException { 42 | BasicBSONList tags = (BasicBSONList) value.getDoc().get("tags"); 43 | Text text = new Text(); 44 | value.getDoc().removeField("tags"); 45 | for (Object tag : tags) { 46 | text.set((String) tag); 47 | output.collect(text, value); 48 | } 49 | } 50 | 51 | @Override 52 | public void configure(final JobConf job) { 53 | 54 | } 55 | 56 | @Override 57 | public void close() throws IOException { 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/bookstore/TagsReducer.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.bookstore; 2 | 3 | import com.mongodb.BasicDBObject; 4 | import com.mongodb.hadoop.io.BSONWritable; 5 | import com.mongodb.hadoop.io.MongoUpdateWritable; 6 | import org.apache.hadoop.io.NullWritable; 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.mapred.JobConf; 9 | import org.apache.hadoop.mapred.OutputCollector; 10 | import org.apache.hadoop.mapred.Reporter; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | import org.bson.BSONObject; 13 | import org.bson.BasicBSONObject; 14 | 15 | import java.io.IOException; 16 | import java.util.ArrayList; 17 | import java.util.Iterator; 18 | 19 | public class TagsReducer extends Reducer 20 | implements org.apache.hadoop.mapred.Reducer { 21 | 22 | private MongoUpdateWritable reduceResult; 23 | 24 | public TagsReducer() { 25 | super(); 26 | reduceResult = new MongoUpdateWritable(); 27 | } 28 | 29 | @Override 30 | protected void reduce(final Text key, final Iterable values, final Context context) 31 | throws IOException, InterruptedException { 32 | 33 | BasicDBObject query = new BasicDBObject("_id", key.toString()); 34 | ArrayList books = new ArrayList(); 35 | for (BSONWritable val : values) { 36 | books.add(val.getDoc()); 37 | } 38 | 39 | BasicBSONObject update = new BasicBSONObject("$set", new BasicBSONObject("books", books)); 40 | reduceResult.setQuery(query); 41 | reduceResult.setModifiers(update); 42 | context.write(null, reduceResult); 43 | } 44 | 45 | @Override 46 | public void reduce(final Text key, final Iterator values, final OutputCollector output, 47 | 
final Reporter reporter) throws IOException { 48 | BasicDBObject query = new BasicDBObject("_id", key.toString()); 49 | ArrayList books = new ArrayList(); 50 | while (values.hasNext()) { 51 | books.add(values.next().getDoc()); 52 | } 53 | 54 | BasicBSONObject update = new BasicBSONObject("$set", new BasicBSONObject("books", books)); 55 | reduceResult.setQuery(query); 56 | reduceResult.setModifiers(update); 57 | output.collect(null, reduceResult); 58 | } 59 | 60 | @Override 61 | public void close() throws IOException { 62 | } 63 | 64 | @Override 65 | public void configure(final JobConf job) { 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/io/MongoInputSplitTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.io; 2 | 3 | import com.mongodb.hadoop.input.MongoInputSplit; 4 | import com.mongodb.hadoop.util.MongoConfigUtil; 5 | import org.apache.hadoop.conf.Configuration; 6 | import org.junit.Test; 7 | 8 | import static junit.framework.TestCase.assertEquals; 9 | 10 | public class MongoInputSplitTest { 11 | 12 | @Test 13 | public void testConstructor() { 14 | Configuration conf = new Configuration(); 15 | MongoConfigUtil.setFields(conf, "{\"field\": 1}"); 16 | MongoConfigUtil.setAuthURI(conf, "mongodb://auth"); 17 | MongoConfigUtil.setInputURI(conf, "mongodb://input"); 18 | MongoConfigUtil.setInputKey(conf, "field"); 19 | MongoConfigUtil.setMaxSplitKey(conf, "{\"field\": 1e9}"); 20 | MongoConfigUtil.setMinSplitKey(conf, "{\"field\": -1e9}"); 21 | MongoConfigUtil.setNoTimeout(conf, true); 22 | MongoConfigUtil.setQuery(conf, "{\"foo\": 42}"); 23 | MongoConfigUtil.setSort(conf, "{\"foo\": -1}"); 24 | MongoConfigUtil.setSkip(conf, 10); 25 | 26 | MongoInputSplit mis = new MongoInputSplit(conf); 27 | 28 | assertEquals(MongoConfigUtil.getFields(conf), mis.getFields()); 29 | assertEquals(MongoConfigUtil.getAuthURI(conf), mis.getAuthURI()); 30 | assertEquals(MongoConfigUtil.getInputURI(conf), mis.getInputURI()); 31 | assertEquals(MongoConfigUtil.getInputKey(conf), mis.getKeyField()); 32 | assertEquals(MongoConfigUtil.getMaxSplitKey(conf), mis.getMax()); 33 | assertEquals(MongoConfigUtil.getMinSplitKey(conf), mis.getMin()); 34 | assertEquals(MongoConfigUtil.isNoTimeout(conf), mis.getNoTimeout()); 35 | assertEquals(MongoConfigUtil.getQuery(conf), mis.getQuery()); 36 | assertEquals(MongoConfigUtil.getSort(conf), mis.getSort()); 37 | assertEquals(MongoConfigUtil.getLimit(conf), (int) mis.getLimit()); 38 | assertEquals(MongoConfigUtil.getSkip(conf), (int) mis.getSkip()); 39 | 40 | MongoInputSplit mis2 = new MongoInputSplit(mis); 41 | assertEquals(mis, mis2); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/mapred/BSONFileInputFormatTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.mapred; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.NullWritable; 5 | import org.apache.hadoop.mapred.FileSplit; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.apache.hadoop.mapred.RecordReader; 8 | import org.junit.Test; 9 | 10 | import java.io.File; 11 | import java.io.IOException; 12 | 13 | import static com.mongodb.hadoop.testutils.BaseHadoopTest.EXAMPLE_DATA_HOME; 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class BSONFileInputFormatTest 
{ 17 | 18 | @Test 19 | public void enronEmails() throws IOException { 20 | BSONFileInputFormat inputFormat = new BSONFileInputFormat(); 21 | JobConf job = new JobConf(); 22 | String inputDirectory = 23 | new File(EXAMPLE_DATA_HOME, "/dump/enron_mail/messages.bson") 24 | .getAbsoluteFile().toURI().toString(); 25 | // Hadoop 2.X 26 | job.set("mapreduce.input.fileinputformat.inputdir", inputDirectory); 27 | // Hadoop 1.2.X 28 | job.set("mapred.input.dir", inputDirectory); 29 | FileSplit[] splits = inputFormat.getSplits(job, 5); 30 | int count = 0; 31 | BSONWritable writable = new BSONWritable(); 32 | for (FileSplit split : splits) { 33 | RecordReader recordReader = inputFormat.getRecordReader(split, job, null); 34 | while (recordReader.next(null, writable)) { 35 | count++; 36 | } 37 | } 38 | assertEquals("There are 501513 messages in the enron corpus", 501513, count); 39 | } 40 | } -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/splitter/BSONFileRecordReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.splitter; 2 | 3 | import com.mongodb.hadoop.input.BSONFileRecordReader; 4 | import com.mongodb.hadoop.input.BSONFileSplit; 5 | import org.apache.hadoop.fs.Path; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.bson.types.ObjectId; 8 | import org.junit.Test; 9 | 10 | import java.io.File; 11 | import java.net.URI; 12 | 13 | import static org.junit.Assert.assertEquals; 14 | import static org.junit.Assert.assertTrue; 15 | 16 | public class BSONFileRecordReaderTest { 17 | 18 | @Test 19 | public void testGetCurrentKey() throws Exception { 20 | URI path = BSONFileRecordReaderTest.class.getResource( 21 | "/bookstore-dump/inventory.bson").toURI(); 22 | File file = new File(path); 23 | 24 | // Default case: "_id" is used as inputKey. 25 | BSONFileRecordReader reader = new BSONFileRecordReader(); 26 | BSONFileSplit split = new BSONFileSplit(new Path(path), 0, 27 | file.length(), 28 | new String[0]); 29 | JobConf conf = new JobConf(); 30 | reader.init(split, conf); 31 | assertTrue(reader.nextKeyValue()); 32 | assertEquals(reader.getCurrentKey(), 33 | new ObjectId("4d2a6084c6237b412fcd5597")); 34 | 35 | // Use a nested field as inputKey. 36 | reader = new BSONFileRecordReader(); 37 | split = new BSONFileSplit(new Path(path), 0, 38 | file.length(), 39 | new String[0]); 40 | split.setKeyField("price.msrp"); 41 | reader.init(split, conf); 42 | assertTrue(reader.nextKeyValue()); 43 | assertEquals(reader.getCurrentKey(), 33); 44 | 45 | // Use a key within an array as the inputKey. 
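// A dotted index like "tags.0" selects the first element of the "tags" array,
// so the key read below resolves to the string "html5".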
46 | reader = new BSONFileRecordReader(); 47 | split = new BSONFileSplit(new Path(path), 0, 48 | file.length(), 49 | new String[0]); 50 | split.setKeyField("tags.0"); 51 | reader.init(split, conf); 52 | assertTrue(reader.nextKeyValue()); 53 | assertEquals(reader.getCurrentKey(), "html5"); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/splitter/MongoRecordReaderTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.splitter; 2 | 3 | import com.mongodb.BasicDBList; 4 | import com.mongodb.BasicDBObject; 5 | import com.mongodb.DBCollection; 6 | import com.mongodb.MongoClient; 7 | import com.mongodb.MongoClientURI; 8 | import com.mongodb.hadoop.input.MongoInputSplit; 9 | import com.mongodb.hadoop.input.MongoRecordReader; 10 | import com.mongodb.hadoop.util.MongoClientURIBuilder; 11 | import org.bson.BasicBSONObject; 12 | import org.junit.Test; 13 | 14 | import static org.junit.Assert.assertEquals; 15 | import static org.junit.Assert.assertTrue; 16 | 17 | public class MongoRecordReaderTest { 18 | 19 | @Test 20 | public void testGetCurrentKey() throws Exception { 21 | MongoClient client = new MongoClient("localhost", 27017); 22 | MongoClientURI uri = new MongoClientURIBuilder() 23 | .collection("mongo_hadoop", "mongo_record_reader_test") 24 | .build(); 25 | DBCollection collection = client.getDB(uri.getDatabase()).getCollection(uri.getCollection()); 26 | collection.drop(); 27 | BasicDBList colors = new BasicDBList(){ 28 | { 29 | add(new BasicBSONObject("red", 255)); 30 | add(new BasicBSONObject("blue", 255)); 31 | add(new BasicBSONObject("green", 0)); 32 | } 33 | }; 34 | collection.insert( 35 | new BasicDBObject("_id", 0) 36 | .append("address", 37 | new BasicDBObject("street", "foo street")) 38 | .append("colors", colors) 39 | ); 40 | 41 | // Default case: "_id" is used as inputKey. 42 | MongoInputSplit split = new MongoInputSplit(); 43 | split.setInputURI(uri); 44 | MongoRecordReader reader = new MongoRecordReader(split); 45 | assertTrue(reader.nextKeyValue()); 46 | assertEquals(reader.getCurrentKey(), 0); 47 | 48 | // Use a nested field as inputKey. 49 | split = new MongoInputSplit(); 50 | split.setInputURI(uri); 51 | split.setKeyField("address.street"); 52 | reader = new MongoRecordReader(split); 53 | assertTrue(reader.nextKeyValue()); 54 | assertEquals(reader.getCurrentKey(), "foo street"); 55 | 56 | // Use a key within an array as the inputKey. 57 | split = new MongoInputSplit(); 58 | split.setInputURI(uri); 59 | split.setKeyField("colors.1"); 60 | reader = new MongoRecordReader(split); 61 | assertTrue(reader.nextKeyValue()); 62 | assertEquals(reader.getCurrentKey(), new BasicBSONObject("blue", 255)); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/splitter/MongoSplitterTestUtils.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.splitter; 2 | 3 | import com.mongodb.DBObject; 4 | import com.mongodb.hadoop.input.MongoInputSplit; 5 | import org.apache.hadoop.mapreduce.InputSplit; 6 | 7 | import java.util.List; 8 | 9 | import static org.junit.Assert.assertEquals; 10 | 11 | /** 12 | * Utilities for testing Splitter classes that produce MongoInputSplits. 
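 * Note: assertSplitsCount re-opens each split's cursor, so it assumes a reachable
 * mongod that still contains the data covered by the splits.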
13 | */ 14 | public final class MongoSplitterTestUtils { 15 | 16 | private MongoSplitterTestUtils() {} 17 | 18 | /** 19 | * Assert that a split has the expected bounds using a range query. 20 | * @param split an instance of MongoInputSplit 21 | * @param min the min bound 22 | * @param max the max bound 23 | */ 24 | public static void assertSplitRange( 25 | final MongoInputSplit split, final Integer min, final Integer max) { 26 | DBObject queryObj = (DBObject) split.getQuery().get("_id"); 27 | assertEquals(min, queryObj.get("$gte")); 28 | assertEquals(max, queryObj.get("$lt")); 29 | } 30 | 31 | /** 32 | * Assert that a list of splits has the expected overall count. 33 | * @param expected the expected count 34 | * @param splits a list of MongoInputSplits 35 | */ 36 | public static void assertSplitsCount( 37 | final long expected, final List splits) { 38 | int splitTotal = 0; 39 | for (InputSplit split : splits) { 40 | // Cursors have been closed; create a copy of the MongoInputSplit. 41 | MongoInputSplit mis = new MongoInputSplit((MongoInputSplit) split); 42 | // Query doesn't play nice with min/max, so use itcount for test. 43 | splitTotal += mis.getCursor().itcount(); 44 | } 45 | assertEquals(expected, splitTotal); 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/splitter/ShardChunkMongoSplitterTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.splitter; 2 | 3 | import com.mongodb.BasicDBObjectBuilder; 4 | import com.mongodb.DBObject; 5 | import com.mongodb.MongoClientURI; 6 | import com.mongodb.hadoop.testutils.BaseHadoopTest; 7 | import com.mongodb.hadoop.util.MongoConfigUtil; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.mapreduce.InputSplit; 10 | import org.bson.types.MaxKey; 11 | import org.bson.types.MinKey; 12 | import org.junit.Test; 13 | 14 | import java.io.IOException; 15 | import java.util.ArrayList; 16 | import java.util.Arrays; 17 | import java.util.HashMap; 18 | import java.util.List; 19 | import java.util.Map; 20 | 21 | import static org.junit.Assert.assertEquals; 22 | 23 | public class ShardChunkMongoSplitterTest extends BaseHadoopTest { 24 | 25 | private ShardChunkMongoSplitter splitter = new ShardChunkMongoSplitter(); 26 | 27 | private DBObject createChunk(final String key, final Object min, final Object max, final String shardName) { 28 | return new BasicDBObjectBuilder() 29 | .push("min").add(key, min).pop() 30 | .push("max").add(key, max).pop() 31 | .append("shard", shardName).get(); 32 | } 33 | 34 | @Test 35 | public void testSplitPreferredLocations() 36 | throws SplitFailedException, IOException, InterruptedException { 37 | // Create list of chunks. 38 | List chunksList = new ArrayList(){{ 39 | add(createChunk("i", new MinKey(), 500, "sh01")); 40 | add(createChunk("i", 500, new MaxKey(), "sh02")); 41 | }}; 42 | // Create shards map. 
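// Each shard name maps to the hosts backing that shard; the splitter is expected to
// pick, for every chunk, a configured mongos co-located with the chunk's shard
// (verified by the location assertions below).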
43 | Map<String, List<String>> shardsMap = new HashMap<String, List<String>>() {{ 44 | put("sh01", Arrays.asList("mongo.sh01.dc1:27017", "mongo.sh01.dc2:27017")); 45 | put("sh02", Arrays.asList("mongo.sh02.dc1:27027", "mongo.sh02.dc2:27027")); 46 | }}; 47 | 48 | Configuration conf = new Configuration(); 49 | MongoConfigUtil.setInputMongosHosts( 50 | conf, Arrays.asList("mongo.sh01.dc1:27018", "mongo.sh02.dc2:27018")); 51 | MongoConfigUtil.setInputURI( 52 | conf, new MongoClientURI("mongodb://mongo.dc1:27018,mongo.dc2:27018/hadoop.test")); 53 | splitter.setConfiguration(conf); 54 | 55 | List<InputSplit> splits = splitter.calculateSplitsFromChunks(chunksList, shardsMap); 56 | assertEquals("mongo.sh01.dc1:27018", splits.get(0).getLocations()[0]); 57 | assertEquals("mongo.sh02.dc2:27018", splits.get(1).getLocations()[0]); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /core/src/test/java/com/mongodb/hadoop/util/MongoConfigUtilTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.util; 2 | 3 | import com.mongodb.MongoClientURI; 4 | import org.apache.hadoop.conf.Configuration; 5 | import org.junit.Test; 6 | 7 | import java.util.List; 8 | 9 | import static org.junit.Assert.assertEquals; 10 | 11 | public class MongoConfigUtilTest { 12 | 13 | private void assertSameURIs( 14 | final String[] expected, final List<MongoClientURI> actual) { 15 | assertEquals(expected.length, actual.size()); 16 | for (int i = 0; i < expected.length; ++i) { 17 | assertEquals(expected[i], actual.get(i).getURI()); 18 | } 19 | } 20 | 21 | @Test 22 | public void testGetMongoURIs() { 23 | Configuration conf = new Configuration(); 24 | String[] connStrings = new String[] { 25 | "mongodb://rshost1:10000,rshost2:10001/foo.bar?replicaSet=rs", 26 | "mongodb://standalone:27017/db.collection" 27 | }; 28 | 29 | // Separated by ", " 30 | conf.set( 31 | MongoConfigUtil.INPUT_URI, 32 | connStrings[0] + ", " + connStrings[1]); 33 | List<MongoClientURI> uris = MongoConfigUtil.getMongoURIs( 34 | conf, MongoConfigUtil.INPUT_URI); 35 | assertSameURIs(connStrings, uris); 36 | 37 | // No delimiter 38 | conf.set(MongoConfigUtil.INPUT_URI, connStrings[0] + connStrings[1]); 39 | uris = MongoConfigUtil.getMongoURIs(conf, MongoConfigUtil.INPUT_URI); 40 | assertSameURIs(connStrings, uris); 41 | 42 | // No value set 43 | uris = MongoConfigUtil.getMongoURIs(conf, "this key does not exist"); 44 | assertEquals(0, uris.size()); 45 | 46 | // Only one input URI.
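// A single URI with no delimiter should come back as the only element, unchanged.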
47 | String connString = connStrings[1]; 48 | conf.set(MongoConfigUtil.INPUT_URI, connString); 49 | uris = MongoConfigUtil.getMongoURIs(conf, MongoConfigUtil.INPUT_URI); 50 | assertSameURIs(new String[] {connString}, uris); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /core/src/test/resources/bookstore-dump/inventory.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/core/src/test/resources/bookstore-dump/inventory.bson -------------------------------------------------------------------------------- /core/src/test/resources/bookstore-dump/orders.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/core/src/test/resources/bookstore-dump/orders.bson -------------------------------------------------------------------------------- /core/src/test/resources/bookstore-dump/publishers.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/core/src/test/resources/bookstore-dump/publishers.bson -------------------------------------------------------------------------------- /core/src/test/resources/bookstore-dump/system.indexes.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/core/src/test/resources/bookstore-dump/system.indexes.bson -------------------------------------------------------------------------------- /examples/elastic-mapreduce/emr-bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | wget -P /home/hadoop/lib http://central.maven.org/maven2/org/mongodb/mongo-java-driver/2.11.1/mongo-java-driver-2.11.1.jar 4 | 5 | # Edit this path to point to the location of the jar you're using. 
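# For example, a self-hosted build might be fetched like this (bucket and jar name
# are placeholders, not a real location):
#   wget -P /home/hadoop/lib https://s3.amazonaws.com/YOUR_BUCKET/mongo-hadoop-core.jar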
6 | wget -P /home/hadoop/lib https://s3.amazonaws.com/mongo-hadoop-code/mongo-hadoop-core_1.1.2-1.1.0.jar 7 | -------------------------------------------------------------------------------- /examples/elastic-mapreduce/run_emr_job.sh: -------------------------------------------------------------------------------- 1 | 2 | ~/projects/elastic-mapreduce-ruby/elastic-mapreduce --create --jobflow ENRON000 \ 3 | --instance-type m1.xlarge \ 4 | --bootstrap-action s3://$S3_BUCKET/emr-bootstrap.sh \ 5 | --log-uri s3://$S3_BUCKET/enron_logs \ 6 | --jar s3://$S3_BUCKET/enron-example_1.1.2-1.1.0.jar \ 7 | --arg -D --arg mongo.job.input.format=com.mongodb.hadoop.BSONFileInputFormat \ 8 | --arg -D --arg mapred.input.dir=s3n://mongo-test-data/messages.bson \ 9 | --arg -D --arg mongo.job.mapper=com.mongodb.hadoop.examples.enron.EnronMailMapper \ 10 | --arg -D --arg mongo.job.output.key=com.mongodb.hadoop.examples.enron.MailPair \ 11 | --arg -D --arg mongo.job.output.value=org.apache.hadoop.io.IntWritable \ 12 | --arg -D --arg mongo.job.partitioner= \ 13 | --arg -D --arg mongo.job.reducer=com.mongodb.hadoop.examples.enron.EnronMailReducer \ 14 | --arg -D --arg mongo.job.sort_comparator= \ 15 | --arg -D --arg mongo.job.background= \ 16 | --arg -D --arg mapred.output.dir=s3n://$S3_BUCKET/BSON_OUT \ 17 | --arg -D --arg mongo.job.output.format=com.mongodb.hadoop.BSONFileOutputFormat \ 18 | --arg -D --arg mapred.child.java.opts=-Xmx2048m 19 | #--arg -D --arg mapred.task.profile=true \ 20 | -------------------------------------------------------------------------------- /examples/elastic-mapreduce/update_s3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #Take the enron example jars and put them into an S3 bucket. 4 | HERE="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | 6 | s3cp $HERE/emr-bootstrap.sh s3://$S3_BUCKET/emr-bootstrap.sh 7 | s3mod s3://$S3_BUCKET/emr-bootstrap.sh public-read 8 | s3cp $HERE/../../core/target/mongo-hadoop-core_1.1.2-1.1.0.jar s3://$S3_BUCKET/mongo-hadoop-core_1.1.2-1.1.0.jar 9 | s3mod s3://$S3_BUCKET/mongo-hadoop-core_1.1.2-1.1.0.jar public-read 10 | s3cp $HERE/../enron/target/enron-example_1.1.2-1.1.0.jar s3://$S3_BUCKET/enron-example_1.1.2-1.1.0.jar 11 | s3mod s3://$S3_BUCKET/enron-example_1.1.2-1.1.0.jar public-read 12 | -------------------------------------------------------------------------------- /examples/enron/hive/hive_enron.q: -------------------------------------------------------------------------------- 1 | -- Hive doesn't allow hyphens in field names 2 | 3 | -- This hive script takes in the emails from Enron and 4 | -- counts the numbers exchanged between each pair of people 5 | 6 | -- Get the headers struct, which contains the "from" and "to". 
7 | -- except the words "from", "to", and "date" are reserved in Hive 8 | DROP TABLE raw; 9 | 10 | CREATE EXTERNAL TABLE raw( 11 | h STRUCT<hivefrom:STRING, hiveto:STRING> 12 | ) 13 | ROW FORMAT SERDE "com.mongodb.hadoop.hive.BSONSerDe" 14 | WITH SERDEPROPERTIES("mongo.columns.mapping"="{'h.hivefrom':'headers.From', 15 | 'h.hiveto':'headers.To'}") 16 | STORED AS INPUTFORMAT "com.mongodb.hadoop.mapred.BSONFileInputFormat" 17 | OUTPUTFORMAT "com.mongodb.hadoop.hive.output.HiveBSONFileOutputFormat" 18 | LOCATION '${INPUT}'; 19 | 20 | 21 | DROP TABLE send_recip; 22 | CREATE TABLE send_recip ( 23 | f STRING, 24 | t_array ARRAY<STRING> 25 | ); 26 | 27 | -- Strip the white space from the "hiveto" string 28 | -- Then split the comma delimited string into an array of strings 29 | INSERT OVERWRITE TABLE send_recip 30 | SELECT 31 | h.hivefrom AS f, 32 | split(h.hiveto, "\\s*,\\s*") 33 | AS t_array 34 | FROM raw 35 | WHERE h IS NOT NULL 36 | AND h.hiveto IS NOT NULL; 37 | 38 | 39 | DROP TABLE send_recip_explode; 40 | CREATE TABLE send_recip_explode ( 41 | f STRING, 42 | t STRING, 43 | num INT 44 | ); 45 | 46 | -- Explode the array so that every element in the array gets its 47 | -- own row. Then group by the unique "f" and "t" pair 48 | -- to find the number of emails between the sender and receiver 49 | INSERT OVERWRITE TABLE send_recip_explode 50 | SELECT 51 | f, 52 | t, 53 | count(1) AS num 54 | FROM send_recip 55 | LATERAL VIEW explode(t_array) tmpTable AS t 56 | GROUP BY f, t; 57 | 58 | 59 | DROP TABLE send_recip_counted; 60 | CREATE TABLE send_recip_counted ( 61 | id STRUCT< 62 | t : STRING, 63 | f : STRING 64 | >, 65 | count INT 66 | ) 67 | ROW FORMAT SERDE "com.mongodb.hadoop.hive.BSONSerDe" 68 | WITH SERDEPROPERTIES ("mongo.columns.mapping"="{'id':'_id'}") 69 | STORED AS INPUTFORMAT "com.mongodb.hadoop.mapred.BSONFileInputFormat" 70 | OUTPUTFORMAT "com.mongodb.hadoop.hive.output.HiveBSONFileOutputFormat" 71 | LOCATION '${OUTPUT}'; 72 | 73 | -- Final output with the correct format 74 | INSERT INTO TABLE send_recip_counted 75 | SELECT 76 | named_struct('t', t, 'f', f) AS id, 77 | num AS count 78 | FROM send_recip_explode; 79 | -------------------------------------------------------------------------------- /examples/enron/pig/pig_enron.pig: -------------------------------------------------------------------------------- 1 | -- Change these jar locations to point to the correct locations/version on your system.
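-- The paths and version suffixes below are examples only; substitute the MongoDB Java
-- driver and mongo-hadoop core/pig jars produced by your own build.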
2 | REGISTER /Users/mike/Downloads/mongo-2.10.1.jar; 3 | REGISTER ../core/target/mongo-hadoop-core_cdh4.3.0-1.1.0.jar 4 | REGISTER ../pig/target/mongo-hadoop-pig_cdh4.3.0-1.1.0.jar 5 | 6 | 7 | raw = LOAD 'file:///tmp/enron_mail/messages.bson' using com.mongodb.hadoop.pig.BSONLoader('','headers:[]') ; 8 | send_recip = FOREACH raw GENERATE $0#'From' as from, $0#'To' as to; 9 | send_recip_filtered = FILTER send_recip BY to IS NOT NULL; 10 | send_recip_split = FOREACH send_recip_filtered GENERATE from as from, FLATTEN(TOKENIZE(to)) as to; 11 | send_recip_split_trimmed = FOREACH send_recip_split GENERATE from as from, TRIM(to) as to; 12 | send_recip_grouped = GROUP send_recip_split_trimmed BY (from, to); 13 | send_recip_counted = FOREACH send_recip_grouped GENERATE group, COUNT($1) as count; 14 | STORE send_recip_counted INTO 'file:///tmp/enron_result.bson' using com.mongodb.hadoop.pig.BSONStorage; 15 | -------------------------------------------------------------------------------- /examples/enron/run_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Run this job via gradle from the root directory: ./gradlew enronEmails" -------------------------------------------------------------------------------- /examples/enron/src/main/java/com/mongodb/hadoop/examples/enron/EnronMail.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2011 10gen Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.mongodb.hadoop.examples.enron; 17 | 18 | 19 | import com.mongodb.hadoop.BSONFileInputFormat; 20 | import com.mongodb.hadoop.MongoConfig; 21 | import com.mongodb.hadoop.MongoOutputFormat; 22 | import com.mongodb.hadoop.util.MapredMongoConfigUtil; 23 | import com.mongodb.hadoop.util.MongoConfigUtil; 24 | import com.mongodb.hadoop.util.MongoTool; 25 | import org.apache.hadoop.conf.Configuration; 26 | import org.apache.hadoop.fs.Path; 27 | import org.apache.hadoop.io.IntWritable; 28 | import org.apache.hadoop.mapred.FileInputFormat; 29 | import org.apache.hadoop.mapred.JobConf; 30 | import org.apache.hadoop.util.ToolRunner; 31 | 32 | public class EnronMail extends MongoTool { 33 | public EnronMail() { 34 | JobConf conf = new JobConf(new Configuration()); 35 | if (MongoTool.isMapRedV1()) { 36 | MapredMongoConfigUtil.setInputFormat(conf, 37 | com.mongodb.hadoop.mapred.BSONFileInputFormat.class); 38 | MapredMongoConfigUtil.setOutputFormat(conf, 39 | com.mongodb.hadoop.mapred.MongoOutputFormat.class); 40 | } else { 41 | MongoConfigUtil.setInputFormat(conf, BSONFileInputFormat.class); 42 | MongoConfigUtil.setOutputFormat(conf, MongoOutputFormat.class); 43 | } 44 | FileInputFormat.addInputPath(conf, new Path("/messages")); 45 | MongoConfig config = new MongoConfig(conf); 46 | config.setInputKey("headers.From"); 47 | config.setMapper(EnronMailMapper.class); 48 | config.setReducer(EnronMailReducer.class); 49 | config.setMapperOutputKey(MailPair.class); 50 | config.setMapperOutputValue(IntWritable.class); 51 | config.setOutputKey(MailPair.class); 52 | config.setOutputValue(IntWritable.class); 53 | config.setOutputURI( 54 | "mongodb://localhost:27017/mongo_hadoop.message_pairs"); 55 | setConf(conf); 56 | } 57 | 58 | public static void main(final String[] pArgs) throws Exception { 59 | System.exit(ToolRunner.run(new EnronMail(), pArgs)); 60 | } 61 | } 62 | 63 | -------------------------------------------------------------------------------- /examples/enron/src/main/java/com/mongodb/hadoop/examples/enron/EnronMailMapper.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.enron; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.mapred.JobConf; 6 | import org.apache.hadoop.mapred.OutputCollector; 7 | import org.apache.hadoop.mapred.Reporter; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.bson.BSONObject; 10 | 11 | import java.io.IOException; 12 | 13 | public class EnronMailMapper extends Mapper 14 | implements org.apache.hadoop.mapred.Mapper { 15 | 16 | private final IntWritable intw; 17 | private final MailPair mp; 18 | 19 | public EnronMailMapper() { 20 | super(); 21 | intw = new IntWritable(1); 22 | mp = new MailPair(); 23 | } 24 | 25 | @Override 26 | public void map(final Object key, final BSONObject val, 27 | final Context context) 28 | throws IOException, InterruptedException { 29 | 30 | BSONObject headers = (BSONObject) val.get("headers"); 31 | String to = (String) headers.get("To"); 32 | if (null != to) { 33 | String[] recipients = to.split(","); 34 | for (final String recip1 : recipients) { 35 | String recip = recip1.trim(); 36 | if (recip.length() > 0) { 37 | mp.setFrom((String) key); 38 | mp.setTo(recip); 39 | context.write(mp, intw); 40 | } 41 | } 42 | } 43 | } 44 | 45 | @Override 46 | public void map(final Object key, final BSONWritable writable, final OutputCollector output, 47 | final Reporter reporter) 
throws IOException { 48 | BSONObject headers = (BSONObject) writable.getDoc().get("headers"); 49 | String to = (String) headers.get("To"); 50 | String from = (String) headers.get("From"); 51 | if (null != to) { 52 | String[] recipients = to.split(","); 53 | for (final String recip1 : recipients) { 54 | String recip = recip1.trim(); 55 | if (recip.length() > 0) { 56 | mp.setFrom(from); 57 | mp.setTo(recip); 58 | output.collect(mp, intw); 59 | } 60 | } 61 | } 62 | } 63 | 64 | @Override 65 | public void close() throws IOException { 66 | } 67 | 68 | @Override 69 | public void configure(final JobConf job) { 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /examples/enron/src/main/java/com/mongodb/hadoop/examples/enron/MailPair.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.enron; 2 | 3 | import org.apache.hadoop.io.WritableComparable; 4 | 5 | import java.io.DataInput; 6 | import java.io.DataOutput; 7 | import java.io.IOException; 8 | 9 | 10 | public class MailPair implements WritableComparable { 11 | private String from; 12 | private String to; 13 | 14 | public MailPair() { 15 | } 16 | 17 | public MailPair(final String from, final String to) { 18 | this.from = from; 19 | this.to = to; 20 | } 21 | 22 | public String getFrom() { 23 | return from; 24 | } 25 | 26 | public void setFrom(final String from) { 27 | this.from = from; 28 | } 29 | 30 | public String getTo() { 31 | return to; 32 | } 33 | 34 | public void setTo(final String to) { 35 | this.to = to; 36 | } 37 | 38 | public void readFields(final DataInput in) throws IOException { 39 | this.from = in.readUTF(); 40 | this.to = in.readUTF(); 41 | } 42 | 43 | public void write(final DataOutput out) throws IOException { 44 | out.writeUTF(this.from); 45 | out.writeUTF(this.to); 46 | } 47 | 48 | @Override 49 | public boolean equals(final Object o) { 50 | if (o instanceof MailPair) { 51 | MailPair mp = (MailPair) o; 52 | return from.equals(mp.from) && to.equals(mp.to); 53 | } 54 | return false; 55 | } 56 | 57 | @Override 58 | public int hashCode() { 59 | int result = from != null ? from.hashCode() : 0; 60 | result = 31 * result + (to != null ? 
to.hashCode() : 0); 61 | return result; 62 | } 63 | 64 | @Override 65 | public int compareTo(final Object o) { 66 | if (!(o instanceof MailPair)) { 67 | return -1; 68 | } 69 | MailPair mp = (MailPair) o; 70 | int first = from.compareTo(mp.from); 71 | if (first != 0) { 72 | return first; 73 | } 74 | int second = to.compareTo(mp.to); 75 | if (second != 0) { 76 | return second; 77 | } 78 | return 0; 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /examples/sensors/run_job.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | echo "Run this job via gradle from the root directory: ./gradlew sensorData" -------------------------------------------------------------------------------- /examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/DeviceMapper.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.Text; 5 | import org.apache.hadoop.mapred.JobConf; 6 | import org.apache.hadoop.mapred.OutputCollector; 7 | import org.apache.hadoop.mapred.Reporter; 8 | import org.apache.hadoop.mapreduce.Mapper; 9 | import org.bson.BSONObject; 10 | 11 | import java.io.IOException; 12 | 13 | public class DeviceMapper extends Mapper 14 | implements org.apache.hadoop.mapred.Mapper { 15 | 16 | private final Text keyText; 17 | private final Text valueText; 18 | 19 | public DeviceMapper() { 20 | super(); 21 | keyText = new Text(); 22 | valueText = new Text(); 23 | } 24 | 25 | @Override 26 | public void map(final Object key, final BSONObject val, final Context context) throws IOException, InterruptedException { 27 | String keyOut = (String) val.get("owner") + " " + (String) val.get("type"); 28 | keyText.set(keyOut); 29 | valueText.set(val.get("_id").toString()); 30 | context.write(keyText, valueText); 31 | } 32 | 33 | @Override 34 | public void map(final Object key, final BSONWritable value, final OutputCollector output, 35 | final Reporter reporter) throws IOException { 36 | BSONObject val = value.getDoc(); 37 | 38 | String keyOut = (String) val.get("owner") + " " + (String) val.get("type"); 39 | keyText.set(keyOut); 40 | valueText.set(val.get("_id").toString()); 41 | output.collect(keyText, valueText); 42 | } 43 | 44 | @Override 45 | public void close() throws IOException { 46 | } 47 | 48 | @Override 49 | public void configure(final JobConf job) { 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/DeviceReducer.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import com.mongodb.hadoop.io.MongoUpdateWritable; 4 | import org.apache.hadoop.io.NullWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.apache.hadoop.mapred.OutputCollector; 8 | import org.apache.hadoop.mapred.Reporter; 9 | import org.apache.hadoop.mapreduce.Reducer; 10 | import org.bson.BasicBSONObject; 11 | import org.bson.types.ObjectId; 12 | 13 | import java.io.IOException; 14 | import java.util.ArrayList; 15 | import java.util.Iterator; 16 | 17 | public class DeviceReducer extends Reducer 18 | implements org.apache.hadoop.mapred.Reducer { 19 | 20 | private MongoUpdateWritable reduceResult; 21 | 22 | public 
DeviceReducer() { 23 | super(); 24 | reduceResult = new MongoUpdateWritable(); 25 | } 26 | 27 | @Override 28 | public void reduce(final Text pKey, final Iterable pValues, final Context pContext) throws IOException, InterruptedException { 29 | BasicBSONObject query = new BasicBSONObject("_id", pKey.toString()); 30 | ArrayList devices = new ArrayList(); 31 | for (Text val : pValues) { 32 | devices.add(new ObjectId(val.toString())); 33 | } 34 | 35 | BasicBSONObject update = new BasicBSONObject("$pushAll", new BasicBSONObject("devices", devices)); 36 | reduceResult.setQuery(query); 37 | reduceResult.setModifiers(update); 38 | pContext.write(null, reduceResult); 39 | } 40 | 41 | @Override 42 | public void reduce(final Text key, final Iterator values, final OutputCollector output, 43 | final Reporter reporter) throws IOException { 44 | BasicBSONObject query = new BasicBSONObject("_id", key.toString()); 45 | ArrayList devices = new ArrayList(); 46 | while (values.hasNext()) { 47 | Text val = values.next(); 48 | devices.add(new ObjectId(val.toString())); 49 | } 50 | 51 | BasicBSONObject update = new BasicBSONObject("$pushAll", new BasicBSONObject("devices", devices)); 52 | reduceResult.setQuery(query); 53 | reduceResult.setModifiers(update); 54 | output.collect(null, reduceResult); 55 | } 56 | 57 | @Override 58 | public void close() throws IOException { 59 | } 60 | 61 | @Override 62 | public void configure(final JobConf job) { 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/Devices.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import com.mongodb.hadoop.MongoInputFormat; 4 | import com.mongodb.hadoop.MongoOutputFormat; 5 | import com.mongodb.hadoop.io.BSONWritable; 6 | import com.mongodb.hadoop.util.MapredMongoConfigUtil; 7 | import com.mongodb.hadoop.util.MongoConfigUtil; 8 | import com.mongodb.hadoop.util.MongoTool; 9 | import org.apache.hadoop.conf.Configuration; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.util.ToolRunner; 13 | 14 | import java.net.UnknownHostException; 15 | 16 | public class Devices extends MongoTool { 17 | 18 | public Devices() throws UnknownHostException { 19 | setConf(new Configuration()); 20 | 21 | if (MongoTool.isMapRedV1()) { 22 | MapredMongoConfigUtil.setInputFormat(getConf(), com.mongodb.hadoop.mapred.MongoInputFormat.class); 23 | MapredMongoConfigUtil.setOutputFormat(getConf(), com.mongodb.hadoop.mapred.MongoOutputFormat.class); 24 | } else { 25 | MongoConfigUtil.setInputFormat(getConf(), MongoInputFormat.class); 26 | MongoConfigUtil.setOutputFormat(getConf(), MongoOutputFormat.class); 27 | } 28 | 29 | MongoConfigUtil.setInputURI(getConf(), "mongodb://localhost:27017/mongo_hadoop.devices"); 30 | MongoConfigUtil.setOutputURI(getConf(), "mongodb://localhost:27017/mongo_hadoop.logs_aggregate"); 31 | 32 | MongoConfigUtil.setMapper(getConf(), DeviceMapper.class); 33 | MongoConfigUtil.setReducer(getConf(), DeviceReducer.class); 34 | MongoConfigUtil.setMapperOutputKey(getConf(), Text.class); 35 | MongoConfigUtil.setMapperOutputValue(getConf(), Text.class); 36 | MongoConfigUtil.setOutputKey(getConf(), IntWritable.class); 37 | MongoConfigUtil.setOutputValue(getConf(), BSONWritable.class); 38 | 39 | new SensorDataGenerator().run(); 40 | } 41 | 42 | public static void main(final String[] pArgs) 
throws Exception { 43 | System.exit(ToolRunner.run(new Devices(), pArgs)); 44 | } 45 | } -------------------------------------------------------------------------------- /examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/LogCombiner.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import org.apache.commons.logging.Log; 4 | import org.apache.commons.logging.LogFactory; 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapred.JobConf; 8 | import org.apache.hadoop.mapred.OutputCollector; 9 | import org.apache.hadoop.mapred.Reporter; 10 | import org.apache.hadoop.mapreduce.Reducer; 11 | 12 | import java.io.IOException; 13 | import java.util.Iterator; 14 | 15 | public class LogCombiner extends Reducer 16 | implements org.apache.hadoop.mapred.Reducer { 17 | 18 | private static final Log LOG = LogFactory.getLog(LogCombiner.class); 19 | 20 | @Override 21 | public void reduce(final Text pKey, final Iterable pValues, final Context pContext) 22 | throws IOException, InterruptedException { 23 | 24 | int count = 0; 25 | for (IntWritable val : pValues) { 26 | count += val.get(); 27 | } 28 | 29 | pContext.write(pKey, new IntWritable(count)); 30 | } 31 | 32 | @Override 33 | public void reduce(final Text key, final Iterator values, final OutputCollector output, 34 | final Reporter reporter) throws IOException { 35 | int count = 0; 36 | while (values.hasNext()) { 37 | count += values.next().get(); 38 | } 39 | 40 | output.collect(key, new IntWritable(count)); 41 | } 42 | 43 | @Override 44 | public void close() throws IOException { 45 | } 46 | 47 | @Override 48 | public void configure(final JobConf job) { 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/LogMapper.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.IntWritable; 5 | import org.apache.hadoop.io.Text; 6 | import org.apache.hadoop.mapred.JobConf; 7 | import org.apache.hadoop.mapred.OutputCollector; 8 | import org.apache.hadoop.mapred.Reporter; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | import org.bson.BSONObject; 11 | 12 | import java.io.IOException; 13 | 14 | public class LogMapper extends Mapper 15 | implements org.apache.hadoop.mapred.Mapper { 16 | 17 | private final Text keyText; 18 | private final IntWritable valueInt; 19 | 20 | public LogMapper() { 21 | super(); 22 | keyText = new Text(); 23 | valueInt = new IntWritable(1); 24 | } 25 | 26 | @Override 27 | public void map(final Object key, final BSONObject val, final Context context) throws IOException, InterruptedException { 28 | keyText.set(val.get("d_id").toString()); 29 | context.write(keyText, valueInt); 30 | } 31 | 32 | @Override 33 | public void map(final Object key, final BSONWritable value, final OutputCollector output, final Reporter reporter) 34 | throws IOException { 35 | keyText.set(value.getDoc().get("d_id").toString()); 36 | output.collect(keyText, valueInt); 37 | } 38 | 39 | @Override 40 | public void close() throws IOException { 41 | } 42 | 43 | @Override 44 | public void configure(final JobConf job) { 45 | } 46 | } 47 | 48 | -------------------------------------------------------------------------------- 
/examples/sensors/src/main/java/com/mongodb/hadoop/examples/sensors/Logs.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.examples.sensors; 2 | 3 | import com.mongodb.hadoop.MongoInputFormat; 4 | import com.mongodb.hadoop.MongoOutputFormat; 5 | import com.mongodb.hadoop.util.MapredMongoConfigUtil; 6 | import com.mongodb.hadoop.util.MongoConfigUtil; 7 | import com.mongodb.hadoop.util.MongoTool; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.io.IntWritable; 11 | import org.apache.hadoop.io.Text; 12 | import org.apache.hadoop.util.ToolRunner; 13 | 14 | import java.net.UnknownHostException; 15 | 16 | public class Logs extends MongoTool { 17 | 18 | public Logs() throws UnknownHostException { 19 | Configuration conf = new Configuration(); 20 | setConf(conf); 21 | boolean mrv1Job; 22 | try { 23 | FileSystem.class.getDeclaredField("DEFAULT_FS"); 24 | mrv1Job = false; 25 | } catch (NoSuchFieldException e) { 26 | mrv1Job = true; 27 | } 28 | if (mrv1Job) { 29 | MapredMongoConfigUtil.setInputFormat(getConf(), com.mongodb.hadoop.mapred.MongoInputFormat.class); 30 | MapredMongoConfigUtil.setOutputFormat(getConf(), com.mongodb.hadoop.mapred.MongoOutputFormat.class); 31 | } else { 32 | MongoConfigUtil.setInputFormat(getConf(), MongoInputFormat.class); 33 | MongoConfigUtil.setOutputFormat(getConf(), MongoOutputFormat.class); 34 | } 35 | 36 | 37 | MongoConfigUtil.setInputURI(getConf(), "mongodb://localhost:27017/mongo_hadoop.logs"); 38 | MongoConfigUtil.setOutputURI(getConf(), "mongodb://localhost:27017/mongo_hadoop.logs_aggregate"); 39 | 40 | MongoConfigUtil.setMapper(getConf(), LogMapper.class); 41 | MongoConfigUtil.setReducer(getConf(), LogReducer.class); 42 | MongoConfigUtil.setCombiner(getConf(), LogCombiner.class); 43 | 44 | MongoConfigUtil.setOutputKey(getConf(), Text.class); 45 | MongoConfigUtil.setOutputValue(getConf(), IntWritable.class); 46 | } 47 | 48 | public static void main(final String[] pArgs) throws Exception { 49 | System.exit(ToolRunner.run(new Logs(), pArgs)); 50 | } 51 | } -------------------------------------------------------------------------------- /examples/sensors/testdata_generator.js: -------------------------------------------------------------------------------- 1 | NUM_DEVICES = 1000; 2 | NUM_LOGS = NUM_DEVICES * 50 * 1000 3 | setVerboseShell(false); 4 | 5 | db.devices.remove() 6 | db.logs.remove() 7 | 8 | function getRandomInRange(from, to, fixed) { 9 | return (Math.random() * (to - from) + from).toFixed(fixed) * 1; 10 | } 11 | 12 | function getRandomString (len) { 13 | var possible = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; 14 | var randomString = ''; 15 | for (var i = 0; i < len; i++) { 16 | var randomPoz = Math.floor(Math.random() * possible.length); 17 | randomString += possible.substring(randomPoz,randomPoz+1); 18 | } 19 | return randomString; 20 | } 21 | 22 | function randomDate(start, end) { 23 | return new Date(start.getTime() + Math.random() * (end.getTime() - start.getTime())) 24 | } 25 | 26 | function choose(choices) { 27 | index = Math.floor(Math.random() * choices.length); 28 | return choices[index]; 29 | } 30 | 31 | function getRandomInt (min, max) { 32 | return Math.floor(Math.random() * (max - min) + min); 33 | } 34 | 35 | owners = [] 36 | for(var i=0;i<10;i++){ 37 | owners.push(getRandomString(10)); 38 | } 39 | 40 | models = [] 41 | for(var i=0;i<10;i++){ 42 | models.push(getRandomInt(10, 
20)); 43 | } 44 | 45 | types = ["temp", "humidity", "pressure", "sound", "light"] 46 | 47 | 48 | device_ids = [] 49 | // devices 50 | // 51 | for(var i=0;i params = new TreeMap(); 26 | params.put(MongoConfigUtil.INPUT_QUERY, "{_id:{$gt:{$date:883440000000}}}"); 27 | new StreamingJob() 28 | .params(params) 29 | .inputUris(getInputUri()) 30 | .outputUris(getOutputUri()) 31 | .execute(); 32 | 33 | DBCollection collection = getClient(getInputUri()).getDB("mongo_hadoop").getCollection("yield_historical.out"); 34 | assertEquals(14, collection.count()); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /examples/treasury_yield/src/test/resources/commons-logging.properties: -------------------------------------------------------------------------------- 1 | # commons-logging.properties 2 | # jdk handlers 3 | handlers=java.util.logging.ConsoleHandler, java.util.logging.FileHandler 4 | 5 | # default log level 6 | .level=DEBUG 7 | 8 | # Specific logger level 9 | #MyClassLogger.level=FINE 10 | 11 | # FileHandler options - can also be set to the ConsoleHandler 12 | # FileHandler level can be set to override the global level: 13 | #java.util.logging.FileHandler.level=WARN 14 | 15 | # log file name for the File Handler 16 | java.util.logging.FileHandler.pattern=/tmp/javalog%u.log 17 | 18 | # Specify the style of output (simple or xml) 19 | java.util.logging.ConsoleHandler.formatter=java.util.logging.SimpleFormatter 20 | java.util.logging.FileHandler.formatter=java.util.logging.SimpleFormatter 21 | 22 | # Optional - Limit the size of the file (in bytes) 23 | java.util.logging.FileHandler.limit=50000 24 | 25 | # Optional - The number of files to cycle through, by 26 | # appending an integer to the base file name: 27 | java.util.logging.FileHandler.count=1 -------------------------------------------------------------------------------- /examples/treasury_yield/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n -------------------------------------------------------------------------------- /examples/treasury_yield/src/test/resources/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Mon Mar 09 18:25:42 PDT 2015 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.2.1-all.zip 7 | -------------------------------------------------------------------------------- /gradlew.bat: 
-------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 12 | set DEFAULT_JVM_OPTS= 13 | 14 | set DIRNAME=%~dp0 15 | if "%DIRNAME%" == "" set DIRNAME=. 16 | set APP_BASE_NAME=%~n0 17 | set APP_HOME=%DIRNAME% 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windowz variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | if "%@eval[2+2]" == "4" goto 4NT_args 53 | 54 | :win9xME_args 55 | @rem Slurp the command line arguments. 56 | set CMD_LINE_ARGS= 57 | set _SKIP=2 58 | 59 | :win9xME_args_slurp 60 | if "x%~1" == "x" goto execute 61 | 62 | set CMD_LINE_ARGS=%* 63 | goto execute 64 | 65 | :4NT_args 66 | @rem Get arguments from the 4NT Shell from JP Software 67 | set CMD_LINE_ARGS=%$ 68 | 69 | :execute 70 | @rem Setup the command line 71 | 72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if "%ERRORLEVEL%"=="0" goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 
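rem (exit /b hands the code back to the calling script, whereas a bare exit would also close the hosting cmd.exe shell.)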
84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /hive/src/test/java/com/mongodb/hadoop/hive/HiveQueryTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.hive; 2 | 3 | import com.mongodb.MongoClient; 4 | import com.mongodb.client.MongoCollection; 5 | import org.bson.Document; 6 | import org.junit.After; 7 | import org.junit.Before; 8 | import org.junit.Test; 9 | 10 | import java.sql.SQLException; 11 | 12 | import static org.junit.Assert.assertEquals; 13 | 14 | public class HiveQueryTest extends HiveTest { 15 | 16 | private static MongoCollection coll; 17 | 18 | @Before 19 | public void setUp() { 20 | MongoClient client = new MongoClient("localhost:27017"); 21 | coll = client.getDatabase("mongo_hadoop").getCollection("hive_query"); 22 | for (int i = 0; i < 1000; ++i) { 23 | coll.insertOne(new Document("i", i).append("j", i % 5)); 24 | } 25 | } 26 | 27 | @After 28 | public void tearDown() { 29 | coll.drop(); 30 | dropTable("querytest"); 31 | } 32 | 33 | @Test 34 | public void testQueryPushdown() throws SQLException { 35 | execute( 36 | "CREATE EXTERNAL TABLE querytest (id STRING, i INT, j INT) " 37 | + "STORED BY \"com.mongodb.hadoop.hive.MongoStorageHandler\" " 38 | + "WITH SERDEPROPERTIES(\"mongo.columns.mapping\"=" 39 | + "'{\"id\":\"_id\"}') " 40 | + "TBLPROPERTIES(\"mongo.uri\"=" 41 | + "\"mongodb://localhost:27017/mongo_hadoop.hive_query\")"); 42 | Results results = query("SELECT * FROM querytest WHERE i > 20"); 43 | assertEquals(979, results.size()); 44 | } 45 | 46 | @Test 47 | public void testQueryPushdownWithQueryTable() throws SQLException { 48 | execute( 49 | "CREATE EXTERNAL TABLE querytest (id STRING, i INT, j INT) " 50 | + "STORED BY \"com.mongodb.hadoop.hive.MongoStorageHandler\" " 51 | + "WITH SERDEPROPERTIES(\"mongo.columns.mapping\"=" 52 | + "'{\"id\":\"_id\"}') " 53 | + "TBLPROPERTIES(\"mongo.uri\"=" 54 | + "\"mongodb://localhost:27017/mongo_hadoop.hive_query\"," 55 | + "\"mongo.input.query\"='{\"j\":0}')"); 56 | Results results = query("SELECT * FROM querytest WHERE i > 20"); 57 | assertEquals(195, results.size()); 58 | 59 | results = query("SELECT * from querytest WHERE j > 2"); 60 | assertEquals(0, results.size()); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /hive/src/test/java/com/mongodb/hadoop/hive/TablePropertiesTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.hive; 2 | 3 | import com.mongodb.MongoClient; 4 | import com.mongodb.MongoClientURI; 5 | import com.mongodb.client.MongoCollection; 6 | import org.bson.Document; 7 | import org.junit.After; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | import java.sql.SQLException; 12 | import java.util.ArrayList; 13 | 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class TablePropertiesTest extends HiveTest { 17 | 18 | private MongoCollection collection; 19 | 20 | @Before 21 | public void setUp() { 22 | MongoClientURI clientURI = new MongoClientURI( 23 | "mongodb://localhost:27017/mongo_hadoop.tabletest"); 24 | MongoClient client = new MongoClient(clientURI); 25 | 26 | // Seed some documents into MongoDB. 
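// Seeds 1000 documents with i = 0..999. hivetable.properties (referenced below via
// mongo.properties.path) sets mongo.input.query={"i": {"$mod": [2, 0]}}, so the Hive
// table only sees even values of i; that is why 490 rows match i >= 20 below.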
27 | collection = client 28 | .getDatabase(clientURI.getDatabase()) 29 | .getCollection(clientURI.getCollection()); 30 | ArrayList documents = new ArrayList(1000); 31 | for (int i = 0; i < 1000; ++i) { 32 | documents.add(new Document("i", i)); 33 | } 34 | collection.insertMany(documents); 35 | 36 | // Make sure table doesn't exist already. 37 | dropTable("props_file_test"); 38 | } 39 | 40 | @After 41 | public void tearDown() { 42 | // Tear down collection. 43 | collection.drop(); 44 | 45 | // Drop Hive table. 46 | dropTable("props_file_test"); 47 | } 48 | 49 | @Test 50 | public void testPropertiesFile() throws SQLException { 51 | // Create the table. 52 | execute( 53 | "CREATE TABLE props_file_test" 54 | + " (id STRING, i INT)" 55 | + " STORED BY 'com.mongodb.hadoop.hive.MongoStorageHandler'" 56 | + " WITH SERDEPROPERTIES('mongo.columns.mapping'='{\"id\":\"_id\"}')" 57 | + " TBLPROPERTIES('mongo.properties.path'='" 58 | + getPath("hivetable.properties") + "')"); 59 | 60 | // Read and write some data through the table. 61 | Results results = query("SELECT i FROM props_file_test WHERE i >= 20"); 62 | assertEquals(490, results.size()); 63 | 64 | execute( 65 | "INSERT INTO props_file_test VALUES ('55d5005b6e32ab5664606195', 42)"); 66 | assertEquals(2, collection.count(new Document("i", 42))); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /hive/src/test/java/com/mongodb/hadoop/hive/TestHDFSToMongoDBWithOptions.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.hive; 2 | 3 | import com.mongodb.DBObject; 4 | import com.mongodb.util.JSON; 5 | import org.junit.After; 6 | import org.junit.Before; 7 | import org.junit.Test; 8 | 9 | import java.sql.SQLException; 10 | import java.util.Map; 11 | import java.util.Set; 12 | 13 | import static org.junit.Assert.assertEquals; 14 | import static org.junit.Assert.assertNotEquals; 15 | import static org.junit.Assert.assertNotNull; 16 | import static org.junit.Assert.assertTrue; 17 | 18 | public class TestHDFSToMongoDBWithOptions extends HiveTest { 19 | @Before 20 | public void setUp() throws SQLException { 21 | loadDataIntoHDFSHiveTable(); 22 | loadDataIntoMongoDBHiveTable(true); 23 | } 24 | 25 | @After 26 | public void tearDown() throws SQLException { 27 | dropTable(MONGO_BACKED_TABLE); 28 | dropTable(HDFS_BACKED_TABLE); 29 | } 30 | 31 | @Test 32 | @SuppressWarnings("unchecked") 33 | public void testMongoMapping() { 34 | DBObject doc = getCollection(MONGO_COLLECTION).findOne(); 35 | String[] propsSplit = SERDE_PROPERTIES.split("="); 36 | 37 | int propsSplitLen = propsSplit.length; 38 | assertEquals(propsSplitLen % 2, 0); 39 | 40 | // now read in the 'mongo.columns.mapping' mapping 41 | String colsMap = null; 42 | for (int i = 0; i < propsSplit.length && colsMap == null; i++) { 43 | final String entry = propsSplit[i]; 44 | if (entry.toLowerCase().equals("'mongo.columns.mapping'") && i - 1 < propsSplitLen) { 45 | colsMap = propsSplit[i + 1]; 46 | } 47 | } 48 | 49 | assertNotNull(colsMap); 50 | // first remove '' around colsMap 51 | colsMap = colsMap.substring(1, colsMap.length() - 1); 52 | Set docKeys = doc.keySet(); 53 | 54 | for (String s : ((Map) JSON.parse(colsMap)).values()) { 55 | assertTrue(docKeys.contains(s)); 56 | } 57 | } 58 | 59 | @Test 60 | public void testCountSameTable() { 61 | Results hiveData = getAllDataFromTable(HDFS_BACKED_TABLE); 62 | Results mongoData = getAllDataFromTable(MONGO_BACKED_TABLE); 63 | 
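// Both tables were loaded from the same source data, so they should be non-empty and identical.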
assertNotEquals(hiveData.size(), 0); 64 | assertNotEquals(mongoData.size(), 0); 65 | 66 | assertEquals(hiveData, mongoData); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /hive/src/test/resources/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 16 | -------------------------------------------------------------------------------- /hive/src/test/resources/hivetable.properties: -------------------------------------------------------------------------------- 1 | mongo.uri=mongodb://localhost:27017/mongo_hadoop.tabletest 2 | mongo.input.query={"i": {"$mod": [2, 0]}} -------------------------------------------------------------------------------- /hive/src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # suppress inspection "UnusedProperty" for whole file 2 | log4j.rootLogger=info, stdout, R 3 | 4 | log4j.logger.com.mongodb=debug, R 5 | log4j.logger.org.apache=ERROR, R 6 | log4j.logger.com.jolbox.bonecp=ERROR, R 7 | log4j.logger.org.slf4j=ERROR, R 8 | log4j.logger.org.datanucleus=ERROR, R 9 | log4j.logger.org.datanucleus.util=ERROR, R 10 | log4j.logger.org.mortbay=ERROR, R 11 | log4j.logger.org.jboss=ERROR, R 12 | 13 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 14 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 15 | 16 | # Pattern to output the caller's file name and line number. 17 | log4j.appender.stdout.layout.ConversionPattern=%5p [%t] (%F:%L) - %m%n 18 | 19 | log4j.appender.R=org.apache.log4j.RollingFileAppender 20 | log4j.appender.R.File=/tmp/hive.log4j 21 | 22 | log4j.appender.R.MaxFileSize=100KB 23 | # Keep one backup file 24 | log4j.appender.R.MaxBackupIndex=1 25 | 26 | log4j.appender.R.layout=org.apache.log4j.PatternLayout 27 | log4j.appender.R.layout.ConversionPattern=%p %t %c - %m%n -------------------------------------------------------------------------------- /hive/src/test/resources/test_data.txt: -------------------------------------------------------------------------------- 1 | 1 Tom 28 2 | 2 Alice 18 3 | 3 Bob 29 4 | 101 Scott 10 5 | 102 Randall 100 6 | 103 Mike 100 7 | 104 Jesse 152 8 | -------------------------------------------------------------------------------- /hive/src/test/resources/users.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/hive/src/test/resources/users.bson -------------------------------------------------------------------------------- /hive/src/test/resources/yarn-site.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/ByteArrayTypeEvalFunc.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import org.apache.pig.EvalFunc; 4 | import org.apache.pig.data.DataType; 5 | import org.apache.pig.impl.logicalLayer.schema.Schema; 6 | 7 | /** 8 | * Convenience abstract implementation of Pig's EvalFunc that automatically 9 | * tells Pig that the return type of the UDF is a DataByteArray. 10 | * 11 | * Subclasses specify what subclass of DataByteArray to use in the type 12 | * parameter T. 
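 *
 * A minimal subclass sketch (mirroring GenMaxKey below; any of the boxed
 * BSON types in com.mongodb.hadoop.pig.udf.types can be plugged in the same way):
 * <pre>
 * public class GenMaxKey extends ByteArrayTypeEvalFunc&lt;PigBoxedMaxKey&gt; {
 *     public PigBoxedMaxKey exec(final Tuple input) throws IOException {
 *         return new PigBoxedMaxKey();
 *     }
 * }
 * </pre>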
13 | */ 14 | public abstract class ByteArrayTypeEvalFunc extends EvalFunc { 15 | @Override 16 | public Schema outputSchema(final Schema input) { 17 | return new Schema(new Schema.FieldSchema(null, DataType.BYTEARRAY)); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/GenMaxKey.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedMaxKey; 4 | import org.apache.pig.data.Tuple; 5 | 6 | import java.io.IOException; 7 | 8 | /** 9 | * Pig UDF that always returns MaxKey(). 10 | */ 11 | public class GenMaxKey extends ByteArrayTypeEvalFunc { 12 | @Override 13 | public PigBoxedMaxKey exec(final Tuple input) throws IOException { 14 | return new PigBoxedMaxKey(); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/GenMinKey.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedMinKey; 4 | import org.apache.pig.data.Tuple; 5 | 6 | import java.io.IOException; 7 | 8 | /** 9 | * Pig UDF that always returns MinKey(). 10 | */ 11 | public class GenMinKey extends ByteArrayTypeEvalFunc { 12 | @Override 13 | public PigBoxedMinKey exec(final Tuple input) throws IOException { 14 | return new PigBoxedMinKey(); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/ObjectIdToSeconds.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedObjectId; 4 | import org.apache.pig.EvalFunc; 5 | import org.apache.pig.data.DataByteArray; 6 | import org.apache.pig.data.DataType; 7 | import org.apache.pig.data.Tuple; 8 | import org.apache.pig.impl.logicalLayer.schema.Schema; 9 | import org.bson.types.ObjectId; 10 | 11 | import java.io.IOException; 12 | 13 | /** 14 | * Pig UDF that extracts the timestamp from an ObjectId. 
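 *
 * Accepts a PigBoxedObjectId, a 24-character hex String, or a raw
 * DataByteArray. Typical Pig usage, as exercised by the oidtoseconds.pig
 * test script:
 * <pre>
 * GENERATE com.mongodb.hadoop.pig.udf.ObjectIdToSeconds($0#'_id') AS seconds;
 * </pre>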
15 | */ 16 | public class ObjectIdToSeconds extends EvalFunc { 17 | 18 | public Integer exec(final Tuple input) throws IOException { 19 | if (null == input || input.size() == 0) { 20 | return null; 21 | } 22 | Object oid = input.get(0); 23 | if (oid instanceof PigBoxedObjectId) { 24 | return ((PigBoxedObjectId) oid).getObject().getTimestamp(); 25 | } else if (oid instanceof String) { 26 | return new ObjectId((String) oid).getTimestamp(); 27 | } else if (oid instanceof DataByteArray) { 28 | return new ObjectId(((DataByteArray) oid).get()).getTimestamp(); 29 | } 30 | throw new IOException( 31 | "Not an ObjectId, so cannot convert to seconds: " + oid); 32 | } 33 | 34 | @Override 35 | public Schema outputSchema(final Schema input) { 36 | return new Schema(new Schema.FieldSchema("seconds", DataType.INTEGER)); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/ToBinary.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedBinary; 4 | import org.apache.pig.data.DataByteArray; 5 | import org.apache.pig.data.Tuple; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Pig UDF that transforms the incoming value into a BSON Binary object. 11 | */ 12 | public class ToBinary extends ByteArrayTypeEvalFunc { 13 | @Override 14 | public PigBoxedBinary exec(final Tuple input) throws IOException { 15 | if (null == input || input.size() == 0) { 16 | return null; 17 | } 18 | Object o = input.get(0); 19 | if (o instanceof String) { 20 | return new PigBoxedBinary(((String) o).getBytes()); 21 | } else if (o instanceof DataByteArray) { 22 | return new PigBoxedBinary(((DataByteArray) o).get()); 23 | } 24 | throw new IOException( 25 | "Need String or DataByteArray to build a Binary, not " + o); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/ToDBRef.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedDBRef; 4 | import org.apache.pig.data.Tuple; 5 | import org.bson.types.ObjectId; 6 | 7 | import java.io.IOException; 8 | import java.util.Map; 9 | 10 | /** 11 | * Pig UDF that transforms the incoming value into a MongoDB DBRef. 
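 *
 * The input's first field must be a Map containing "$ref" (the referenced
 * collection name) and "$id" (an ObjectId hex String). Typical Pig usage,
 * as exercised by the todbref.pig test script:
 * <pre>
 * GENERATE com.mongodb.hadoop.pig.udf.ToDBRef($0) AS dbref;
 * </pre>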
12 | */ 13 | public class ToDBRef extends ByteArrayTypeEvalFunc { 14 | @Override 15 | public PigBoxedDBRef exec(final Tuple input) throws IOException { 16 | if (null == input || input.size() == 0) { 17 | return null; 18 | } 19 | Object o = input.get(0); 20 | if (o instanceof Map) { 21 | Object collectionName = ((Map) o).get("$ref"); 22 | Object id = ((Map) o).get("$id"); 23 | if (null == collectionName || null == id) { 24 | throw new IOException( 25 | "Map must contain both $ref and $id fields: " + o); 26 | } 27 | byte[] collectionNameBytes = 28 | ((String) collectionName).getBytes(); 29 | byte[] dbrefBytes = 30 | new byte[12 + 1 + collectionNameBytes.length]; 31 | byte[] oidBytes = new ObjectId((String) id).toByteArray(); 32 | System.arraycopy( 33 | collectionNameBytes, 0, 34 | dbrefBytes, 0, collectionNameBytes.length); 35 | dbrefBytes[collectionNameBytes.length] = 0; 36 | System.arraycopy( 37 | oidBytes, 0, 38 | dbrefBytes, collectionNameBytes.length + 1, 12); 39 | return new PigBoxedDBRef(dbrefBytes); 40 | } 41 | throw new IOException("Need a Map to build a DBRef, not " + o); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/ToObjectId.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf; 2 | 3 | import com.mongodb.hadoop.pig.udf.types.PigBoxedObjectId; 4 | import org.apache.pig.data.DataByteArray; 5 | import org.apache.pig.data.Tuple; 6 | import org.bson.types.ObjectId; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * UDF that transforms the incoming value into a BSON ObjectId. 12 | */ 13 | public class ToObjectId extends ByteArrayTypeEvalFunc { 14 | public PigBoxedObjectId exec(final Tuple input) throws IOException { 15 | if (null == input || input.size() == 0) { 16 | return null; 17 | } 18 | Object o = input.get(0); 19 | if (o instanceof String) { 20 | return new PigBoxedObjectId( 21 | new ObjectId((String) o).toByteArray()); 22 | } else if (o instanceof DataByteArray) { 23 | return new PigBoxedObjectId(((DataByteArray) o).get()); 24 | } 25 | throw new IOException( 26 | "Need a String or DataByteArray to build an ObjectId, not " + o); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedBSONValue.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import org.apache.pig.data.DataByteArray; 4 | 5 | public abstract class PigBoxedBSONValue extends DataByteArray { 6 | public PigBoxedBSONValue() {} 7 | 8 | public PigBoxedBSONValue(final byte[] b) { 9 | super(b); 10 | } 11 | 12 | public abstract T getObject(); 13 | } 14 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedBinary.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import org.bson.types.Binary; 4 | 5 | public class PigBoxedBinary extends PigBoxedBSONValue { 6 | public PigBoxedBinary(final byte[] b) { 7 | super(b); 8 | } 9 | 10 | @Override 11 | public Binary getObject() { 12 | return new Binary(get()); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedDBRef.java: 
-------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import com.mongodb.DBRef; 4 | import org.bson.types.ObjectId; 5 | 6 | import java.util.Arrays; 7 | 8 | public class PigBoxedDBRef extends PigBoxedBSONValue { 9 | public PigBoxedDBRef(final byte[] b) { 10 | super(b); 11 | } 12 | 13 | @Override 14 | public DBRef getObject() { 15 | byte[] bytes = get(); 16 | ObjectId id = new ObjectId( 17 | Arrays.copyOfRange(bytes, bytes.length - 12, bytes.length)); 18 | String collectionName = new String( 19 | Arrays.copyOfRange(bytes, 0, bytes.length - 13)); 20 | 21 | return new DBRef(collectionName, id); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedMaxKey.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import org.bson.types.MaxKey; 4 | 5 | public class PigBoxedMaxKey extends PigBoxedBSONValue { 6 | @Override 7 | public MaxKey getObject() { 8 | return new MaxKey(); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedMinKey.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import org.bson.types.MinKey; 4 | 5 | public class PigBoxedMinKey extends PigBoxedBSONValue { 6 | @Override 7 | public MinKey getObject() { 8 | return new MinKey(); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /pig/src/main/java/com/mongodb/hadoop/pig/udf/types/PigBoxedObjectId.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig.udf.types; 2 | 3 | import org.bson.types.ObjectId; 4 | 5 | public class PigBoxedObjectId extends PigBoxedBSONValue { 6 | public PigBoxedObjectId(final byte[] b) { 7 | super(b); 8 | } 9 | 10 | @Override 11 | public ObjectId getObject() { 12 | return new ObjectId(get()); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /pig/src/test/java/com/mongodb/hadoop/pig/BSONStorageTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig; 2 | 3 | import org.apache.pig.ResourceSchema; 4 | import org.apache.pig.impl.util.Utils; 5 | import org.junit.Test; 6 | 7 | import static org.junit.Assert.assertNull; 8 | 9 | 10 | public class BSONStorageTest { 11 | @Test 12 | public void testNullMap() throws Exception { 13 | ResourceSchema schema = new ResourceSchema(Utils.getSchemaFromString("m:map[]")); 14 | 15 | assertNull(BSONStorage.getTypeForBSON(null, schema.getFields()[0], null)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /pig/src/test/java/com/mongodb/hadoop/pig/MongoStorageTest.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.pig; 2 | 3 | import com.mongodb.BasicDBObjectBuilder; 4 | import com.mongodb.DBObject; 5 | import org.apache.pig.ResourceSchema; 6 | import org.apache.pig.impl.util.Utils; 7 | import org.junit.Test; 8 | 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | import java.util.Set; 12 | 13 | import static org.junit.Assert.assertEquals; 14 | 15 | public class 
MongoStorageTest { 16 | @Test 17 | public void testMap() throws Exception { 18 | MongoStorage ms = new MongoStorage(); 19 | BasicDBObjectBuilder builder = BasicDBObjectBuilder.start(); 20 | ResourceSchema schema = new ResourceSchema(Utils.getSchemaFromString("m:map[]")); 21 | 22 | Map val = new HashMap(); 23 | val.put("f1", 1); 24 | val.put("f2", "2"); 25 | 26 | ms.writeField(builder, schema.getFields()[0], val); 27 | 28 | DBObject out = builder.get(); 29 | 30 | Set outKeySet = out.keySet(); 31 | 32 | assertEquals(2, outKeySet.size()); 33 | assertEquals(1, out.get("f1")); 34 | assertEquals("2", out.get("f2")); 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /pig/src/test/java/helpers/TOBAG.java: -------------------------------------------------------------------------------- 1 | package helpers; 2 | 3 | import org.apache.pig.EvalFunc; 4 | import org.apache.pig.data.BagFactory; 5 | import org.apache.pig.data.DataBag; 6 | import org.apache.pig.data.Tuple; 7 | import org.apache.pig.data.TupleFactory; 8 | 9 | import java.io.IOException; 10 | 11 | /* 12 | * TOBAG : converts a tuple to a bag of one-item tuples 13 | */ 14 | public class TOBAG extends EvalFunc { 15 | private TupleFactory mTupleFactory = TupleFactory.getInstance(); 16 | private BagFactory mBagFactory = BagFactory.getInstance(); 17 | 18 | public DataBag exec(final Tuple input) throws IOException { 19 | if (input == null || input.size() == 0) { 20 | return null; 21 | } 22 | 23 | try { 24 | DataBag output = mBagFactory.newDefaultBag(); 25 | Tuple nested = (Tuple) input.get(0); 26 | for (Object o : nested.getAll()) { 27 | output.add(mTupleFactory.newTuple(o)); 28 | } 29 | 30 | return output; 31 | } catch (Exception e) { 32 | return null; 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /pig/src/test/resources/dump/test/persons_info.bson: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mongodb/mongo-hadoop/20208a027ad8638e56dfcf040773f176d6ee059f/pig/src/test/resources/dump/test/persons_info.bson -------------------------------------------------------------------------------- /pig/src/test/resources/dump/test/persons_info.metadata.json: -------------------------------------------------------------------------------- 1 | { "indexes" : [ { "v" : 1, "key" : { "_id" : 1 }, "ns" : "test.persons_info", "name" : "_id_" } ] } -------------------------------------------------------------------------------- /pig/src/test/resources/pig/bson_schemaless.pig: -------------------------------------------------------------------------------- 1 | REGISTER @PROJECT_HOME@/core/build/libs/mongo-hadoop-core-@PROJECT_VERSION@.jar 2 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@.jar 3 | 4 | -- Load data from BSON, providing no schema. 5 | persons_info = 6 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 7 | USING com.mongodb.hadoop.pig.BSONLoader; 8 | 9 | -- Insert into MongoDB. 10 | STORE persons_info 11 | INTO 'mongodb://localhost:27017/mongo_hadoop.bson_schemaless' 12 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 13 | 14 | -- Get the results back from mongo. 
15 | results = LOAD 'mongodb://localhost:27017/mongo_hadoop.bson_schemaless' 16 | USING com.mongodb.hadoop.pig.MongoLoader('first, last, age'); 17 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/bson_test.pig: -------------------------------------------------------------------------------- 1 | REGISTER @PROJECT_HOME@/core/build/libs/mongo-hadoop-core-@PROJECT_VERSION@.jar 2 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@.jar 3 | 4 | -- Load data from BSON. 5 | persons_info = 6 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 7 | USING com.mongodb.hadoop.pig.BSONLoader; 8 | 9 | -- Make sure the BSON doesn't already exist. 10 | rmf file://@PIG_RESOURCES@/pig/test_output 11 | 12 | STORE persons_info 13 | INTO 'file://@PIG_RESOURCES@/pig/test_output' 14 | USING com.mongodb.hadoop.pig.BSONStorage; 15 | 16 | persons_read = 17 | LOAD 'file://@PIG_RESOURCES@/pig/test_output' 18 | USING com.mongodb.hadoop.pig.BSONLoader( 19 | 'id', 'first: chararray, last: chararray, age: double') 20 | AS (first: chararray, last: chararray, age: double); 21 | 22 | DUMP persons_read; 23 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/datestest.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.pigtests' 3 | USING com.mongodb.hadoop.pig.MongoLoader('today:datetime'); 4 | 5 | STORE data 6 | INTO 'mongodb://localhost:27017/mongo_hadoop.datetest' 7 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 8 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/ensure_index.pig: -------------------------------------------------------------------------------- 1 | -- Load data from BSON. 2 | persons_info = 3 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 4 | USING com.mongodb.hadoop.pig.BSONLoader( 5 | 'id', 'first: chararray, last: chararray, age: double') 6 | AS (first: chararray, last: chararray, age: double); 7 | 8 | -- Dump into mongo, ensure index on last name. 9 | STORE persons_info 10 | INTO 'mongodb://localhost:27017/mongo_hadoop.ensure_indexes' 11 | USING com.mongodb.hadoop.pig.MongoStorage( 12 | '{last: 1}, {}' 13 | ); 14 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/ensure_index_2.pig: -------------------------------------------------------------------------------- 1 | -- Load data from BSON. 2 | persons_info = 3 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 4 | USING com.mongodb.hadoop.pig.BSONLoader( 5 | 'id', 'first: chararray, last: chararray, age: double') 6 | AS (first: chararray, last: chararray, age: double); 7 | 8 | -- Dump into mongo, ensure index on first name. 
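-- As used in these test scripts, the MongoStorage argument pairs an index key
-- specification with an index options document, so '{first: 1}, {}' requests an
-- ascending index on "first" with default options.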
9 | STORE persons_info 10 | INTO 'mongodb://localhost:27017/mongo_hadoop.ensure_indexes' 11 | USING com.mongodb.hadoop.pig.MongoStorage( 12 | '{first: 1}, {}' 13 | ); -------------------------------------------------------------------------------- /pig/src/test/resources/pig/genminmaxkeys.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader; 4 | 5 | create_min_max_keys = 6 | FOREACH data 7 | GENERATE com.mongodb.hadoop.pig.udf.GenMaxKey() AS newMax, 8 | com.mongodb.hadoop.pig.udf.GenMinKey() AS newMin; 9 | 10 | STORE create_min_max_keys 11 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 12 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 13 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/oidtoseconds.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader; 4 | 5 | calc_seconds = 6 | FOREACH data 7 | GENERATE com.mongodb.hadoop.pig.udf.ToObjectId($0#'_id') AS id, 8 | com.mongodb.hadoop.pig.udf.ObjectIdToSeconds($0#'_id') AS seconds, 9 | -- Make sure we can nest UDFs. 10 | com.mongodb.hadoop.pig.udf.ObjectIdToSeconds( 11 | com.mongodb.hadoop.pig.udf.ToObjectId($0#'_id')) AS seconds2; 12 | 13 | STORE calc_seconds 14 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 15 | USING com.mongodb.hadoop.pig.MongoInsertStorage('id'); 16 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/pig_uuid.pig: -------------------------------------------------------------------------------- 1 | REGISTER @PROJECT_HOME@/core/build/libs/mongo-hadoop-core-@PROJECT_VERSION@.jar 2 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@.jar 3 | 4 | uuids = 5 | LOAD 'mongodb://localhost:27017/mongo_hadoop.uuid_test' 6 | USING com.mongodb.hadoop.pig.MongoLoader('uuid'); 7 | 8 | STORE uuids INTO 'test_results'; 9 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/projection.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.projection_test' 3 | USING com.mongodb.hadoop.pig.MongoLoader('id:chararray,i:int,d:[]', 'id'); 4 | 5 | -- Pig only pushes projections with subfields when the outer field is a map (d). 
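-- With the schema above ('id:chararray,i:int,d:[]'), the FOREACH below lets
-- MongoLoader push a projection down to MongoDB covering roughly
-- {"i": 1, "d.s": 1, "d.k": 1}, so only those fields need to be fetched
-- (the exact projection document shown here is an assumption).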
6 | projected = 7 | FOREACH data 8 | GENERATE $1 AS age, d#'s' AS name, d#'k' AS ssn; 9 | 10 | STORE projected INTO 'test_results'; 11 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/replace_mus.pig: -------------------------------------------------------------------------------- 1 | documents = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.replace_test' 3 | USING com.mongodb.hadoop.pig.MongoLoader('id:chararray,i:int', 'id'); 4 | 5 | increment_number = 6 | FOREACH documents 7 | GENERATE com.mongodb.hadoop.pig.udf.ToObjectId(id) AS id, 8 | i + 1 AS i; 9 | 10 | STORE increment_number 11 | INTO 'mongodb://localhost:27017/mongo_hadoop.replace_test' 12 | USING com.mongodb.hadoop.pig.MongoUpdateStorage( 13 | '{_id:"\$id"}', -- query 14 | '{i:"\$i"}', -- replacement 15 | 'id:bytearray,i:int', -- schema 16 | '', -- toIgnore (none) 17 | '{replace:true}' -- update options 18 | ); 19 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/schemaless.pig: -------------------------------------------------------------------------------- 1 | -- no schema provided 2 | data = LOAD 'mongodb://localhost:27017/mongo_hadoop.pig.schemaless' 3 | USING com.mongodb.hadoop.pig.MongoLoader; 4 | 5 | -- no schema or id provided 6 | STORE data INTO 'mongodb://localhost:27017/mongo_hadoop.pig.schemaless.out' 7 | USING com.mongodb.hadoop.pig.MongoInsertStorage; -------------------------------------------------------------------------------- /pig/src/test/resources/pig/tobinary.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader('binary:bytearray'); 4 | 5 | create_bson_binary = 6 | FOREACH data 7 | GENERATE com.mongodb.hadoop.pig.udf.ToBinary(binary) AS binary; 8 | 9 | STORE create_bson_binary 10 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 11 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 12 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/todbref.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader('dbref:[]'); 4 | 5 | create_dbref = 6 | FOREACH data 7 | GENERATE com.mongodb.hadoop.pig.udf.ToDBRef($0) AS dbref; 8 | 9 | STORE create_dbref 10 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 11 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 12 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/toobjectid.pig: -------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader( 4 | 'id:chararray,oidBytes:bytearray', 'id'); 5 | 6 | create_objids = 7 | FOREACH data 8 | GENERATE com.mongodb.hadoop.pig.udf.ToObjectId(id) AS id, 9 | com.mongodb.hadoop.pig.udf.ToObjectId(oidBytes) AS otherid; 10 | 11 | STORE create_objids 12 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 13 | USING com.mongodb.hadoop.pig.MongoInsertStorage('id'); 14 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/udfschemaless.pig: 
-------------------------------------------------------------------------------- 1 | data = 2 | LOAD 'mongodb://localhost:27017/mongo_hadoop.udftest.input' 3 | USING com.mongodb.hadoop.pig.MongoLoader; 4 | 5 | create_objids = 6 | FOREACH data 7 | GENERATE com.mongodb.hadoop.pig.udf.ToObjectId($0#'_id'); 8 | 9 | STORE create_objids 10 | INTO 'mongodb://localhost:27017/mongo_hadoop.udftest.output' 11 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 12 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/update_age_alabis_mus.pig: -------------------------------------------------------------------------------- 1 | REGISTER @PROJECT_HOME@/core/build/libs/mongo-hadoop-core-@PROJECT_VERSION@.jar 2 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@.jar 3 | 4 | -- Load data from BSON. 5 | persons_info = 6 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 7 | USING com.mongodb.hadoop.pig.BSONLoader( 8 | 'id', 'first: chararray, last: chararray, age: double') 9 | AS (first: chararray, last: chararray, age: double); 10 | 11 | -- Insert into MongoDB. 12 | STORE persons_info 13 | INTO 'mongodb://localhost:27017/mongo_hadoop.update_mus' 14 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 15 | 16 | -- Perform the update (everyone gets a little older). 17 | STORE persons_info INTO 'mongodb://localhost:27017/mongo_hadoop.update_mus' 18 | USING com.mongodb.hadoop.pig.MongoUpdateStorage( 19 | '{}', 20 | '{\$inc:{age:1}}', 21 | 'first, last, age', '', 22 | '{multi : true}'); 23 | 24 | -- Get the results back from mongo. 25 | results = 26 | LOAD 'mongodb://localhost:27017/mongo_hadoop.update_mus' 27 | USING com.mongodb.hadoop.pig.MongoLoader('first, last, age'); 28 | -------------------------------------------------------------------------------- /pig/src/test/resources/pig/update_simple_mus.pig: -------------------------------------------------------------------------------- 1 | REGISTER @PROJECT_HOME@/core/build/libs/mongo-hadoop-core-@PROJECT_VERSION@.jar 2 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@.jar 3 | REGISTER @PROJECT_HOME@/pig/build/libs/mongo-hadoop-pig-@PROJECT_VERSION@-tests.jar 4 | 5 | -- Load data from BSON. 6 | persons_info = 7 | LOAD '@PROJECT_HOME@/pig/src/test/resources/dump/test/persons_info.bson' 8 | USING com.mongodb.hadoop.pig.BSONLoader; 9 | 10 | -- Parse data from BSON into tuples so we can address fields when doing an 11 | -- update. Explicitly define the schema for the 'cars' bag so we can write it 12 | -- out later with MongoInsertStorage. 13 | to_store = 14 | FOREACH persons_info 15 | GENERATE 16 | $0#'first' as first, 17 | $0#'last' as last, 18 | helpers.TOBAG($0#'cars') as cars: bag{t: tuple(car: chararray)}; 19 | 20 | -- Insert into MongoDB. 21 | STORE to_store 22 | INTO 'mongodb://localhost:27017/mongo_hadoop.update_mus' 23 | USING com.mongodb.hadoop.pig.MongoInsertStorage; 24 | 25 | -- Perform the update (everyone gets 2x their cars). 26 | STORE to_store 27 | INTO 'mongodb://localhost:27017/mongo_hadoop.update_mus' 28 | USING com.mongodb.hadoop.pig.MongoUpdateStorage( 29 | '{first:"\$first", last:"\$last"}', 30 | '{\$pushAll:{cars:"\$cars"}}'); 31 | 32 | -- Get the results back from mongo. 
33 | results = LOAD 'mongodb://localhost:27017/mongo_hadoop.update_mus' 34 | USING com.mongodb.hadoop.pig.MongoLoader('first, last, cars'); 35 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | include 'core', 'hive', 'pig', 'streaming', 'flume', 2 | 'spark', 'examples/treasury_yield', 'examples/enron', 3 | 'examples/enron/spark', 'examples/sensors', 4 | 'examples/shakespeare' 5 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/PySparkBSONFileInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark; 2 | 3 | import com.mongodb.hadoop.BSONFileInputFormat; 4 | import com.mongodb.spark.pickle.RegisterConstructors; 5 | import com.mongodb.spark.pickle.RegisterPickles; 6 | 7 | public class PySparkBSONFileInputFormat extends BSONFileInputFormat { 8 | private static final RegisterPickles PICKLES = new RegisterPickles(); 9 | private static final RegisterConstructors CONSTRUCTORS = 10 | new RegisterConstructors(); 11 | 12 | static { 13 | PICKLES.register(); 14 | CONSTRUCTORS.register(); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/PySparkBSONFileOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark; 2 | 3 | import com.mongodb.hadoop.BSONFileOutputFormat; 4 | import com.mongodb.spark.pickle.RegisterConstructors; 5 | import com.mongodb.spark.pickle.RegisterPickles; 6 | 7 | public class PySparkBSONFileOutputFormat 8 | extends BSONFileOutputFormat { 9 | private static final RegisterPickles PICKLES = new RegisterPickles(); 10 | private static final RegisterConstructors CONSTRUCTORS = 11 | new RegisterConstructors(); 12 | 13 | static { 14 | PICKLES.register(); 15 | CONSTRUCTORS.register(); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/PySparkMongoInputFormat.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark; 2 | 3 | import com.mongodb.hadoop.MongoInputFormat; 4 | import com.mongodb.spark.pickle.RegisterConstructors; 5 | import com.mongodb.spark.pickle.RegisterPickles; 6 | 7 | /** 8 | * InputFormat that attaches custom Picklers and IObjectConstructors for 9 | * reading and writing BSON types with PyMongo. 
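 *
 * A rough PySpark usage sketch; the pymongo_spark helper module shipped under
 * spark/src/main/python is assumed to be on the Python path:
 * <pre>
 * import pymongo_spark
 * pymongo_spark.activate()
 * rdd = sc.mongoRDD('mongodb://localhost:27017/db.collection')
 * </pre>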
10 | */ 11 | public class PySparkMongoInputFormat extends MongoInputFormat { 12 | private static final RegisterPickles PICKLES = new RegisterPickles(); 13 | private static final RegisterConstructors CONSTRUCTORS = 14 | new RegisterConstructors(); 15 | 16 | static { 17 | PICKLES.register(); 18 | CONSTRUCTORS.register(); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/PySparkMongoOutputFormat.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark; 2 | 3 | import com.mongodb.hadoop.MongoOutputFormat; 4 | import com.mongodb.spark.pickle.RegisterConstructors; 5 | import com.mongodb.spark.pickle.RegisterPickles; 6 | 7 | public class PySparkMongoOutputFormat 8 | extends MongoOutputFormat { 9 | private static final RegisterPickles PICKLES = new RegisterPickles(); 10 | private static final RegisterConstructors CONSTRUCTORS = 11 | new RegisterConstructors(); 12 | 13 | static { 14 | PICKLES.register(); 15 | CONSTRUCTORS.register(); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/BSONValueBox.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.io.Writable; 5 | import org.bson.BasicBSONObject; 6 | import org.bson.Transformer; 7 | 8 | import java.io.DataInput; 9 | import java.io.DataOutput; 10 | import java.io.IOException; 11 | import java.io.Serializable; 12 | 13 | /** 14 | * Base class for containers that hold BSON values. 15 | * These containers are used when unpickling objects from Python. Generally, 16 | * these objects implement a "__setstate__" method that allows their internal 17 | * state to be set after they are created. 18 | * 19 | * @param the type of BSON value to be held. 20 | */ 21 | abstract class BSONValueBox implements Writable, Serializable { 22 | 23 | private static final Transformer TRANSFORMER = new Transformer() { 24 | @Override 25 | public Object transform(final Object objectToTransform) { 26 | if (!(objectToTransform instanceof BSONValueBox)) { 27 | throw new IllegalArgumentException( 28 | "Can only transform instances of BSONValueBox, not " 29 | + objectToTransform); 30 | } 31 | return ((BSONValueBox) objectToTransform).get(); 32 | } 33 | }; 34 | 35 | public abstract T get(); 36 | 37 | static Transformer getTransformer() { 38 | return TRANSFORMER; 39 | } 40 | 41 | /** 42 | * Inflate a BSONValueBox from a DataInput. 43 | * This method is here so that BSONValueBox implements Hadoop's Writable 44 | * interface, which is a requirement to use this type with Spark Hadoop 45 | * RDDs. However, you should never call this method directly. 46 | * 47 | * @param in the DataInput. 48 | * @throws IOException is always thrown when this method is called. 49 | */ 50 | @Override 51 | public void readFields(final DataInput in) throws IOException { 52 | throw new IOException("Cannot read fields into a BSONValueBox."); 53 | } 54 | 55 | /** 56 | * Write a BSONValueBox type to a DataOutput. 57 | * This method is here so that BSONValueBox implements Hadoop's Writable 58 | * interface, which is a requirement to use this type with Spark's Hadoop 59 | * RDDs. 
Calling this method will write into the output a document of the 60 | * form: 61 | * 62 | * {"value": (boxed value)} 63 | * 64 | * @param out the DataOutput 65 | * @throws IOException when there is an error writing to the DataOutput 66 | */ 67 | @Override 68 | public void write(final DataOutput out) throws IOException { 69 | (new BSONWritable(new BasicBSONObject("value", get()))).write(out); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/BinaryConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.types.Binary; 7 | 8 | import java.util.HashMap; 9 | 10 | public class BinaryConstructor implements IObjectConstructor { 11 | 12 | public static class BinaryBox extends BSONValueBox { 13 | private Binary value = null; 14 | static { 15 | BSON.addEncodingHook(BinaryBox.class, getTransformer()); 16 | } 17 | 18 | public BinaryBox(final String data, final int type) { 19 | byte[] byteData = new byte[data.length()]; 20 | for (int i = 0; i < byteData.length; ++i) { 21 | byteData[i] = (byte) data.charAt(i); 22 | } 23 | this.value = new Binary((byte) type, byteData); 24 | } 25 | 26 | // CHECKSTYLE:OFF 27 | public void __setstate__(final HashMap hm) { 28 | // State has already been set from constructor. 29 | } 30 | // CHECKSTYLE:ON 31 | 32 | @Override 33 | public Binary get() { 34 | return value; 35 | } 36 | } 37 | 38 | @Override 39 | public Object construct(final Object[] args) { 40 | if (args.length != 2) { 41 | throw new PickleException( 42 | "Binary constructor requires 2 arguments, not " + args.length); 43 | } 44 | if (!((args[0] instanceof String) && (args[1] instanceof Integer))) { 45 | throw new PickleException( 46 | "Binary constructor takes a String and an Integer, " 47 | + "not a " + args[0].getClass().getName() 48 | + " and a " + args[1].getClass().getName()); 49 | } 50 | return new BinaryBox((String) args[0], (Integer) args[1]); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/CalendarTransformer.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import org.bson.Transformer; 4 | 5 | import java.util.Calendar; 6 | import java.util.TimeZone; 7 | 8 | /** 9 | * Transformer that turns java.util.Calendar objects into java.util.Date 10 | * objects. 11 | * 12 | * This class is needed because Spark constructs pickled Python 13 | * datetime.datetime objects into java.util.GregorianCalendar instances instead 14 | * of java.util.Date objects. 
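 *
 * RegisterConstructors installs this transformer as a BSON encoding hook:
 * <pre>
 * BSON.addEncodingHook(java.util.GregorianCalendar.class, new CalendarTransformer());
 * </pre>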
15 | */ 16 | public class CalendarTransformer implements Transformer { 17 | @Override 18 | public Object transform(final Object objectToTransform) { 19 | Calendar calendar = (Calendar) objectToTransform; 20 | calendar.setTimeZone(TimeZone.getTimeZone("UTC")); 21 | return calendar.getTime(); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/CodeConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.BasicBSONObject; 7 | import org.bson.types.Code; 8 | import org.bson.types.CodeWScope; 9 | 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | 13 | public class CodeConstructor implements IObjectConstructor { 14 | 15 | public static class CodeBox extends BSONValueBox { 16 | private String code; 17 | private Code value; 18 | static { 19 | BSON.addEncodingHook(CodeBox.class, getTransformer()); 20 | } 21 | 22 | public CodeBox(final String code) { 23 | this.code = code; 24 | } 25 | 26 | // CHECKSTYLE:OFF 27 | public void __setstate__(final HashMap state) { 28 | // CHECKSTYLE:ON 29 | Object scope = state.get("_Code__scope"); 30 | if (!(scope instanceof Map)) { 31 | throw new PickleException( 32 | "Expected a Map for key \"_Code__scope\", not a " 33 | + scope.getClass().getName()); 34 | } 35 | Map scopeMap = (Map) scope; 36 | if (!scopeMap.isEmpty()) { 37 | this.value = new CodeWScope(this.code, 38 | new BasicBSONObject(scopeMap)); 39 | } else { 40 | this.value = new Code(this.code); 41 | } 42 | } 43 | 44 | @Override 45 | public Code get() { 46 | return value; 47 | } 48 | } 49 | 50 | @Override 51 | public Object construct(final Object[] args) { 52 | if (args.length != 1) { 53 | throw new PickleException( 54 | "Code constructor requires 1 argument, not " + args.length); 55 | } 56 | if (!(args[0] instanceof String)) { 57 | throw new PickleException( 58 | "Code constructor requries a String, not a " 59 | + args[0].getClass().getName()); 60 | } 61 | return new CodeBox((String) args[0]); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/DBRefConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import com.mongodb.DBRef; 4 | import net.razorvine.pickle.IObjectConstructor; 5 | import net.razorvine.pickle.PickleException; 6 | import org.bson.BSON; 7 | 8 | import java.util.HashMap; 9 | 10 | public class DBRefConstructor implements IObjectConstructor { 11 | 12 | public static class DBRefBox extends BSONValueBox { 13 | private DBRef value; 14 | static { 15 | BSON.addEncodingHook(DBRefBox.class, getTransformer()); 16 | } 17 | 18 | // CHECKSTYLE:OFF 19 | public void __setstate__(final HashMap state) { 20 | // CHECKSTYLE:ON 21 | Object collection = state.get("_DBRef__collection"); 22 | if (!(collection instanceof String)) { 23 | throw new PickleException( 24 | "Expected a String for key \"_DBRef__colledction\", not a " 25 | + collection.getClass().getName()); 26 | } 27 | this.value = new DBRef( 28 | (String) collection, state.get("_DBRef__id")); 29 | } 30 | 31 | @Override 32 | public DBRef get() { 33 | return value; 34 | } 35 | } 36 | 37 | @Override 38 | public Object construct(final Object[] args) 
{ 39 | if (args.length != 0) { 40 | throw new PickleException( 41 | "DBRef constructor requires 0 arguments, not " + args.length); 42 | } 43 | return new DBRefBox(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/Int64Constructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | 7 | import java.util.HashMap; 8 | 9 | public class Int64Constructor implements IObjectConstructor { 10 | 11 | public static class Int64Box extends BSONValueBox { 12 | private Long value; 13 | static { 14 | BSON.addEncodingHook(Int64Box.class, getTransformer()); 15 | } 16 | 17 | public Int64Box(final Long value) { 18 | this.value = value; 19 | } 20 | 21 | // CHECKSTYLE:OFF 22 | public void __setstate__(HashMap state) { 23 | // No state to set. 24 | } 25 | // CHECKSTYLE:ON 26 | 27 | @Override 28 | public Long get() { 29 | return this.value; 30 | } 31 | } 32 | 33 | @Override 34 | public Object construct(final Object[] args) { 35 | if (args.length != 1) { 36 | throw new PickleException( 37 | "Int64 constructor requires 1 argument, not " + args.length); 38 | } 39 | if (!((args[0] instanceof Integer) || (args[0] instanceof Long))) { 40 | throw new PickleException( 41 | "Int64 constructor requires an Integer or Long, not a " 42 | + args[0].getClass().getName()); 43 | } 44 | return new Int64Box((Long) args[0]); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/MaxKeyConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.types.MaxKey; 7 | 8 | import java.util.HashMap; 9 | 10 | public class MaxKeyConstructor implements IObjectConstructor { 11 | 12 | public static class MaxKeyBox extends BSONValueBox { 13 | private static final MaxKey MAX_KEY = new MaxKey(); 14 | static { 15 | BSON.addEncodingHook(MaxKeyBox.class, getTransformer()); 16 | } 17 | 18 | // CHECKSTYLE:OFF 19 | public void __setstate__(final HashMap state) { 20 | // no state to set here. 
21 | } 22 | // CHECKSTYLE:ON 23 | 24 | @Override 25 | public MaxKey get() { 26 | return MAX_KEY; 27 | } 28 | } 29 | 30 | @Override 31 | public Object construct(final Object[] args) { 32 | if (args.length != 0) { 33 | throw new PickleException( 34 | "MaxKey constructor requires 0 arguments, not " + args.length); 35 | } 36 | return new MaxKeyBox(); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/MinKeyConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.types.MinKey; 7 | 8 | import java.util.HashMap; 9 | 10 | public class MinKeyConstructor implements IObjectConstructor { 11 | 12 | public static class MinKeyBox extends BSONValueBox { 13 | private static final MinKey MIN_KEY = new MinKey(); 14 | static { 15 | BSON.addEncodingHook(MinKeyBox.class, getTransformer()); 16 | } 17 | 18 | // CHECKSTYLE:OFF 19 | public void __setstate__(final HashMap state) { 20 | // no state to set here. 21 | } 22 | // CHECKSTYLE:ON 23 | 24 | @Override 25 | public MinKey get() { 26 | return MIN_KEY; 27 | } 28 | } 29 | 30 | @Override 31 | public Object construct(final Object[] args) { 32 | if (args.length != 0) { 33 | throw new PickleException( 34 | "MinKey constructor requires 0 arguments, not " + args.length); 35 | } 36 | return new MinKeyBox(); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/ObjectIdConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.types.ObjectId; 7 | 8 | public class ObjectIdConstructor implements IObjectConstructor { 9 | 10 | public static class ObjectIdBox extends BSONValueBox { 11 | private ObjectId oid; 12 | static { 13 | BSON.addEncodingHook(ObjectIdBox.class, getTransformer()); 14 | } 15 | 16 | // CHECKSTYLE:OFF 17 | public void __setstate__(final String state) { 18 | // CHECKSTYLE:ON 19 | byte[] oidBytes = new byte[state.length()]; 20 | for (int i = 0; i < state.length(); ++i) { 21 | oidBytes[i] = (byte) state.charAt(i); 22 | } 23 | this.oid = new ObjectId(oidBytes); 24 | } 25 | 26 | @Override 27 | public ObjectId get() { 28 | return this.oid; 29 | } 30 | } 31 | 32 | @Override 33 | public Object construct(final Object[] args) { 34 | if (args.length != 0) { 35 | throw new PickleException( 36 | "ObjectId constructor requires 0 arguments, not " + args.length); 37 | } 38 | return new ObjectIdBox(); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/RegexConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | 7 | import java.util.HashMap; 8 | import java.util.regex.Pattern; 9 | 10 | public class RegexConstructor implements IObjectConstructor { 11 | 12 | public static class RegexBox extends BSONValueBox { 13 | private Pattern value; 14 
| static { 15 | BSON.addEncodingHook(RegexBox.class, getTransformer()); 16 | } 17 | 18 | private static int pythonFlagsToJavaFlags(final int pythonFlags) { 19 | int javaFlags = 0; 20 | if ((pythonFlags & 2) > 0) { 21 | javaFlags |= Pattern.CASE_INSENSITIVE; 22 | } 23 | if ((pythonFlags & 64) > 0) { 24 | javaFlags |= Pattern.COMMENTS; 25 | } 26 | if ((pythonFlags & 16) > 0) { 27 | javaFlags |= Pattern.DOTALL; 28 | } 29 | if ((pythonFlags & 8) > 0) { 30 | javaFlags |= Pattern.MULTILINE; 31 | } 32 | if ((pythonFlags & 32) > 0) { 33 | // 0x100 == Pattern.UNICODE_CHARACTER_CLASS in Java >= 7. 34 | javaFlags |= (Pattern.UNICODE_CASE | 0x100); 35 | } 36 | return javaFlags; 37 | } 38 | 39 | @SuppressWarnings("MagicConstant") 40 | // CHECKSTYLE:OFF 41 | public void __setstate__(final HashMap state) { 42 | // CHECKSTYLE:ON 43 | Object pattern = state.get("pattern"); 44 | Object flags = state.get("flags"); 45 | if (!((pattern instanceof String) && (flags instanceof Integer))) { 46 | throw new PickleException( 47 | "Expected a String for key \"pattern\" and an Integer for " 48 | + "key \"flags\", not a " + pattern.getClass().getName() 49 | + " and a " + flags.getClass().getName()); 50 | } 51 | value = Pattern.compile( 52 | (String) pattern, pythonFlagsToJavaFlags((Integer) flags)); 53 | } 54 | 55 | @Override 56 | public Pattern get() { 57 | return value; 58 | } 59 | } 60 | 61 | @Override 62 | public Object construct(final Object[] args) { 63 | if (args.length != 0) { 64 | throw new PickleException( 65 | "Regex constructor requires 0 arguments, not " + args.length); 66 | } 67 | return new RegexBox(); 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/RegisterConstructors.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.Unpickler; 4 | import org.bson.BSON; 5 | 6 | public class RegisterConstructors { 7 | public void register() { 8 | Unpickler.registerConstructor("bson.binary", "Binary", 9 | new com.mongodb.spark.pickle.BinaryConstructor()); 10 | Unpickler.registerConstructor("bson.code", "Code", 11 | new com.mongodb.spark.pickle.CodeConstructor()); 12 | Unpickler.registerConstructor("bson.dbref", "DBRef", 13 | new com.mongodb.spark.pickle.DBRefConstructor()); 14 | Unpickler.registerConstructor("bson.int64", "Int64", 15 | new com.mongodb.spark.pickle.Int64Constructor()); 16 | Unpickler.registerConstructor("bson.max_key", "MaxKey", 17 | new com.mongodb.spark.pickle.MaxKeyConstructor()); 18 | Unpickler.registerConstructor("bson.min_key", "MinKey", 19 | new com.mongodb.spark.pickle.MinKeyConstructor()); 20 | Unpickler.registerConstructor("bson.timestamp", "Timestamp", 21 | new com.mongodb.spark.pickle.TimestampConstructor()); 22 | Unpickler.registerConstructor("bson.regex", "Regex", 23 | new com.mongodb.spark.pickle.RegexConstructor()); 24 | Unpickler.registerConstructor("bson.objectid", "ObjectId", 25 | new com.mongodb.spark.pickle.ObjectIdConstructor()); 26 | 27 | BSON.addEncodingHook( 28 | java.util.GregorianCalendar.class, 29 | new CalendarTransformer()); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/RegisterPickles.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.Pickler; 4 | 5 | public class 
RegisterPickles { 6 | private static final BSONPickler PICKLER = new BSONPickler(); 7 | 8 | public void register() { 9 | Pickler.registerCustomPickler(org.bson.types.ObjectId.class, PICKLER); 10 | Pickler.registerCustomPickler(org.bson.types.Binary.class, PICKLER); 11 | Pickler.registerCustomPickler(org.bson.types.Code.class, PICKLER); 12 | Pickler.registerCustomPickler(org.bson.types.CodeWScope.class, PICKLER); 13 | Pickler.registerCustomPickler( 14 | org.bson.types.CodeWithScope.class, PICKLER); 15 | Pickler.registerCustomPickler(org.bson.types.MaxKey.class, PICKLER); 16 | Pickler.registerCustomPickler(org.bson.types.MinKey.class, PICKLER); 17 | Pickler.registerCustomPickler( 18 | org.bson.types.BSONTimestamp.class, PICKLER); 19 | Pickler.registerCustomPickler(com.mongodb.DBRef.class, PICKLER); 20 | Pickler.registerCustomPickler(java.util.regex.Pattern.class, PICKLER); 21 | Pickler.registerCustomPickler(java.util.Date.class, PICKLER); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /spark/src/main/java/com/mongodb/spark/pickle/TimestampConstructor.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle; 2 | 3 | import net.razorvine.pickle.IObjectConstructor; 4 | import net.razorvine.pickle.PickleException; 5 | import org.bson.BSON; 6 | import org.bson.types.BSONTimestamp; 7 | 8 | import java.util.HashMap; 9 | 10 | public class TimestampConstructor implements IObjectConstructor { 11 | 12 | public static class TimestampBox extends BSONValueBox { 13 | private BSONTimestamp value; 14 | static { 15 | BSON.addEncodingHook(TimestampBox.class, getTransformer()); 16 | } 17 | 18 | // CHECKSTYLE:OFF 19 | public void __setstate__(final HashMap state) { 20 | // CHECKSTYLE:ON 21 | Object time = state.get("_Timestamp__time"); 22 | Object inc = state.get("_Timestamp__inc"); 23 | if (!((time instanceof Integer) && (inc instanceof Integer))) { 24 | throw new PickleException( 25 | "Excpected Integer for keys \"_Timestamp__time\" and " 26 | + "\"Timestamp__inc\", not a " 27 | + time.getClass().getName() + " and a " 28 | + inc.getClass().getName()); 29 | } 30 | value = new BSONTimestamp((Integer) time, (Integer) inc); 31 | } 32 | 33 | public BSONTimestamp get() { 34 | return value; 35 | } 36 | } 37 | 38 | @Override 39 | public Object construct(final Object[] args) { 40 | if (args.length != 0) { 41 | throw new PickleException( 42 | "Timestamp constructor requires 0 arguments, not " + args.length); 43 | } 44 | return new TimestampBox(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark/src/main/python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # Copyright 2015 MongoDB, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
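# Typical usage (standard setuptools workflow; a sketch, not project-specific docs):
#   python setup.py install      # install pymongo-spark locally
#   python setup.py bdist_egg    # build an egg to ship via spark-submit --py-files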
15 | 16 | _classifiers = """ 17 | Development Status :: 4 - Beta 18 | Intended Audience :: Developers 19 | License :: OSI Approved :: Apache Software License 20 | Operating System :: OS Independent 21 | Programming Language :: Python :: 2.6 22 | Programming Language :: Python :: 2.7 23 | Topic :: Database :: Front-Ends 24 | Topic :: Scientfic/Engineering :: Interface Engine/Protocol Translator 25 | """ 26 | 27 | try: 28 | from setuptools import setup, find_packages 29 | except ImportError: 30 | from ez_setup import use_setuptools 31 | use_setuptools() 32 | from setuptools import setup, find_packages 33 | 34 | extra_opts = {} 35 | try: 36 | with open('README.rst', 'r') as fd: 37 | extra_opts['long_description'] = fd.read() 38 | except IOError: 39 | pass # Install without README.rst 40 | 41 | setup( 42 | name='pymongo-spark', 43 | version='0.1.dev0', 44 | author='MongoDB, Inc.', 45 | author_email='mongodb-user@googlegroups.com', 46 | description='Utilities for using Spark with PyMongo', 47 | keywords=['spark', 'mongodb', 'mongo', 'hadoop', 'pymongo'], 48 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 49 | platforms=['any'], 50 | url='https://github.com/mongodb/mongo-hadoop', 51 | install_requires=['pymongo>=3.0.3'], 52 | packages=find_packages(exclude=('test',)), 53 | classifiers=_classifiers.splitlines(), 54 | test_suite='test', 55 | **extra_opts 56 | ) 57 | -------------------------------------------------------------------------------- /spark/src/main/scala/com/mongodb/spark/pickle/NoopConverter.scala: -------------------------------------------------------------------------------- 1 | package com.mongodb.spark.pickle 2 | 3 | import org.apache.spark.api.python.Converter 4 | 5 | 6 | class NoopConverter extends Converter[Any, Any] { 7 | override def convert(obj: Any): Any = { obj } 8 | } 9 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_map.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | var node_mongo_hadoop = require('node_mongo_hadoop') 4 | 5 | 6 | var trimString = function(str){ 7 | return String(str).replace(/^\s+|\s+$/g, ''); 8 | } 9 | 10 | function mapFunc(doc, callback){ 11 | if(doc.headers && doc.headers.From && doc.headers.To){ 12 | var from_field = doc['headers']['From'] 13 | var to_field = doc['headers']['To'] 14 | var recips = [] 15 | to_field.split(',').forEach(function(to){ 16 | callback( {'_id': {'f':from_field, 't':trimString(to)}, 'count': 1} ) 17 | }); 18 | } 19 | } 20 | 21 | node_mongo_hadoop.MapBSONStream(mapFunc); 22 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONMapper 7 | 8 | def mapper(documents): 9 | i = 0 10 | for doc in documents: 11 | i = i + 1 12 | if 'headers' in doc and 'To' in doc['headers'] and 'From' in doc['headers']: 13 | from_field = doc['headers']['From'] 14 | to_field = doc['headers']['To'] 15 | recips = [x.strip() for x in to_field.split(',')] 16 | for r in recips: 17 | yield {'_id': {'f':from_field, 't':r}, 'count': 1} 18 | 19 | BSONMapper(mapper) 20 | print >> sys.stderr, "Done Mapping." 
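# For example, a message whose headers include
#   {'From': 'a@example.com', 'To': 'b@example.com, c@example.com'}
# yields two intermediate documents (addresses are illustrative):
#   {'_id': {'f': 'a@example.com', 't': 'b@example.com'}, 'count': 1}
#   {'_id': {'f': 'a@example.com', 't': 'c@example.com'}, 'count': 1}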
21 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_map.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.map do |document| 5 | if document.has_key?('headers') 6 | headers = document['headers'] 7 | if ['To', 'From'].all? { |header| headers.has_key? (header) } 8 | to_field = headers['To'] 9 | from_field = headers['From'] 10 | recipients = to_field.split(',').map { |recipient| recipient.strip } 11 | recipients.map { |recipient| {:_id => {:f => from_field, :t => recipient}, :count => 1} } 12 | end 13 | end 14 | end 15 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_reduce.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | var node_mongo_hadoop = require('node_mongo_hadoop') 4 | 5 | function reduceFunc(key, values, callback){ 6 | var count = 0; 7 | values.forEach(function(v){ 8 | count += v.count 9 | }); 10 | callback( {'_id':key, 'count':count } ); 11 | } 12 | 13 | node_mongo_hadoop.ReduceBSONStream(reduceFunc); 14 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONReducer 7 | 8 | def reducer(key, values): 9 | print >> sys.stderr, "Processing from/to %s" % str(key) 10 | _count = 0 11 | for v in values: 12 | _count += v['count'] 13 | return {'_id': key, 'count': _count} 14 | 15 | BSONReducer(reducer) 16 | -------------------------------------------------------------------------------- /streaming/examples/enron/enron_reduce.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.reduce do |key, values| 5 | count = values.reduce { |sum, current| sum += current['count'] } 6 | 7 | { :_id => key, :count => count } 8 | end 9 | -------------------------------------------------------------------------------- /streaming/examples/enron/run_enron.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/enron/enron_map.py -reducer examples/enron/enron_reduce.py -inputURI mongodb://127.0.0.1/enron_mail.messages -outputURI mongodb://127.0.0.1/enron_mail.output -file examples/enron/enron_map.py -file examples/enron/enron_reduce.py 2 | -------------------------------------------------------------------------------- /streaming/examples/enron/run_enron_js.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/enron/enron_map.js -reducer examples/enron/enron_reduce.js -inputURI mongodb://127.0.0.1/enron_mail.messages -outputURI mongodb://127.0.0.1/enron_mail.output -file examples/enron/enron_map.js -file examples/enron/enron_reduce.js 2 | -------------------------------------------------------------------------------- /streaming/examples/enron/run_enron_rb.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/enron/enron_map.rb -reducer 
examples/enron/enron_reduce.rb -inputURI mongodb://127.0.0.1/enron_mail.messages -outputURI mongodb://127.0.0.1/enron_mail.output -file examples/enron/enron_map.rb -file examples/enron/enron_reduce.rb 2 | -------------------------------------------------------------------------------- /streaming/examples/treasury/mapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | sys.path.append(".") 6 | 7 | try: 8 | from pymongo_hadoop import BSONMapper 9 | import pymongo_hadoop 10 | except: 11 | print >> sys.stderr, "pymongo_hadoop is not installed or in path - will try to import from source tree." 12 | here = os.path.abspath(__file__) 13 | module_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(here))), 14 | 'language_support', 15 | 'python') 16 | sys.path.append(module_dir) 17 | print >> sys.stderr, sys.path 18 | from pymongo_hadoop import BSONMapper 19 | 20 | def mapper(documents): 21 | print >> sys.stderr, "Running python mapper." 22 | 23 | for doc in documents: 24 | yield {'_id': doc['_id'].year, 'bc10Year': doc['bc10Year']} 25 | 26 | print >> sys.stderr, "Python mapper finished." 27 | 28 | BSONMapper(mapper) 29 | -------------------------------------------------------------------------------- /streaming/examples/treasury/mapper.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.map do |document| 5 | { :_id => document['_id'].year, :bc10Year => document['bc10Year'] } 6 | end 7 | -------------------------------------------------------------------------------- /streaming/examples/treasury/mapper_kv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import KeyValueBSONMapper 7 | 8 | def mapper(entries): 9 | for (k, v) in entries: 10 | yield (k.year, v['bc10Year']) 11 | 12 | KeyValueBSONMapper(mapper) 13 | print >> sys.stderr, "Done Mapping."
14 | -------------------------------------------------------------------------------- /streaming/examples/treasury/mapper_kv.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.kvmap do |key, value| 5 | [key.year, value['bc10Year']] 6 | end 7 | -------------------------------------------------------------------------------- /streaming/examples/treasury/reducer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import os 5 | sys.path.append(".") 6 | 7 | try: 8 | from pymongo_hadoop import BSONReducer 9 | import pymongo_hadoop 10 | except: 11 | here = os.path.abspath(__file__) 12 | module_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(here))), 13 | 'language_support', 14 | 'python') 15 | sys.path.append(module_dir) 16 | from pymongo_hadoop import BSONReducer 17 | 18 | def reducer(key, values): 19 | print >> sys.stderr, "Processing Key: %s" % key 20 | _count = _sum = 0 21 | for v in values: 22 | _count += 1 23 | _sum += v['bc10Year'] 24 | return {'_id': key, 'avg': _sum / _count, 25 | 'count': _count, 'sum': _sum } 26 | 27 | BSONReducer(reducer) 28 | -------------------------------------------------------------------------------- /streaming/examples/treasury/reducer.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.reduce do |key, values| 5 | count = sum = 0 6 | 7 | values.each do |value| 8 | count += 1 9 | sum += value['bc10Year'] 10 | end 11 | 12 | { :_id => key, :average => sum / count } 13 | end 14 | -------------------------------------------------------------------------------- /streaming/examples/treasury/reducer_kv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import KeyValueBSONReducer, KeyValueBSONInput 7 | 8 | def reducer(key, values): 9 | print >> sys.stderr, "Processing Key: %s" % key 10 | _count = _sum = 0 11 | for v in values: 12 | _count += 1 13 | _sum += v['value'] 14 | return (key, _sum / _count) 15 | 16 | 17 | KeyValueBSONReducer(reducer, input_fh=KeyValueBSONInput()) 18 | -------------------------------------------------------------------------------- /streaming/examples/treasury/reducer_kv.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.kvreduce do |key, values| 5 | count = sum = 0 6 | 7 | values.each do |value| 8 | count += 1 9 | sum += value['value'] 10 | end 11 | 12 | [key, sum / count] 13 | end 14 | -------------------------------------------------------------------------------- /streaming/examples/treasury/run_treas_kv_py.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/treasury/mapper_kv.py -reducer examples/treasury/reducer_kv.py -inputformat com.mongodb.hadoop.mapred.MongoInputFormat -outputformat com.mongodb.hadoop.mapred.MongoOutputFormat -inputURI mongodb://127.0.0.1/demo.yield_historical.in -outputURI mongodb://127.0.0.1/demo.yield_historical.streaming.kv.out -------------------------------------------------------------------------------- /streaming/examples/treasury/run_treas_kv_rb.sh: 
-------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/treasury/mapper_kv.rb -reducer examples/treasury/reducer_kv.rb -inputformat com.mongodb.hadoop.mapred.MongoInputFormat -outputformat com.mongodb.hadoop.mapred.MongoOutputFormat -inputURI mongodb://127.0.0.1/demo.yield_historical.in -outputURI mongodb://127.0.0.1/demo.yield_historical.streaming.kv.out 2 | -------------------------------------------------------------------------------- /streaming/examples/treasury/run_treas_py.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/treasury/mapper.py -reducer examples/treasury/reducer.py -inputformat com.mongodb.hadoop.mapred.MongoInputFormat -outputformat com.mongodb.hadoop.mapred.MongoOutputFormat -inputURI mongodb://127.0.0.1/demo.yield_historical.in -outputURI mongodb://127.0.0.1/demo.yield_historical.streaming.out 2 | -------------------------------------------------------------------------------- /streaming/examples/treasury/run_treas_rb.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/treasury/mapper.rb -reducer examples/treasury/reducer.rb -inputURI mongodb://127.0.0.1/demo.yield_historical.in -outputURI mongodb://127.0.0.1/demo.yield_historical.streaming.out 2 | -------------------------------------------------------------------------------- /streaming/examples/twitter/README.md: -------------------------------------------------------------------------------- 1 | Importing Live Twitter Data, you'll need a twitter login and password: 2 | 3 | curl https://stream.twitter.com/1/statuses/sample.json -u: | mongoimport -d test -c live 4 | 5 | This will continue streaming until you ^C it. 6 | -------------------------------------------------------------------------------- /streaming/examples/twitter/run_twit_py.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/twitter/twit_map.py -reducer examples/twitter/twit_reduce.py -inputURI mongodb://127.0.0.1/test.live -outputURI mongodb://127.0.0.1/test.twit_reduction -file examples/twitter/twit_map.py -file examples/twitter/twit_reduce.py 2 | -------------------------------------------------------------------------------- /streaming/examples/twitter/run_twit_rb.sh: -------------------------------------------------------------------------------- 1 | hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/twitter/twit_map.rb -reducer examples/twitter/twit_reduce.rb -inputURI mongodb://127.0.0.1/test.live -outputURI mongodb://127.0.0.1/test.twit_reduction 2 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_hashtag_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONMapper 7 | 8 | def mapper(documents): 9 | for doc in documents: 10 | for hashtag in doc['entities']['hashtags']: 11 | yield {'_id': hashtag['text'], 'count': 1} 12 | 13 | BSONMapper(mapper) 14 | print >> sys.stderr, "Done Mapping." 
15 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_hashtag_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONReducer 7 | 8 | def reducer(key, values): 9 | print >> sys.stderr, "Processing Hashtag %s" % key.encode('utf8') 10 | _count = 0 11 | for v in values: 12 | _count += v['count'] 13 | return {'_id': key.encode('utf8'), 'count': _count} 14 | 15 | BSONReducer(reducer) 16 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_map.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONMapper 7 | 8 | def mapper(documents): 9 | for doc in documents: 10 | if 'user' in doc: 11 | yield {'_id': doc['user']['time_zone'], 'count': 1} 12 | 13 | BSONMapper(mapper) 14 | print >> sys.stderr, "Done Mapping." 15 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_map.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop.map do |document| 5 | { :_id => document['user']['time_zone'], :count => 1 } 6 | end 7 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_reduce.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | sys.path.append(".") 5 | 6 | from pymongo_hadoop import BSONReducer 7 | 8 | def reducer(key, values): 9 | print >> sys.stderr, "Processing Timezone %s" % key 10 | _count = 0 11 | for v in values: 12 | _count += v['count'] 13 | return {'_id': key, 'count': _count} 14 | 15 | BSONReducer(reducer) 16 | -------------------------------------------------------------------------------- /streaming/examples/twitter/twit_reduce.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | # Function that takes key and array of values, iterates over all of the values, 5 | # and returns a single document with the reduced data (summary) for that key. 
6 | 7 | MongoHadoop.reduce do |key, values| 8 | count = 0 9 | 10 | values.each do |value| 11 | count += value['count'] 12 | end 13 | 14 | { :_id => key, :count => count } 15 | end 16 | -------------------------------------------------------------------------------- /streaming/language_support/js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "author": "Mike O'Brien (http://mpobrien.net)", 3 | "name": "node_mongo_hadoop", 4 | "description": "Bindings to connect to the MongoDB adapter for writing Map/Reduce jobs in Javascript with Hadoop Streaming.", 5 | "version": "0.0.2", 6 | "homepage": "api.mongodb.org/hadoop", 7 | "repository": { 8 | "type": "git", 9 | "url": "git@github.com:mpobrien/node_mongo_hadoop.git" 10 | }, 11 | "main": "./node_mongo_hadoop", 12 | "dependencies": { 13 | "mongodb": "*", 14 | "buffers": "*", 15 | "underscore": "*" 16 | }, 17 | "devDependencies": {}, 18 | "optionalDependencies": {}, 19 | "engines": { 20 | "node": "*" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /streaming/language_support/python/README.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | pymongo_hadoop 3 | ============== 4 | :Info: See `documentation `_ for more information. See `github `_ for the latest source. 5 | :Author: Brendan McAdams 6 | :Maintainer: Mike O'Brien 7 | 8 | About 9 | ===== 10 | 11 | The pymongo_hadoop module contains basic classes for using python 12 | scripts for Hadoop Streaming jobs with the mongo-hadoop adapter. 13 | 14 | Issues / Questions / Feedback 15 | ============================= 16 | 17 | Any issues with, questions about, or feedback for PyMongo should be 18 | sent to the mongodb-user list on Google Groups. For confirmed issues 19 | or feature requests, open a case on `jira 20 | `_. Please do not e-mail any of the 21 | developers directly with issues or questions - you're more likely to 22 | get an answer on the list. 23 | 24 | Installation 25 | ============ 26 | 27 | If you have `setuptools 28 | `_ installed you 29 | should be able to do **easy_install pymongo_hadoop** to install 30 | the module. Otherwise you can download the project source and do **python 31 | setup.py install** to install. 
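Example
=======

A minimal mapper sketch to show the shape of a streaming script (the
``status`` field below is hypothetical; substitute any field from your
own collection)::

    #!/usr/bin/env python
    import sys

    from pymongo_hadoop import BSONMapper

    def mapper(documents):
        # Emit one counting document per input document; the streaming
        # framework groups records on the '_id' field.
        for doc in documents:
            yield {'_id': doc.get('status', 'unknown'), 'count': 1}

    BSONMapper(mapper)
    print >> sys.stderr, "Done Mapping."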
32 | 33 | -------------------------------------------------------------------------------- /streaming/language_support/python/pymongo_hadoop/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from input import BSONInput, KeyValueBSONInput 4 | from output import BSONOutput, KeyValueBSONOutput 5 | from reducer import BSONReducer, BSONReducerInput 6 | from reducer import KeyValueBSONReducer, KeyValueBSONReducerInput 7 | from mapper import BSONMapper, KeyValueBSONMapper 8 | 9 | __all__ = ['BSONInput', 'BSONOutput', 10 | 'KeyValueBSONOutput', 'KeyValueBSONInput', 11 | 'BSONReducerInput', 'BSONReducer', 12 | 'KeyValueBSONReducer', 'KeyValueBSONReducerInput'] 13 | 14 | def dump_bits(bits): 15 | for bit in bits: 16 | print >> sys.stderr, "\t * Bit: %s Ord: %d" % (hex(ord(bit)), ord(bit)) 17 | 18 | -------------------------------------------------------------------------------- /streaming/language_support/python/pymongo_hadoop/input.py: -------------------------------------------------------------------------------- 1 | from bson import InvalidBSON, BSON 2 | from bson.codec_options import CodecOptions 3 | 4 | import sys 5 | import struct 6 | 7 | STREAMING_CODEC_OPTIONS = CodecOptions(tz_aware=True) 8 | 9 | 10 | class BSONInput(object): 11 | """Custom file class for decoding streaming BSON, 12 | based upon the Dumbo & "typedbytes" modules at 13 | https://github.com/klbostee/dumbo & 14 | https://github.com/klbostee/typedbytes 15 | """ 16 | 17 | def __init__(self, fh=sys.stdin, unicode_errors='strict'): 18 | self.fh = fh 19 | self.unicode_errors = unicode_errors 20 | self.eof = False 21 | 22 | def _read(self): 23 | try: 24 | size_bits = self.fh.read(4) 25 | size = struct.unpack("<i", size_bits)[0] 26 | data = size_bits + self.fh.read(size - 4) 27 | if len(data) != size: 28 | raise InvalidBSON("Expected %d bytes, read %d " 29 | "(truncated BSON stream?)" % (size, len(data))) 30 | doc = BSON(data).decode(codec_options=STREAMING_CODEC_OPTIONS) 31 | return doc 32 | except (struct.error, InvalidBSON), e: 33 | # A bad length header or truncated document ends the stream. 34 | print >> sys.stderr, "Parsing Length record failed: %s" % e 35 | self.eof = True 36 | raise StopIteration(e) 37 | 38 | def read(self): 39 | try: 40 | return self._read() 41 | except StopIteration, e: 42 | print >> sys.stderr, "Iteration Failure: %s" % e 43 | return None 44 | 45 | def _reads(self): 46 | r = self._read 47 | while 1: 48 | yield r() 49 | 50 | def close(self): 51 | self.fh.close() 52 | 53 | __iter__ = reads = _reads 54 | 55 | class KeyValueBSONInput(BSONInput): 56 | def read(self): 57 | try: 58 | doc = self._read() 59 | except StopIteration, e: 60 | print >> sys.stderr, "Key/Value Input iteration failed/stopped: %s" % e 61 | return None 62 | if '_id' in doc: 63 | return doc['_id'], doc 64 | else: 65 | raise struct.error("Cannot read Key '_id' from Input Doc '%s'" % doc) 66 | 67 | def reads(self): 68 | it = self._reads() 69 | n = it.next 70 | while 1: 71 | doc = n() 72 | if '_id' in doc: 73 | yield doc['_id'], doc 74 | else: 75 | raise struct.error("Cannot read Key '_id' from Input Doc '%s'" % doc) 76 | 77 | __iter__ = reads 78 | -------------------------------------------------------------------------------- /streaming/language_support/python/pymongo_hadoop/mapper.py: -------------------------------------------------------------------------------- 1 | from input import BSONInput, KeyValueBSONInput 2 | from output import BSONOutput, KeyValueBSONOutput 3 | 4 | class BSONMapper(object): 5 | """Wraps BSONInput to allow writing mapper functions 6 | as generators. 7 | """ 8 | 9 | def __init__(self, target, **kwargs): 10 | """`target` should be a generator function that accepts a 11 | single argument which will be an instance of :class:`BSONInput`, 12 | and which yields dictionaries to be emitted.
The yielded 13 | dictionaries should conform to the format expected by 14 | :class:`BSONInput` (i.e. they should have the key defined 15 | in a field named `_id`). 16 | 17 | Keyword arguments are passed directly to the underlying 18 | :class:`BSONInput`. 19 | """ 20 | 21 | output = BSONOutput() 22 | input = BSONInput(**kwargs) 23 | 24 | generator = target(input) 25 | for mapped in generator: 26 | output.write(mapped) 27 | 28 | class KeyValueBSONMapper(object): 29 | """Wraps KeyValueBSONInput to allow writing mapper functions 30 | as generators. 31 | """ 32 | 33 | def __init__(self, target, **kwargs): 34 | """`target` should be a generator function that accepts a 35 | single argument which will be an instance of 36 | :class:`KeyValueBSONInput`, and which yields tuples of 37 | (key, value) to be emitted. 38 | 39 | Keyword arguments are passed directly to the underlying 40 | :class:`KeyValueBSONInput`. 41 | """ 42 | 43 | output = KeyValueBSONOutput() 44 | input = KeyValueBSONInput(**kwargs) 45 | 46 | generator = target(input) 47 | for key_and_value in generator: 48 | output.write(key_and_value) 49 | 50 | -------------------------------------------------------------------------------- /streaming/language_support/python/setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup, Feature 3 | except ImportError: 4 | from distribute_setup import use_setuptools 5 | use_setuptools() 6 | from setuptools import setup, Feature 7 | 8 | f = open("README.rst") 9 | try: 10 | try: 11 | readme_content = f.read() 12 | except: 13 | readme_content = "" 14 | finally: 15 | f.close() 16 | 17 | 18 | setup( 19 | name='pymongo_hadoop', 20 | version='1.1.0', 21 | maintainer="Michael O'Brien", 22 | maintainer_email='mikeo@10gen.com', 23 | long_description=readme_content, 24 | packages=['pymongo_hadoop'], 25 | url='https://github.com/mongodb/mongo-hadoop', 26 | keywords=["mongo", "mongodb", "hadoop", "hdfs", "streaming"], 27 | install_requires=[ 28 | 'pymongo' 29 | ], 30 | ) 31 | -------------------------------------------------------------------------------- /streaming/language_support/python/test_install.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | try: 4 | import pymongo 5 | from bson import _elements_to_dict, InvalidBSON 6 | except: 7 | raise Exception("Cannot find a valid pymongo installation.") 8 | 9 | try: 10 | from pymongo_hadoop import BSONInput 11 | except: 12 | raise Exception("Cannot find a valid pymongo_hadoop installation.") 13 | 14 | print "*** Everything looks OK. All required modules were found." 
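# Usage sketch for the two mapper wrappers documented above (illustrative
# only; the 'category' field is a hypothetical example, not something this
# package requires). BSONMapper drives a generator that yields whole
# documents keyed by '_id'; KeyValueBSONMapper drives one that yields
# (key, value) tuples.
from pymongo_hadoop import BSONMapper, KeyValueBSONMapper

def doc_mapper(documents):
    # BSONMapper target: one dict per emitted record, with the key in '_id'.
    for doc in documents:
        yield {'_id': doc.get('category'), 'count': 1}

def kv_mapper(entries):
    # KeyValueBSONMapper target: (key, value) tuples instead of dicts.
    for key, value in entries:
        yield (key, value.get('count', 1))

# In a real job script, hand exactly one target to its matching wrapper:
#   BSONMapper(doc_mapper)   or   KeyValueBSONMapper(kv_mapper)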
15 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/bin/mongo-hadoop: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require "thor" 3 | 4 | class MongoHadoop < Thor 5 | include Thor::Actions 6 | 7 | def self.source_root 8 | File.dirname(__FILE__) 9 | end 10 | 11 | desc "create PROJECT_NAME", "Create a new Mongo Hadoop project" 12 | method_option :assembly, :type => :string, :default => "mongo-hadoop-streaming-assembly*.jar" 13 | method_option :uri, :type => :string, :aliases => "-h", :default => "mongodb://127.0.0.1" 14 | method_option :database, :type => :string, :aliases => "-d", :default => "mongo_hadoop" 15 | method_option :in, :type => :string, :aliases => "-i", :default => "project.in" 16 | method_option :out, :type => :string, :aliases => "-o", :default => "project.out" 17 | 18 | def create(name) 19 | @name = name 20 | @streaming_assembly = options[:assembly] 21 | 22 | base_uri = options[:uri] 23 | db = options[:database] 24 | @input_uri = "#{base_uri}/#{db}.#{options[:in]}" 25 | @output_uri = "#{base_uri}/#{db}.#{options[:out]}" 26 | 27 | create_mapper 28 | create_reducer 29 | create_runner 30 | end 31 | 32 | private 33 | 34 | def create_mapper 35 | template '../templates/mapper.tt', "#{@name}/mapper.rb" 36 | chmod "#{@name}/mapper.rb", 0766, :verbose => false 37 | end 38 | 39 | def create_reducer 40 | template '../templates/reducer.tt', "#{@name}/reducer.rb" 41 | chmod "#{@name}/reducer.rb", 0766, :verbose => false 42 | end 43 | 44 | def create_runner 45 | template '../templates/runner.tt', "#{@name}/run.sh" 46 | chmod "#{@name}/run.sh", 0766, :verbose => false 47 | end 48 | end 49 | 50 | MongoHadoop.start -------------------------------------------------------------------------------- /streaming/language_support/ruby/lib/mongo-hadoop.rb: -------------------------------------------------------------------------------- 1 | require 'mongo-hadoop/mapper' 2 | require 'mongo-hadoop/reducer' -------------------------------------------------------------------------------- /streaming/language_support/ruby/lib/mongo-hadoop/input.rb: -------------------------------------------------------------------------------- 1 | require 'bson' 2 | 3 | class BSONInput 4 | include Enumerable 5 | 6 | def initialize(stream=nil) 7 | @stream = stream || $stdin 8 | end 9 | 10 | def read 11 | begin 12 | BSON.read_bson_document(@stream) 13 | rescue NoMethodError 14 | nil 15 | end 16 | end 17 | 18 | def each 19 | while(doc = read) 20 | yield doc 21 | end 22 | end 23 | end 24 | 25 | class BSONKeyValueInput < BSONInput 26 | def each 27 | while(doc = read) 28 | yield doc['_id'], doc 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/lib/mongo-hadoop/mapper.rb: -------------------------------------------------------------------------------- 1 | require 'mongo-hadoop/input' 2 | require 'mongo-hadoop/output' 3 | 4 | module MongoHadoop 5 | def map 6 | input = BSONInput.new 7 | output = BSONOutput.new 8 | 9 | input.each do |doc| 10 | mapped = yield doc 11 | mapped = [mapped] unless mapped.is_a?(Array) 12 | 13 | mapped.each do |mapped| 14 | output.write mapped if mapped 15 | end 16 | end 17 | end 18 | 19 | def kvmap 20 | kvinput = BSONKeyValueInput.new 21 | kvoutput = BSONKeyValueOutput.new 22 | 23 | kvinput.each do |key, value| 24 | mapped = yield key, value 25 | mapped = [mapped] unless 
mapped.is_a(Array) 26 | 27 | mapped.each do |mapped| 28 | kvoutput.write mapped if mapped 29 | end 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/lib/mongo-hadoop/output.rb: -------------------------------------------------------------------------------- 1 | require 'bson' 2 | 3 | class BSONOutput 4 | def initialize(stream=nil) 5 | @stream = stream || $stdout 6 | end 7 | 8 | def write(doc) 9 | bson_doc = BSON.serialize(doc) 10 | @stream.write(bson_doc) 11 | @stream.flush 12 | end 13 | end 14 | 15 | class BSONKeyValueOutput < BSONOutput 16 | def write(pair) 17 | key, value = *pair 18 | 19 | doc = value.is_a?(Hash) ? value : { :value => value } 20 | 21 | doc['_id'] = key 22 | super(doc) 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/lib/mongo-hadoop/reducer.rb: -------------------------------------------------------------------------------- 1 | require 'mongo-hadoop/input' 2 | require 'mongo-hadoop/output' 3 | 4 | module MongoHadoop 5 | def reduce 6 | input = BSONInput.new 7 | output = BSONOutput.new 8 | 9 | grouped = input.group_by { |doc| doc['_id'] } 10 | 11 | grouped.each do |key, values| 12 | output.write yield key, values 13 | end 14 | end 15 | 16 | def kvreduce 17 | kvinput = BSONKeyValueInput.new 18 | kvoutput = BSONKeyValueOutput.new 19 | 20 | grouped = kvinput.inject(Hash.new) do |hash, pair| 21 | key, value = *pair 22 | hash[key] ||= [] 23 | hash[key] << value 24 | hash 25 | end 26 | 27 | grouped.each do |key, values| 28 | kvoutput.write yield key, values 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/mongo-hadoop.gemspec: -------------------------------------------------------------------------------- 1 | Gem::Specification.new do |s| 2 | s.name = 'mongo-hadoop' 3 | s.version = '1.0.0' 4 | s.date = '2012-05-20' 5 | s.summary = "MongoDB Hadoop streaming support" 6 | s.description = "Ruby MongoDB Hadoop streaming support" 7 | s.authors = ["Tyler Brock"] 8 | s.email = 'tyler.brock@gmail.com' 9 | s.files = [ 10 | "bin/mongo-hadoop", 11 | "lib/mongo-hadoop/input.rb", 12 | "lib/mongo-hadoop/output.rb", 13 | "lib/mongo-hadoop/mapper.rb", 14 | "lib/mongo-hadoop/reducer.rb" 15 | ] 16 | s.executables = ['mongo-hadoop'] 17 | s.homepage = 'http://github.com/mongodb/mongo-hadoop' 18 | s.add_dependency 'bson' 19 | s.add_dependency 'thor' 20 | end 21 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/templates/mapper.tt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop::map do |document| 5 | { :_id => document['_id'] } 6 | end 7 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/templates/reducer.tt: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | require 'mongo-hadoop' 3 | 4 | MongoHadoop::reduce do |key, values| 5 | { :_id => key, :count => values.size } 6 | end 7 | -------------------------------------------------------------------------------- /streaming/language_support/ruby/templates/runner.tt: -------------------------------------------------------------------------------- 1 | hadoop jar <%= @streaming_assembly %> \ 
2 | -mapper ./mapper.rb \ 3 | -reducer ./reducer.rb \ 4 | -inputURI <%= @input_uri %> \ 5 | -outputURI <%= @output_uri %> \ 6 | -inputformat com.mongodb.hadoop.mapred.MongoInputFormat \ 7 | -outputformat com.mongodb.hadoop.mapred.MongoOutputFormat \ 8 | -------------------------------------------------------------------------------- /streaming/src/main/java/com/mongodb/hadoop/streaming/MongoOutput.java: -------------------------------------------------------------------------------- 1 | // MongoOutput.java 2 | /* 3 | * Copyright 2010 10gen Inc. 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.mongodb.hadoop.streaming; 19 | 20 | import com.mongodb.DBObject; 21 | 22 | public interface MongoOutput { 23 | void appendAsKey(DBObject o); 24 | 25 | void appendAsValue(DBObject o); 26 | } 27 | -------------------------------------------------------------------------------- /streaming/src/main/java/com/mongodb/hadoop/streaming/io/MongoIdentifierResolver.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.streaming.io; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import com.mongodb.hadoop.io.MongoUpdateWritable; 5 | import org.apache.hadoop.streaming.io.IdentifierResolver; 6 | 7 | public class MongoIdentifierResolver extends IdentifierResolver { 8 | public static final String MONGODB_ID = "mongodb"; 9 | public static final String MONGO_ID = "mongo"; 10 | public static final String BSON_ID = "bson"; 11 | public static final String MONGODB_UPDATE = "mongoUpdate"; 12 | 13 | @Override 14 | public void resolve(final String identifier) { 15 | if (identifier.equalsIgnoreCase(MONGODB_ID) 16 | || identifier.equalsIgnoreCase(MONGO_ID) 17 | || identifier.equalsIgnoreCase(BSON_ID)) { 18 | setInputWriterClass(MongoInputWriter.class); 19 | setOutputReaderClass(MongoOutputReader.class); 20 | setOutputKeyClass(BSONWritable.class); 21 | setOutputValueClass(BSONWritable.class); 22 | } else if (identifier.equalsIgnoreCase(MONGODB_UPDATE)) { 23 | setInputWriterClass(MongoUpdateInputWriter.class); 24 | setOutputReaderClass(MongoUpdateOutputReader.class); 25 | setOutputKeyClass(BSONWritable.class); 26 | setOutputValueClass(MongoUpdateWritable.class); 27 | } else { 28 | super.resolve(identifier); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /streaming/src/main/java/com/mongodb/hadoop/streaming/io/MongoInputWriter.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.streaming.io; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import org.apache.hadoop.streaming.PipeMapRed; 5 | import org.apache.hadoop.streaming.io.InputWriter; 6 | 7 | import java.io.DataOutput; 8 | import java.io.IOException; 9 | 10 | public class MongoInputWriter extends InputWriter { 11 | 12 | private DataOutput out; 13 | 14 | @Override 15 | public void initialize(final PipeMapRed 
pipeMapRed) throws IOException { 16 | super.initialize(pipeMapRed); 17 | out = pipeMapRed.getClientOutput(); 18 | } 19 | 20 | @Override 21 | public void writeKey(final Object key) throws IOException { 22 | // We skip the key COMPLETELY as it's just a copy of _id 23 | // and readable by the BSON implementation 24 | } 25 | 26 | @Override 27 | public void writeValue(final BSONWritable value) throws IOException { 28 | value.write(out); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /streaming/src/main/java/com/mongodb/hadoop/streaming/io/MongoOutputReader.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.streaming.io; 2 | 3 | import com.mongodb.BasicDBObject; 4 | import com.mongodb.hadoop.io.BSONWritable; 5 | import org.apache.commons.logging.Log; 6 | import org.apache.commons.logging.LogFactory; 7 | import org.apache.hadoop.streaming.PipeMapRed; 8 | import org.apache.hadoop.streaming.io.OutputReader; 9 | 10 | import java.io.DataInput; 11 | import java.io.IOException; 12 | 13 | public class MongoOutputReader extends OutputReader { 14 | 15 | private DataInput in; 16 | private static final Log LOG = LogFactory.getLog(MongoOutputReader.class); 17 | private BSONWritable currentKey; 18 | private BSONWritable currentValue; 19 | 20 | @Override 21 | public void initialize(final PipeMapRed pipeMapRed) throws IOException { 22 | super.initialize(pipeMapRed); 23 | in = pipeMapRed.getClientInput(); 24 | this.currentKey = new BSONWritable(); 25 | this.currentValue = new BSONWritable(); 26 | } 27 | 28 | @Override 29 | public boolean readKeyValue() throws IOException { 30 | // Actually, just read the value as the key is embedded. 31 | try { 32 | currentValue.readFields(in); 33 | Object id = currentValue.getDoc().get("_id"); 34 | currentKey.setDoc(new BasicDBObject("_id", id)); 35 | // If successful we'll have an _id field 36 | return id != null; 37 | } catch (IndexOutOfBoundsException e) { 38 | // No more data 39 | LOG.info("No more data; no key/value pair read."); 40 | return false; 41 | } 42 | } 43 | 44 | @Override 45 | public BSONWritable getCurrentKey() throws IOException { 46 | return currentKey; 47 | } 48 | 49 | @Override 50 | public BSONWritable getCurrentValue() throws IOException { 51 | return currentValue; 52 | } 53 | 54 | @Override 55 | public String getLastOutput() { 56 | return currentValue.toString(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /streaming/src/main/java/com/mongodb/hadoop/streaming/io/MongoUpdateInputWriter.java: -------------------------------------------------------------------------------- 1 | package com.mongodb.hadoop.streaming.io; 2 | 3 | import com.mongodb.hadoop.io.BSONWritable; 4 | import com.mongodb.hadoop.io.MongoUpdateWritable; 5 | import org.apache.hadoop.io.Writable; 6 | import org.apache.hadoop.streaming.PipeMapRed; 7 | import org.apache.hadoop.streaming.io.InputWriter; 8 | 9 | import java.io.DataOutput; 10 | import java.io.IOException; 11 | 12 | /** 13 | * InputWriter capable of handling both BSONWritable and MongoUpdateWritable 14 | * as value types. 
15 | */ 16 | public class MongoUpdateInputWriter extends InputWriter { 17 | 18 | private DataOutput output; 19 | private final BSONWritable bsonWritable = new BSONWritable(); 20 | 21 | @Override 22 | public void initialize(final PipeMapRed pipeMapRed) throws IOException { 23 | super.initialize(pipeMapRed); 24 | output = pipeMapRed.getClientOutput(); 25 | } 26 | 27 | @Override 28 | public void writeKey(final Object key) throws IOException { 29 | // Nothing to do. 30 | } 31 | 32 | @Override 33 | public void writeValue(final Writable value) throws IOException { 34 | if (value instanceof MongoUpdateWritable) { 35 | // If we're writing to the input of a streaming script, just send 36 | // back the "query" portion of the MongoUpdateWritable, so that 37 | // mapper and reducer scripts can operate on a single document. 38 | bsonWritable.setDoc(((MongoUpdateWritable) value).getQuery()); 39 | bsonWritable.write(output); 40 | } else if (value instanceof BSONWritable) { 41 | value.write(output); 42 | } else { 43 | throw new IOException("Unexpected Writable type :" + value); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /test.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | alias g="./gradlew --daemon" 4 | 5 | OPTS=test 6 | 7 | while [ "$1" ] 8 | do 9 | case $1 in 10 | "examples") 11 | OPTS="historicalYield sensorData enronEmails" 12 | ;; 13 | "all") 14 | HV="all" 15 | ;; 16 | esac 17 | shift 18 | done 19 | 20 | echo Running \"$OPTS\" 21 | 22 | function browser() { 23 | while [ "$1" ] 24 | do 25 | [ -f $1 ] && open $1 26 | shift 27 | done 28 | } 29 | 30 | function run() { 31 | g clean jar testJar $OPTS --stacktrace 2>&1 | tee -a build/test.out 32 | 33 | 34 | for i in "*/build/reports/tests/index.html" 35 | do 36 | if [ "`grep -i failed $i 2> /dev/null`" ] 37 | then 38 | echo "********** Found failing tests. Exiting." 39 | browser $i 40 | FAILED=true 41 | fi 42 | 43 | if [ $FAILED ] 44 | then 45 | exit 46 | fi 47 | done 48 | } 49 | 50 | run 51 | --------------------------------------------------------------------------------