├── .classpath
├── .gitignore
├── .project
├── CONTRIBUTING.md
├── License
├── README.md
├── pom.xml
├── samples
│   ├── Hive_Tutorial.hql
│   ├── MapReduceTutorial.java
│   └── Pig_Tutorial.pig
└── src
    ├── BulkImportScript.js
    └── com
        └── microsoft
            └── azure
                └── documentdb
                    ├── hadoop
                    │   ├── BackoffExponentialRetryPolicy.java
                    │   ├── ConfigurationUtil.java
                    │   ├── DocumentDBConnectorUtil.java
                    │   ├── DocumentDBInputFormat.java
                    │   ├── DocumentDBInputSplit.java
                    │   ├── DocumentDBOutputCommitter.java
                    │   ├── DocumentDBOutputFormat.java
                    │   ├── DocumentDBRecordReader.java
                    │   ├── DocumentDBRecordWriter.java
                    │   ├── DocumentDBWritable.java
                    │   └── DocumentDBWritableComparator.java
                    ├── hive
                    │   ├── DocumentDBSerDe.java
                    │   └── DocumentDBStorageHandler.java
                    ├── mapred
                    │   └── hadoop
                    │       ├── DocumentDBInputFormat.java
                    │       ├── DocumentDBOutputFormat.java
                    │       ├── DocumentDBRecordReader.java
                    │       ├── DocumentDBRecordWriter.java
                    │       └── WrapperSplit.java
                    └── pig
                        ├── DocumentDBLoader.java
                        ├── DocumentDBStorage.java
                        └── SchemaHelper.java
/.classpath:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | /build/
3 | target/
4 |
5 | # Mobile Tools for Java (J2ME)
6 | .mtj.tmp/
7 |
8 | # Package Files #
9 | *.jar
10 | *.war
11 | *.ear
12 |
13 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
14 | hs_err_pid*
15 |
16 | # Eclipse
17 | .settings/
18 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 |   <name>DocumentDBHadoop</name>
 4 |   <comment></comment>
 5 |   <projects>
 6 |   </projects>
 7 |   <buildSpec>
 8 |     <buildCommand>
 9 |       <name>org.eclipse.jdt.core.javabuilder</name>
10 |       <arguments>
11 |       </arguments>
12 |     </buildCommand>
13 |     <buildCommand>
14 |       <name>org.eclipse.m2e.core.maven2Builder</name>
15 |       <arguments>
16 |       </arguments>
17 |     </buildCommand>
18 |   </buildSpec>
19 |   <natures>
20 |     <nature>org.eclipse.m2e.core.maven2Nature</nature>
21 |     <nature>org.eclipse.jdt.core.javanature</nature>
22 |   </natures>
23 | </projectDescription>
24 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Please read the contributing guidelines from the [Azure Team](http://azure.github.io/guidelines.html "Azure Team")
--------------------------------------------------------------------------------
/License:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | Copyright (c) 2014 Microsoft Corporation
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy
5 | of this software and associated documentation files (the "Software"), to deal
6 | in the Software without restriction, including without limitation the rights
7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 | copies of the Software, and to permit persons to whom the Software is
9 | furnished to do so, subject to the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be included in all
12 | copies or substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Microsoft Azure DocumentDB Hadoop Connector
2 |
6 |
7 | This project provides a client library in Java that allows Microsoft Azure DocumentDB to act as an input source or output sink for MapReduce, Hive and Pig jobs.
8 |
9 | ## Download
10 | ### Option 1: Via Github
11 |
12 | To get the binaries of this library as distributed by Microsoft, ready for use within your project, you can use [GitHub releases](https://github.com/Azure/azure-documentdb-hadoop/releases).
13 |
14 | ### Option 2: Source Via Git
15 |
16 | To get the source code of the connector via git just type:
17 |
18 | git clone git://github.com/Azure/azure-documentdb-hadoop.git
19 |
20 | ### Option 3: Source Zip
21 |
22 | To download a copy of the source code, click "Download ZIP" on the right side of the page or click [here](https://github.com/Azure/azure-documentdb-hadoop/archive/master.zip).
23 |
24 | ### Option 4: Via Maven
25 |
26 | To get the binaries of this library as distributed by Microsoft, ready for use within your project, you can use Maven.
27 | ```xml
28 | <dependency>
29 |   <groupId>com.microsoft.azure</groupId>
30 |   <artifactId>azure-documentdb-hadoop</artifactId>
31 |   <version>1.2.0</version>
32 | </dependency>
33 | ```
34 | ### Option 5: HDInsight
35 |
36 | Install the DocumentDB Hadoop Connector onto HDInsight clusters through custom action scripts. Full instructions can be found [here](https://azure.microsoft.com/documentation/articles/documentdb-run-hadoop-with-hdinsight/).
37 |
38 | ## Requirements
39 | * Java Development Kit 7
40 |
41 | ## Supported Versions
42 | * Apache Hadoop & YARN 2.4.0
43 | * Apache Pig 0.12.1
44 | * Apache Hive & HCatalog 0.13.1
45 | * Apache Hadoop & YARN 2.6.0
46 | * Apache Pig 0.14.0
47 | * Apache Hive & HCatalog 0.14.0
48 | * HDI 3.1 ([Getting started with HDInsight](https://azure.microsoft.com/documentation/articles/documentdb-run-hadoop-with-hdinsight/))
49 | * HDI 3.2
50 |
51 | ## Dependencies
52 | * Microsoft Azure DocumentDB Java SDK 1.6.0 (com.microsoft.azure / azure-documentdb / 1.6.0)
53 |
54 | When using Hive:
55 | * OpenX Technologies JsonSerde 1.3.1-SNAPSHOT (org.openx.data / json-serde-parent / 1.3.1-SNAPSHOT)
56 | GitHub repo can be found [here](https://github.com/rcongiu/Hive-JSON-Serde)
57 |
58 | Please download the jars and add them to your build path.
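
If you build with Maven, the same coordinates can be declared directly in your pom; a minimal sketch (versions copied from the list above; the JsonSerde snapshot may need to be built from the linked GitHub repo or resolved from a snapshot repository):

```xml
<dependency>
  <groupId>com.microsoft.azure</groupId>
  <artifactId>azure-documentdb</artifactId>
  <version>1.6.0</version>
</dependency>
<dependency>
  <groupId>org.openx.data</groupId>
  <artifactId>json-serde-parent</artifactId>
  <version>1.3.1-SNAPSHOT</version>
</dependency>
```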
59 |
60 | ## Usage
61 |
62 | To use this client library with Azure DocumentDB, you need to first [create an account](http://azure.microsoft.com/en-us/documentation/articles/documentdb-create-account/).
63 |
64 | ### MapReduce
65 |
66 | ##### Configuring input and output from DocumentDB Example
67 | ```Java
68 | // Import Hadoop Connector Classes
69 | import com.microsoft.azure.documentdb.Document;
70 | import com.microsoft.azure.documentdb.hadoop.ConfigurationUtil;
71 | import com.microsoft.azure.documentdb.hadoop.DocumentDBInputFormat;
72 | import com.microsoft.azure.documentdb.hadoop.DocumentDBOutputFormat;
73 | import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;
74 |
75 | // Set Configurations
76 | Configuration conf = new Configuration();
77 | final String host = "Your DocumentDB Endpoint";
78 | final String key = "Your DocumentDB Primary Key";
79 | final String dbName = "Your DocumentDB Database Name";
80 | final String inputCollNames = "Your DocumentDB Input Collection Name[s]";
81 | final String outputCollNames = "Your DocumentDB Output Collection Name[s]";
82 | final String query = "[Optional] Your DocumentDB Query";
83 | final String outputStringPrecision = "[Optional] Number of bytes to use for String indexes";
84 | final String offerType = "[Optional] Your performance level for Output Collection Creations";
85 | final String upsert = "[Optional] Bool to disable or enable document upsert";
86 |
87 | conf.set(ConfigurationUtil.DB_HOST, host);
88 | conf.set(ConfigurationUtil.DB_KEY, key);
89 | conf.set(ConfigurationUtil.DB_NAME, dbName);
90 | conf.set(ConfigurationUtil.INPUT_COLLECTION_NAMES, inputCollNames);
91 | conf.set(ConfigurationUtil.OUTPUT_COLLECTION_NAMES, outputCollNames);
92 | conf.set(ConfigurationUtil.QUERY, query);
93 | conf.set(ConfigurationUtil.OUTPUT_STRING_PRECISION, outputStringPrecision);
94 | conf.set(ConfigurationUtil.OUTPUT_COLLECTIONS_OFFER, offerType);
95 | conf.set(ConfigurationUtil.UPSERT, upsert);
96 | ```
97 |
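These settings are then handed to a standard Hadoop `Job` together with the connector's input and output formats. A minimal sketch follows (the mapper, reducer, and key/value classes are illustrative and mirror the full tutorial linked below):

```Java
Job job = Job.getInstance(conf, "DocumentDBJob");
job.setJarByClass(YourJob.class);

job.setMapperClass(YourMapper.class);
job.setReducerClass(YourReducer.class);

// Read documents from DocumentDB and write result documents back to DocumentDB
job.setInputFormatClass(DocumentDBInputFormat.class);
job.setOutputFormatClass(DocumentDBOutputFormat.class);

job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DocumentDBWritable.class);

System.exit(job.waitForCompletion(true) ? 0 : 1);
```
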
98 | Full MapReduce sample can be found [here](https://github.com/Azure/azure-documentdb-hadoop/blob/master/samples/MapReduceTutorial.java).
99 |
100 | ### Hive
101 | ##### Loading data from DocumentDB Example
102 | ```sql
103 | CREATE EXTERNAL TABLE DocumentDB_Hive_Table( COLUMNS )
104 | STORED BY 'com.microsoft.azure.documentdb.hive.DocumentDBStorageHandler'
105 | tblproperties (
106 | 'DocumentDB.endpoint' = 'Your DocumentDB Endpoint',
107 | 'DocumentDB.key' = 'Your DocumentDB Primary Key',
108 | 'DocumentDB.db' = 'Your DocumentDB Database Name',
109 | 'DocumentDB.inputCollections' = 'Your DocumentDB Input Collection Name[s]',
110 | 'DocumentDB.query' = '[Optional] Your DocumentDB Query' );
111 | ```
112 |
113 | ##### Storing data to DocumentDB Example
114 | ```sql
115 | CREATE EXTERNAL TABLE Hive_DocumentDB_Table( COLUMNS )
116 | STORED BY 'com.microsoft.azure.documentdb.hive.DocumentDBStorageHandler'
117 | tblproperties (
118 | 'DocumentDB.endpoint' = 'Your DocumentDB Endpoint',
119 | 'DocumentDB.key' = 'Your DocumentDB Primary Key',
120 | 'DocumentDB.db' = 'Your DocumentDB Database Name',
121 | 'DocumentDB.outputCollections' = 'Your DocumentDB Output Collection Name[s]',
122 | '[Optional] DocumentDB.outputStringPrecision' = '[Optional] Number of bytes to use for String indexes',
123 | '[Optional] DocumentDB.outputCollectionsOffer' = '[Optional] Your performance level for Output Collection Creations',
124 | '[Optional] DocumentDB.upsert' = '[Optional] Bool to disable or enable document upsert');
125 | INSERT INTO TABLE Hive_DocumentDB_Table
126 | ```
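The table is then populated with an ordinary `INSERT ... SELECT`; a minimal sketch (table and column names are illustrative, the full tutorial linked below inserts an aggregation the same way):

```sql
INSERT INTO TABLE Hive_DocumentDB_Table
SELECT Column1, Column2, Column3
FROM Your_Source_Hive_Table;
```
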
127 | Full Hive sample can be found [here](https://github.com/Azure/azure-documentdb-hadoop/blob/master/samples/Hive_Tutorial.hql).
128 |
129 | ### Pig
130 | ##### Loading data from DocumentDB Example
131 | ```pig
132 | LOAD 'Your DocumentDB Endpoint'
133 | USING com.microsoft.azure.documentdb.hadoop.pig.DocumentDBLoader(
134 | 'Your DocumentDB Primary Key',
135 | 'Your DocumentDB Database Name',
136 | 'Your DocumentDB Input Collection Name[s]',
137 | '[Optional] Your DocumentDB SQL Query' );
138 | ```
139 |
140 | ##### Storing data to DocumentDB Example
141 | ```pig
142 | STORE data INTO 'DocumentDB Endpoint'
143 | USING com.microsoft.azure.documentdb.hadoop.pig.DocumentDBStorage(
144 | 'DocumentDB Primary Key',
145 | 'DocumentDB Database Name',
146 | 'DocumentDB Output Collection Name[s]',
147 | '[Optional] Your performance level for Output Collection Creations',
148 | '[Optional] Number of bytes to use for String indexes',
149 | '[Optional] Bool to disable or enable document upsert');
150 | ```
151 | Full Pig sample can be found [here](https://github.com/Azure/azure-documentdb-hadoop/blob/master/samples/Pig_Tutorial.pig).
152 |
153 | ## Remarks
154 | * When outputting to DocumentDB, your output collection will require capacity for an [additional stored procedure](http://azure.microsoft.com/en-us/documentation/articles/documentdb-limits/). The stored procedure will remain in your collection for reuse.
155 | * The Hadoop Connector automatically sets your indexes to range indexes with max precision on strings and numbers. More information can be found [here](http://azure.microsoft.com/en-us/documentation/articles/documentdb-indexing-policies/).
156 | * Connector supports a configurable *upsert* option. *Upsert* defaults to *true*, which overwrites documents in the target collection that have the same *id*.
157 | * Reads and writes to DocumentDB will be counted against your provisioned throughput for each collection.
158 | * Output to DocumentDB collections is written in batches, round-robin across the specified output collections.
159 | * Connector supports a configurable *offer* option. *Offer* lets users set the [performance tier](http://azure.microsoft.com/en-us/documentation/articles/documentdb-performance-levels/) of newly created output collections (this does not apply when outputting to an already existing collection); see the configuration sketch after this list.
160 | * Connector supports output to partitioned collections. Hadoop Connector **will not** automatically create partitioned collections for Hadoop job outputs.
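
The optional *upsert* and *offer* settings above (and the string index precision) map to plain configuration keys in `ConfigurationUtil`; a minimal MapReduce-style sketch with illustrative values:

```Java
// Disable upsert (it defaults to true)
conf.set(ConfigurationUtil.UPSERT, "false");

// Performance tier used only when the connector creates the output collection
conf.set(ConfigurationUtil.OUTPUT_COLLECTIONS_OFFER, "S2");

// Range index precision, in bytes, for strings in created output collections (-1 means maximum)
conf.set(ConfigurationUtil.OUTPUT_STRING_PRECISION, "-1");
```

In Hive and Pig the same options are passed through the `tblproperties` and constructor arguments shown in the Usage section above.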
161 |
162 | ## Need Help?
163 |
164 | Be sure to check out the Microsoft Azure [Developer Forums on MSDN](https://social.msdn.microsoft.com/forums/azure/en-US/home?forum=AzureDocumentDB) or the [Developer Forums on Stack Overflow](http://stackoverflow.com/questions/tagged/azure-documentdb) if you have trouble with the provided code. Also, check out our [tutorial](http://azure.microsoft.com/en-us/documentation/articles/documentdb-run-hadoop-with-hdinsight/) for more information.
165 |
166 | ## Contribute Code or Provide Feedback
167 |
168 | If you would like to become an active contributor to this project please follow the instructions provided in [Azure Projects Contribution Guidelines](http://azure.github.io/guidelines.html).
169 |
170 | If you encounter any bugs with the library please file an issue in the [Issues](https://github.com/Azure/azure-documentdb-hadoop/issues) section of the project.
171 |
172 | ## Learn More
173 | * [DocumentDB with HDInsight Tutorial](https://azure.microsoft.com/documentation/articles/documentdb-run-hadoop-with-hdinsight/)
174 | * [Official Hadoop Documentation](http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/ClusterSetup.html)
175 | * [Azure Developer Center](http://azure.microsoft.com/en-us/develop/java/)
176 | * [Azure DocumentDB Service](http://azure.microsoft.com/en-us/documentation/services/documentdb/)
177 | * [Azure DocumentDB Team Blog](http://blogs.msdn.com/b/documentdb/)
178 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 | com.microsoft.azure
5 | azure-documentdb-hadoop
6 | 1.2.0
7 | jar
8 |
9 | Azure-DocumentDB-Hadoop
10 | Hadoop Connector for Microsoft Azure DocumentDB
11 | http://azure.microsoft.com/en-us/services/documentdb/
12 |
13 |
14 | MIT License
15 | http://www.opensource.org/licenses/mit-license.php
16 |
17 |
18 |
19 |
20 | DocumentDB Team
21 | anhoh@microsoft.com
22 | Microsoft
23 | http://www.microsoft.com/
24 |
25 |
26 |
27 | scm:git:git@github.com:Azure/azure-documentdb-hadoop.git
28 | scm:git:git@github.com:Azure/azure-documentdb-hadoop.git
29 | git@github.com:Azure/azure-documentdb-hadoop.git
30 |
31 |
32 |
33 | UTF-8
34 |
35 |
36 | src
37 |
38 |
39 | maven-compiler-plugin
40 | 3.1
41 |
42 | 1.7
43 | 1.7
44 |
45 |
46 |
47 | maven-assembly-plugin
48 | 2.2
49 |
50 |
51 | bin
52 |
53 |
54 |
55 |
56 | make-assembly
57 | package
58 |
59 | single
60 |
61 |
62 |
63 |
64 |
65 | org.sonatype.plugins
66 | nexus-staging-maven-plugin
67 | 1.6.3
68 | true
69 |
70 | ossrh
71 | https://oss.sonatype.org/
72 | true
73 |
74 |
75 |
76 | org.apache.maven.plugins
77 | maven-source-plugin
78 | 2.2.1
79 |
80 |
81 | attach-sources
82 |
83 | jar-no-fork
84 |
85 |
86 |
87 |
88 |
89 | org.apache.maven.plugins
90 | maven-javadoc-plugin
91 | 2.9.1
92 |
93 |
94 | attach-javadocs
95 |
96 | jar
97 |
98 |
99 |
100 |
101 |
102 | org.apache.maven.plugins
103 | maven-gpg-plugin
104 | 1.5
105 |
106 |
107 | sign-artifacts
108 | verify
109 |
110 | sign
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 | src
119 |
120 | BulkImportScript.js
121 |
122 |
123 |
124 |
125 |
126 |
127 | commons-io
128 | commons-io
129 | 2.4
130 |
131 |
132 | org.apache.hadoop
133 | hadoop-mapreduce
134 | 2.5.1
135 | pom
136 | provided
137 |
138 |
139 | org.apache.commons
140 | commons-lang3
141 | 3.3.2
142 |
143 |
144 | org.apache.hadoop
145 | hadoop-hdfs
146 | 2.5.1
147 | pom
148 |
149 |
150 | org.apache.hadoop
151 | hadoop-common
152 | 2.5.1
153 |
154 |
155 | commons-codec
156 | commons-codec
157 |
158 |
159 |
160 |
161 | org.apache.hadoop
162 | hadoop-mapreduce-client-core
163 | 2.5.1
164 |
165 |
166 | commons-codec
167 | commons-codec
168 |
169 |
170 |
171 |
172 | org.apache.hadoop
173 | hadoop-mapreduce-client-shuffle
174 | 2.5.1
175 |
176 |
177 | commons-codec
178 | commons-codec
179 |
180 |
181 |
182 |
183 | org.apache.hadoop
184 | hadoop-hdfs
185 | 2.5.1
186 |
187 |
188 | commons-codec
189 | commons-codec
190 |
191 |
192 |
193 |
194 | com.google.guava
195 | guava
196 | 15.0
197 | jar
198 |
199 |
200 | org.apache.httpcomponents
201 | httpclient
202 | 4.3.5
203 |
204 |
205 | org.apache.httpcomponents
206 | httpcore
207 | 4.3.2
208 |
209 |
210 | org.apache.hive
211 | hive-serde
212 | 0.13.1
213 | jar
214 |
215 |
216 | org.apache.hive
217 | hive-exec
218 | 0.13.1
219 | jar
220 |
221 |
222 | org.apache.hive
223 | hive-metastore
224 | 0.13.1
225 | jar
226 |
227 |
228 | com.microsoft.azure
229 | azure-documentdb
230 | 1.6.0
231 |
232 |
233 | org.apache.pig
234 | pig
235 | 0.13.0
236 | provided
237 |
238 |
239 | org.openx.data
240 | json-serde-parent
241 | 1.3.1-SNAPSHOT
242 |
243 |
244 |
245 |
246 | ossrh
247 | https://oss.sonatype.org/content/repositories/snapshots
248 |
249 |
250 | ossrh
251 | https://oss.sonatype.org/service/local/staging/deploy/maven2/
252 |
253 |
254 |
255 |
--------------------------------------------------------------------------------
/samples/Hive_Tutorial.hql:
--------------------------------------------------------------------------------
1 | -- Count the total number of document modifications (creations or updates) by the minute using the system generated _ts
2 | -- Read from two input collections and store output in a separate collection
3 |
4 | -- Add dependencies
5 | add JAR ;
6 | add JAR ;
7 | add JAR ;
8 |
9 | -- Create a Hive Table from DocumentDB ids and timestamps
10 | drop table DocumentDB_timestamps;
11 | create external table DocumentDB_timestamps(id string, ts BIGINT)
12 | stored by 'com.microsoft.azure.documentdb.hive.DocumentDBStorageHandler'
13 | tblproperties (
14 | 'DocumentDB.endpoint' = 'DocumentDB Endpoint',
15 | 'DocumentDB.key' = 'DocumentDB Primary Key',
16 | 'DocumentDB.db' = 'DocumentDB Database Name',
17 | 'DocumentDB.inputCollections' = 'DocumentDB Input Collection Name 1,DocumentDB Input Collection Name 2',
18 | 'DocumentDB.query' = 'SELECT r._rid AS id, r._ts AS ts FROM root r' );
19 |
20 | -- Create a Hive Table for outputting to DocumentDB
21 | drop table DocumentDB_analytics;
22 | create external table DocumentDB_analytics(Month INT, Day INT, Hour INT, Minute INT, Total INT)
23 | stored by 'com.microsoft.azure.documentdb.hive.DocumentDBStorageHandler'
24 | tblproperties (
25 | 'DocumentDB.endpoint' = 'DocumentDB Endpoint',
26 | 'DocumentDB.key' = 'DocumentDB Primary Key',
27 | 'DocumentDB.db' = 'DocumentDB Database Name',
28 | 'DocumentDB.outputCollections' = 'DocumentDB Output Collection Name' );
29 |
30 | -- Insert aggregations to Output Hive Table
31 | INSERT INTO table DocumentDB_analytics
32 | SELECT month(from_unixtime(ts)) as Month, day(from_unixtime(ts)) as Day, hour(from_unixtime(ts)) as Hour, minute(from_unixtime(ts)) as Minute, COUNT(*) AS Total
33 | FROM DocumentDB_timestamps
34 | GROUP BY month(from_unixtime(ts)), day(from_unixtime(ts)), hour(from_unixtime(ts)) , minute(from_unixtime(ts));
35 |
--------------------------------------------------------------------------------
/samples/MapReduceTutorial.java:
--------------------------------------------------------------------------------
1 | import java.io.IOException;
2 | import java.util.Iterator;
3 | import java.util.Set;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.LongWritable;
7 | import org.apache.hadoop.io.Text;
8 | import org.apache.hadoop.conf.*;
9 | import org.apache.hadoop.mapreduce.*;
10 |
11 | import com.microsoft.azure.documentdb.Document;
12 | import com.microsoft.azure.documentdb.hadoop.ConfigurationUtil;
13 | import com.microsoft.azure.documentdb.hadoop.DocumentDBInputFormat;
14 | import com.microsoft.azure.documentdb.hadoop.DocumentDBOutputFormat;
15 | import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;
16 |
17 | // Tally the number of property occurrences for all Documents in a collection
18 | public class MapReduceTutorial {
19 |     public static class Map extends Mapper<LongWritable, DocumentDBWritable, Text, IntWritable> {
20 | private final static IntWritable one = new IntWritable(1);
21 |
22 | @Override
23 | public void map(LongWritable key, DocumentDBWritable value,
24 | Context context)
25 | throws IOException, InterruptedException {
26 |
27 | // Retrieve all property names from Document
28 |             Set<String> properties = value.getDoc().getHashMap().keySet();
29 |
30 | for(String property : properties) {
31 | context.write(new Text(property), one);
32 | }
33 | }
34 | }
35 |
36 |     public static class Reduce extends Reducer<Text, IntWritable, Text, DocumentDBWritable> {
37 |
38 | @Override
39 |         protected void reduce(Text key, Iterable<IntWritable> values,
40 | Context context) throws IOException, InterruptedException {
41 | int sum = 0;
42 |             Iterator<IntWritable> itr = values.iterator();
43 |
44 | // Count the number of occurrences for a given property
45 | while (itr.hasNext()) {
46 | sum += itr.next().get();
47 | }
48 |
49 | // Write the property and frequency back into DocumentDB as a document
50 | Document d = new Document();
51 | d.set("id", key.toString());
52 | d.set("frequency", sum);
53 | context.write(key, new DocumentDBWritable(d));
54 | }
55 | }
56 |
57 |
58 | public static void main(String[] args) throws Exception {
59 | Configuration conf = new Configuration();
60 | final String host = "DocumentDB Endpoint";
61 | final String key = "DocumentDB Primary Key";
62 | final String dbName = "DocumentDB Database Name";
63 | final String inputCollName = "DocumentDB Input Collection Name";
64 | final String outputCollName = "DocumentDB Output Collection Name";
65 | conf.set(ConfigurationUtil.DB_HOST, host);
66 | conf.set(ConfigurationUtil.DB_KEY, key);
67 | conf.set(ConfigurationUtil.DB_NAME, dbName);
68 | conf.set(ConfigurationUtil.INPUT_COLLECTION_NAMES, inputCollName);
69 | conf.set(ConfigurationUtil.OUTPUT_COLLECTION_NAMES, outputCollName);
70 |
71 | Job job = Job.getInstance(conf, "MapReduceTutorial");
72 | job.setJobName("TallyProperties");
73 |
74 | job.setMapperClass(Map.class);
75 | job.setReducerClass(Reduce.class);
76 |
77 | job.setInputFormatClass(DocumentDBInputFormat.class);
78 | job.setOutputFormatClass(DocumentDBOutputFormat.class);
79 |
80 | job.setMapOutputKeyClass(Text.class);
81 | job.setMapOutputValueClass(IntWritable.class);
82 |
83 | job.setOutputKeyClass(Text.class);
84 | job.setOutputValueClass(DocumentDBWritable.class);
85 |
86 | job.setJarByClass(MapReduceTutorial.class);
87 |
88 | System.exit(job.waitForCompletion(true) ? 0 : 1);
89 | }
90 | }
--------------------------------------------------------------------------------
/samples/Pig_Tutorial.pig:
--------------------------------------------------------------------------------
1 | -- Count the total number of document modifications (creations or updates) by the minute using the system generated _ts
2 | -- Read from two input collections and store output in a separate collection
3 |
4 | -- Add dependencies
5 | REGISTER ;
6 | REGISTER ;
7 |
8 | -- Load DocumentDB ids and timestamps
9 | DocumentDB_timestamps = LOAD 'DocumentDB Endpoint' USING com.microsoft.azure.documentdb.pig.DocumentDBLoader(
10 | 'DocumentDB Primary Key', 'DocumentDB Database Name', 'DocumentDB Input Collection Name 1,DocumentDB Input Collection Name 2',
11 | 'SELECT r._rid AS id, r._ts AS ts FROM root r' );
12 |
13 | timestamp_record = FOREACH DocumentDB_timestamps GENERATE $0#'id' as id:int, ToDate((long)($0#'ts') * 1000) as timestamp:datetime;
14 |
15 | by_minute = GROUP timestamp_record BY (GetYear(timestamp), GetMonth(timestamp), GetDay(timestamp), GetHour(timestamp), GetMinute(timestamp));
16 | by_minute_count = FOREACH by_minute GENERATE FLATTEN(group) as (Year:int, Month:int, Day:int, Hour:int, Minute:int), COUNT(timestamp_record) as Total:int;
17 |
18 | -- Store results back into DocumentDB
19 | STORE by_minute_count INTO 'DocumentDB Endpoint'
20 | USING com.microsoft.azure.documentdb.pig.DocumentDBStorage(
21 | 'DocumentDB Primary Key', 'DocumentDB Database Name', 'DocumentDB Output Collection Name');
22 |
--------------------------------------------------------------------------------
/src/BulkImportScript.js:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 |
5 | function bulkImport(docs, upsert) {
6 | var collection = getContext().getCollection();
7 | var collectionLink = collection.getSelfLink();
8 |
9 | // The count of imported docs, also used as current doc index.
10 | var count = 0;
11 | var errorCodes = { CONFLICT: 409 };
12 |
13 | // Validate input.
14 | if (!docs) throw new Error("The array is undefined or null.");
15 |
16 | var docsLength = docs.length;
17 | if (docsLength == 0) {
18 | getContext().getResponse().setBody(0);
19 | return;
20 | }
21 |
22 | // Call the create API to create a document.
23 | tryCreate(docs[count], callback);
24 |
25 | // Note that there are 2 exit conditions:
26 | // 1) The createDocument request was not accepted.
27 | // In this case the callback will not be called, we just call
28 | // setBody and we are done.
29 | // 2) The callback was called docs.length times.
30 | // In this case all documents were created and we don’t need to call
31 | // tryCreate anymore. Just call setBody and we are done.
32 | function tryCreate(doc, callback) {
33 | var isAccepted = collection.createDocument(collectionLink, doc, { disableAutomaticIdGeneration : true}, callback);
34 |
35 | // If the request was accepted, callback will be called.
36 | // Otherwise report current count back to the client,
37 | // which will call the script again with remaining set of docs.
38 | if (!isAccepted) getContext().getResponse().setBody(count);
39 | }
40 |
41 | // To replace the document, first issue a query to find it and then call replace.
42 | function tryReplace(doc, callback) {
43 | var parsedDoc = JSON.parse(doc);
44 | retrieveDoc(parsedDoc, null, function(retrievedDocs){
45 | var isAccepted = collection.replaceDocument(retrievedDocs[0]._self, parsedDoc, callback);
46 | if (!isAccepted) getContext().getResponse().setBody(count);
47 | });
48 | }
49 |
50 | function retrieveDoc(doc, continuation, callback) {
51 | var query = "select * from root r where r.id = '" + doc.id + "'";
52 | var requestOptions = { continuation : continuation };
53 | var isAccepted = collection.queryDocuments(collectionLink, query, requestOptions, function(err, retrievedDocs, responseOptions) {
54 | if (err) throw err;
55 |
56 | if (retrievedDocs.length > 0) {
57 | callback(retrievedDocs);
58 | } else if (responseOptions.continuation) {
59 | retrieveDoc(doc, responseOptions.continuation, callback);
60 | } else {
61 | throw "Error in retrieving document: " + doc.id;
62 | }
63 | });
64 |
65 | if (!isAccepted) getContext().getResponse().setBody(count);
66 | }
67 |
68 | // This is called when collection.createDocument is done in order to
69 | // process the result.
70 | function callback(err, doc, options) {
71 | if (err) {
72 | // Replace the document if status code is 409 and upsert is enabled
73 | if(upsert && err.number == errorCodes.CONFLICT) {
74 | return tryReplace(docs[count], callback);
75 | } else {
76 | throw err;
77 | }
78 | }
79 |
80 | // One more document has been inserted, increment the count.
81 | count++;
82 | if (count >= docsLength) {
83 | // If we created all documents, we are done. Just set the response.
84 | getContext().getResponse().setBody(count);
85 | } else {
86 | // Create next document.
87 | tryCreate(docs[count], callback);
88 | }
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/BackoffExponentialRetryPolicy.java:
--------------------------------------------------------------------------------
1 | package com.microsoft.azure.documentdb.hadoop;
2 |
3 | import org.apache.commons.logging.Log;
4 | import org.apache.commons.logging.LogFactory;
5 |
6 | import com.microsoft.azure.documentdb.DocumentClientException;
7 |
8 | public class BackoffExponentialRetryPolicy {
9 | private static final int REQUEST_RATE_TOO_LARGE = 429;
10 |
11 | private static final Log LOG = LogFactory.getLog(BackoffExponentialRetryPolicy.class);
12 |
13 | private final long defaultRetryInSeconds = 3;
14 |
15 | private final int retryAdditiveMultiplier = 500;
16 |
17 | private int currentAttemptCount = 0;
18 |
19 | private long retryAfterInMilliseconds = 0;
20 |
21 | public BackoffExponentialRetryPolicy() {
22 |
23 | }
24 |
25 | public int getCurrentAttempt() {
26 | return this.currentAttemptCount;
27 | }
28 |
29 | public boolean shouldRetry(){
30 | return true;
31 | }
32 |
33 | /**
33 |      * Reports that an error has occurred and sleeps if the error is retriable
35 | * @param exception
36 | */
37 | public void errorOccured(Exception exception) {
38 | if (!isExceptionRetriable(exception)) {
39 | throw new IllegalStateException("Exception not retriable: " + exception.getMessage(), exception);
40 | }
41 |
42 | waitUntilNextTry();
43 | }
44 |
45 | private void waitUntilNextTry() {
46 | try {
47 | LOG.info("Trial number: " + this.currentAttemptCount + ", retrying after: " + this.getRetryAfterInMilliseconds());
48 | Thread.sleep(this.getRetryAfterInMilliseconds());
49 | } catch (InterruptedException ignored) {
50 | }
51 | }
52 |
53 | private long getRetryAfterInMilliseconds() {
54 | return this.retryAfterInMilliseconds;
55 | }
56 |
57 | private boolean isExceptionRetriable(Exception exception) {
58 | this.retryAfterInMilliseconds = 0;
59 |
60 | if (this.CheckIfRetryNeeded(exception)) {
61 | this.currentAttemptCount++;
62 | return true;
63 | } else {
64 | return false;
65 | }
66 | }
67 |
68 | private boolean CheckIfRetryNeeded(Exception exception) {
69 | this.retryAfterInMilliseconds = 0;
70 |
71 | if(exception instanceof IllegalStateException) {
72 | exception = (Exception) exception.getCause();
73 | }
74 |
75 | if (exception instanceof DocumentClientException) {
76 | DocumentClientException dce = (DocumentClientException) exception;
77 |
78 | if (dce.getStatusCode() == REQUEST_RATE_TOO_LARGE) {
79 | this.retryAfterInMilliseconds = dce.getRetryAfterInMilliseconds() + this.currentAttemptCount * this.retryAdditiveMultiplier;
80 |
81 | if (this.retryAfterInMilliseconds == 0) {
82 |                     // We should never reach here, as the backend should
83 |                     // return a non-zero retry delay.
84 | this.retryAfterInMilliseconds = this.defaultRetryInSeconds * 1000;
85 | }
86 |
87 | return true;
88 | }
89 | }
90 |
91 | return false;
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/ConfigurationUtil.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.util.Map;
7 | import java.util.Properties;
8 | import java.util.Set;
9 |
10 | import org.apache.commons.lang3.StringUtils;
11 | import org.apache.hadoop.conf.Configuration;
12 |
13 | import com.google.common.collect.ImmutableSet;
14 |
15 | /**
16 | *
17 | * Provides the configuration properties needed for running a hadoop job on documentdb.
18 | *
19 | */
20 | public class ConfigurationUtil {
21 | /**
22 | * The database Id used in the Map Reduce job.
23 | */
24 | public static final String DB_NAME = "DocumentDB.db";
25 |
26 | /**
27 |      * Comma separated input collection Ids used in the map reduce job.
28 | */
29 | public static final String INPUT_COLLECTION_NAMES = "DocumentDB.inputCollections";
30 |
31 | /**
32 |      * Comma separated output collection Ids used in the map reduce job.
33 | */
34 | public static final String OUTPUT_COLLECTION_NAMES = "DocumentDB.outputCollections";
35 |
36 | /**
37 | * The link for the documentdb endpoint
38 | */
39 | public static final String DB_HOST = "DocumentDB.endpoint";
40 |
41 | /**
42 | * The masterkey used for the documentdb account.
43 | */
44 | public static final String DB_KEY = "DocumentDB.key";
45 |
46 | /**
47 | * the documentdb query pushed down to the input collections when reading.
48 | */
49 | public static final String QUERY = "DocumentDB.query";
50 |
51 | /**
52 | * Precision of the output collections' string indexes .
53 | */
54 | public static final String OUTPUT_STRING_PRECISION = "DocumentDB.outputStringPrecision";
55 |
56 | /**
57 | * The offer type of the output collections.
58 | */
59 | public static final String OUTPUT_COLLECTIONS_OFFER = "DocumentDB.outputCollectionsOffer";
60 |
61 | /**
62 | * An upsert option, true by default. This can be disabled by setting it to "false"
63 | */
64 | public static final String UPSERT = "DocumentDB.upsert";
65 |
66 |     public static final int DEFAULT_STRING_PRECISION = -1; // Maximum precision.
67 |
68 | /**
69 | * Gets the DocumentDB.db from the Configuration object.
70 | * @param conf job configuration object
71 | * @return database Id
72 | */
73 | public final static String getDBName(Configuration conf) {
74 | return conf.get(DB_NAME);
75 | }
76 |
77 | /**
78 | * A set of all the configuration properties of the connector.
79 | */
80 |     private static final Set<String> ALL_PROPERTIES = ImmutableSet.of(DB_NAME,
81 | INPUT_COLLECTION_NAMES, OUTPUT_COLLECTION_NAMES, DB_HOST, DB_KEY,
82 | QUERY);
83 |
84 | /**
85 | * Gets the DocumentDB.inputCollections from the Configuration object.
86 | * @param conf job configuration object
87 | * @return Array of collection Ids
88 | */
89 | public final static String[] getInputCollectionNames(Configuration conf) {
90 | String[] collectionNames = conf.get(INPUT_COLLECTION_NAMES).split(",");
91 | return collectionNames;
92 | }
93 |
94 | /**
95 | * Gets the DocumentDB.outputCollections from the Configuration object.
96 | * @param conf job configuration object
97 | * @return Array of collection Ids
98 | */
99 | public final static String[] getOutputCollectionNames(Configuration conf) {
100 | String[] collectionNames = conf.get(OUTPUT_COLLECTION_NAMES).split(",");
101 | return collectionNames;
102 | }
103 |
104 | /**
105 | * Gets the DocumentDB.endpoint from the Configuration object.
106 | * @param conf job configuration object
107 | * @return The documentdb endpoint url
108 | */
109 | public final static String getDBEndpoint(Configuration conf) {
110 | return conf.get(DB_HOST);
111 | }
112 |
113 | /**
114 | * Gets the DocumentDB.key from the Configuration object.
115 | * @param conf job configuration object.
116 | * @return The masterkey for documentdb database account.
117 | */
118 | public final static String getDBKey(Configuration conf) {
119 | return conf.get(DB_KEY);
120 | }
121 |
122 | /**
123 | * Gets the DocumentDB.query from the Configuration object.
124 | * @param conf job configuration object
125 | * @return sql query used to read from input collections.
126 | */
127 | public final static String getQuery(Configuration conf) {
128 | return conf.get(QUERY);
129 | }
130 |
131 | /**
132 | * Gets the DocumentDB.outputStringPrecision from the Configuration object.
133 | * @param conf job configuration object
134 | * @return the string precision of the output collections.
135 | */
136 | public final static int getOutputStringPrecision(Configuration conf) {
137 | String value = conf.get(OUTPUT_STRING_PRECISION);
138 |
139 | Integer stringPrecision = new Integer(DEFAULT_STRING_PRECISION);
140 |
141 | if (StringUtils.isEmpty(value)) {
142 | return stringPrecision;
143 | }
144 |
145 | try {
146 | stringPrecision = Integer.valueOf(value);
147 | } catch (IllegalArgumentException e) {
148 | throw new IllegalArgumentException("outputStringPrecision is expected to be an integer.", e);
149 | }
150 |
151 | if (stringPrecision < -1 || stringPrecision == 0) {
152 | throw new IllegalArgumentException("outputStringPrecision can only be -1 or a positive number.");
153 | }
154 |
155 | return stringPrecision;
156 | }
157 |
158 | /**
159 | * Gets the DocumentDB.upsert from the Configuration object.
160 | * @param conf job configuration object
161 | * @return the value of upsert option
162 | */
163 | public final static boolean getUpsert(Configuration conf) {
164 | String upsert = conf.get(UPSERT);
165 | return (upsert != null && upsert.equalsIgnoreCase("false")) ? false : true;
166 | }
167 |
168 | /**
169 | * Gets the DocumentDB.outputCollectionsOffer from the Configuration object.
170 | * @param conf job configuration object
171 | * @return the value of documentdb.outputCollectionsOffer option
172 | */
173 | public final static String getOutputCollectionsOffer(Configuration conf) {
174 | String outputCollectionsOffer = conf.get(OUTPUT_COLLECTIONS_OFFER);
175 | return (outputCollectionsOffer != null) ? outputCollectionsOffer : "S3";
176 | }
177 |
178 | /**
179 | * Copies the configuration properties for the connector to a map.
180 | * @param from Properties object to copy from.
181 | * @param to Target map to copy properties to.
182 | */
183 |     public static void copyDocumentDBProperties(Properties from, Map<String, String> to) {
184 | for (String key : ALL_PROPERTIES) {
185 | String value = from.getProperty(key);
186 | if (value != null) {
187 | to.put(key, value);
188 | }
189 | }
190 | }
191 | }
192 |
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBConnectorUtil.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 |
5 | package com.microsoft.azure.documentdb.hadoop;
6 |
7 | import java.io.IOException;
8 | import java.io.InputStream;
9 | import java.util.ArrayList;
10 | import java.util.Iterator;
11 | import java.util.List;
12 | import java.util.UUID;
13 |
14 | import org.apache.commons.io.IOUtils;
15 | import org.apache.commons.logging.Log;
16 | import org.apache.commons.logging.LogFactory;
17 |
18 | import com.microsoft.azure.documentdb.DataType;
19 | import com.microsoft.azure.documentdb.Database;
20 | import com.microsoft.azure.documentdb.Document;
21 | import com.microsoft.azure.documentdb.DocumentClient;
22 | import com.microsoft.azure.documentdb.DocumentClientException;
23 | import com.microsoft.azure.documentdb.DocumentCollection;
24 | import com.microsoft.azure.documentdb.IncludedPath;
25 | import com.microsoft.azure.documentdb.IndexingPolicy;
26 | import com.microsoft.azure.documentdb.PartitionKey;
27 | import com.microsoft.azure.documentdb.QueryIterable;
28 | import com.microsoft.azure.documentdb.RangeIndex;
29 | import com.microsoft.azure.documentdb.RequestOptions;
30 | import com.microsoft.azure.documentdb.SqlParameter;
31 | import com.microsoft.azure.documentdb.SqlParameterCollection;
32 | import com.microsoft.azure.documentdb.SqlQuerySpec;
33 | import com.microsoft.azure.documentdb.StoredProcedure;
34 |
35 | /**
36 | *
37 | * Utils used by the connector for DocumentDBCrud
38 | *
39 | */
40 | public class DocumentDBConnectorUtil {
41 | private static final Log LOG = LogFactory.getLog(DocumentDBConnectorUtil.class);
42 | private final static int MAX_SCRIPT_DOCS = 50;
43 | private final static int MAX_SCRIPT_SIZE = 50000;
44 | private final static String BULK_IMPORT_ID = "HadoopBulkImportSprocV1";
45 | private final static String BULK_IMPORT_PATH = "/BulkImportScript.js";
46 | private final static int CONFLICT_ERROR = 409;
47 |
48 | public static String UserAgentSuffix = " HadoopConnector/1.1.0";
49 |
50 | /**
51 | * Creates a document and replaces it if it already exists when isUpsert is true. The function also retries on throttling
52 | * @param client The DocumentClient instance.
53 | * @param collectionSelfLink The self link of the passed collection.
54 | * @param isUpsert Specify if the document should be upserted.
55 | */
56 | public static Document createDocument(DocumentClient client, String collectionSelfLink, Document doc, boolean isUpsert) {
57 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
58 | while(retryPolicy.shouldRetry()){
59 | try {
60 | if(isUpsert) {
61 | return client.upsertDocument(collectionSelfLink, doc, null, false).getResource();
62 | } else {
63 | return client.createDocument(collectionSelfLink, doc, null, false).getResource();
64 | }
65 | } catch(DocumentClientException e){
66 | retryPolicy.errorOccured(e);
67 | }
68 | }
69 |
70 | return null;
71 | }
72 |
73 | /**
74 |      * Gets an output collection with the passed id (if the collection already exists, return it; otherwise create a new one).
75 | * @param client The DocumentClient instance.
76 | * @param databaseSelfLink the self link of the passed database.
77 | * @param collectionId The id of the output collection.
78 | * @param outputStringPrecision An optional parameter that contains the default string precision to be used to create an indexing policy.
79 | * @param offerType An optional parameter that contains the offer type of the output collection.
80 | */
81 | public static DocumentCollection getOrCreateOutputCollection(DocumentClient client, String databaseSelfLink,
82 | String collectionId, int outputStringPrecision, String offerType) throws DocumentClientException {
83 |
84 | DocumentCollection outputCollection = DocumentDBConnectorUtil.GetDocumentCollection(client, databaseSelfLink, collectionId);
85 |
86 | if (outputCollection == null) {
87 | DocumentCollection outputColl = new DocumentCollection("{ 'id':'" + collectionId + "' }");
88 |
89 | outputColl.setIndexingPolicy(DocumentDBConnectorUtil.getOutputIndexingPolicy(outputStringPrecision));
90 |
91 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
92 |
93 | while(retryPolicy.shouldRetry()) {
94 | try {
95 | RequestOptions options = new RequestOptions();
96 | options.setOfferType(offerType);
97 | outputCollection = client.createCollection(databaseSelfLink, outputColl, options).getResource();
98 | break;
99 | } catch (Exception e) {
100 | retryPolicy.errorOccured(e);
101 | }
102 | }
103 | }
104 |
105 | return outputCollection;
106 | }
107 |
108 | /**
109 |      * Gets the collection with the passed id if it exists; otherwise returns null.
110 | * @param client The DocumentClient instance.
111 | * @param databaseSelfLink the self link of the passed database.
112 | * @param collectionId The id of the output collection.
113 | */
114 | public static DocumentCollection GetDocumentCollection(DocumentClient client, String databaseSelfLink, String collectionId) {
115 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
116 |         QueryIterable<DocumentCollection> collIterable = client.queryCollections(
117 | databaseSelfLink,
118 | new SqlQuerySpec("SELECT * FROM root r WHERE r.id=@id",
119 | new SqlParameterCollection(new SqlParameter("@id", collectionId))),
120 | null).getQueryIterable();
121 |
122 |         List<DocumentCollection> collections = null;
123 | while(retryPolicy.shouldRetry()){
124 | try {
125 | collections = collIterable.toList();
126 | break;
127 | } catch (Exception e) {
128 | retryPolicy.errorOccured(e);
129 | }
130 | }
131 |
132 | if(collections.size() == 0) {
133 | return null;
134 | }
135 |
136 | return collections.get(0);
137 | }
138 |
139 | public static Database GetDatabase(DocumentClient client, String databaseId) {
140 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
141 |         QueryIterable<Database> dbIterable = client.queryDatabases(
142 | new SqlQuerySpec("SELECT * FROM root r WHERE r.id=@id",
143 | new SqlParameterCollection(new SqlParameter("@id", databaseId))),
144 | null).getQueryIterable();
145 |
146 |         List<Database> databases = null;
147 | while(retryPolicy.shouldRetry()){
148 | try {
149 | databases = dbIterable.toList();
150 | break;
151 | } catch (Exception e) {
152 | retryPolicy.errorOccured(e);
153 | }
154 | }
155 |
156 | if(databases.size() == 0) {
157 | return null;
158 | }
159 |
160 | return databases.get(0);
161 | }
162 |
163 | /**
164 |      * Gets the bulk import stored procedure that will be used for writing documents (if the sproc already exists, use it; otherwise create a new one).
165 | * @param client the DocumentClient instance for DocumentDB.
166 | * @param collectionLink the self-link of the collection to write to.
167 | * @return StoredProcedure instance that will be used for writing
168 | */
169 | public static StoredProcedure CreateBulkImportStoredProcedure(DocumentClient client, String collectionLink)
170 | throws DocumentClientException {
171 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
172 |         List<StoredProcedure> sprocs = null;
173 |
174 | while(retryPolicy.shouldRetry()){
175 | try {
176 | sprocs = client.queryStoredProcedures(collectionLink,
177 | new SqlQuerySpec("SELECT * FROM root r WHERE r.id=@id",
178 | new SqlParameterCollection(new SqlParameter("@id", BULK_IMPORT_ID))),
179 | null).getQueryIterable().toList();
180 | break;
181 | } catch (Exception e) {
182 | retryPolicy.errorOccured(e);
183 | }
184 | }
185 |
186 | if(sprocs.size() > 0) {
187 | return sprocs.get(0);
188 | }
189 |
190 | StoredProcedure sproc = new StoredProcedure();
191 | sproc.setId(BULK_IMPORT_ID);
192 | String sprocBody = getBulkImportBody(client);
193 | sproc.setBody(sprocBody);
194 | return client.createStoredProcedure(collectionLink, sproc, null).getResource();
195 | }
196 |
197 | /**
198 | * Executes the bulk import stored procedure for a list of documents.
199 | * The execution takes into consideration throttling and blacklisting of the stored procedure.
200 | * @param client The DocumentClient instance for DocumentDB
201 | * @param collectionSelfLink the self-link for the collection to write to.
202 | * @param sproc The stored procedure to execute
203 | * @param allDocs The list of documents to write
204 | * @param upsert Specifies whether to replace the document if exists or not. By default it's true.
205 | */
206 | public static void executeWriteStoredProcedure(final DocumentClient client, String collectionSelfLink, final StoredProcedure sproc,
207 |             List<Document> allDocs, final boolean upsert) {
208 |
209 | int currentCount = 0;
210 |
211 | while (currentCount < allDocs.size())
212 | {
213 | String []jsonArrayString = CreateBulkInsertScriptArguments(allDocs, currentCount, MAX_SCRIPT_SIZE);
214 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
215 | String response = null;
216 | while(retryPolicy.shouldRetry()){
217 | try {
218 | response = client.executeStoredProcedure(sproc.getSelfLink(), new Object[] { jsonArrayString, upsert })
219 | .getResponseAsString();
220 | break;
221 | } catch(Exception e){
222 | retryPolicy.errorOccured(e);
223 | }
224 | }
225 |
226 | int createdCount = Integer.parseInt(response);
227 | currentCount += createdCount;
228 | }
229 | }
230 |
231 | /**
232 | *
233 | * @param docs The list of documents to be created
234 | * @param currentIndex the current index in the list of docs to start with.
236 | * @param maxScriptSize the max size of the sproc that is used to avoid exceeding the max request size.
237 | * @return a string array for all documents to be created
238 | */
239 |     private static String[] CreateBulkInsertScriptArguments(List<Document> docs, int currentIndex, int maxScriptSize)
240 | {
241 | if (currentIndex >= docs.size()) return new String[]{};
242 |
243 |         ArrayList<String> jsonDocumentList = new ArrayList<String>();
244 | String stringifiedDoc;
245 | int scriptCapacityRemaining = maxScriptSize;
246 |
247 | int i = 0;
248 | while (scriptCapacityRemaining > 0 && i < MAX_SCRIPT_DOCS && currentIndex + i < docs.size())
249 | {
250 | stringifiedDoc = docs.get(currentIndex + i).toString();
251 | jsonDocumentList.add(stringifiedDoc);
252 | scriptCapacityRemaining-= stringifiedDoc.length();
253 | i++;
254 | }
255 |
256 | String[] jsonDocumentArray = new String[jsonDocumentList.size()];
257 | jsonDocumentList.toArray(jsonDocumentArray);
258 | return jsonDocumentArray;
259 | }
260 |
261 | /**
262 | * Reads the bulk import script body from the file.
263 | * @param client the DocumentClient instance.
264 | * @return a string that contains the stored procedure body.
265 | */
266 | private static String getBulkImportBody(DocumentClient client) {
267 | try {
268 | InputStream stream = DocumentDBConnectorUtil.class.getResourceAsStream(BULK_IMPORT_PATH);
269 |             List<String> scriptLines = IOUtils.readLines(stream);
270 | StringBuilder scriptBody = new StringBuilder();
271 |             for (Iterator<String> iterator = scriptLines.iterator(); iterator.hasNext();) {
272 |                 String line = iterator.next();
273 | scriptBody.append(line + "\n");
274 | }
275 |
276 | return scriptBody.toString();
277 | } catch (IOException e) {
278 | throw new IllegalStateException(e);
279 | }
280 | }
281 |
282 | /**
283 | * If no id is provided, replace it with an auto generated guid id.
284 | * @param doc The document to be checked for id.
285 | */
286 | public static void addIdIfMissing(Document doc) {
287 | if (doc.getId() == null) {
288 | doc.setId(UUID.randomUUID().toString());
289 | }
290 | }
291 |
292 | private static IndexingPolicy getOutputIndexingPolicy(int outputStringPrecision) {
293 | // Setup indexing policy.
294 | IndexingPolicy policy = new IndexingPolicy();
295 |         ArrayList<IncludedPath> includedPaths = new ArrayList<IncludedPath>();
296 |
297 | // All paths.
298 | IncludedPath path = new IncludedPath();
299 | RangeIndex stringIndex = new RangeIndex(DataType.String);
300 | stringIndex.setPrecision(outputStringPrecision);
301 | path.getIndexes().add(stringIndex);
302 | RangeIndex numberIndex = new RangeIndex(DataType.Number);
303 | numberIndex.setPrecision(-1); // Maximum precision
304 | path.getIndexes().add(numberIndex);
305 | path.setPath("/*");
306 | includedPaths.add(path);
307 | policy.setIncludedPaths(includedPaths);
308 | return policy;
309 | }
310 | }
311 |
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBInputFormat.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.IOException;
7 | import java.util.List;
8 |
9 | import org.apache.commons.logging.LogFactory;
10 | import org.apache.hadoop.conf.Configuration;
11 | import org.apache.hadoop.io.LongWritable;
12 | import org.apache.hadoop.mapreduce.InputFormat;
13 | import org.apache.hadoop.mapreduce.InputSplit;
14 | import org.apache.hadoop.mapreduce.JobContext;
15 | import org.apache.hadoop.mapreduce.RecordReader;
16 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
17 | import org.apache.commons.logging.Log;
18 |
19 | /**
20 | * An input format that can read data from Azure DocumentDB. It sends one Document
21 | * at a time to the mapper.
22 | */
23 | public class DocumentDBInputFormat extends InputFormat<LongWritable, DocumentDBWritable> {
24 |
25 | private static final Log LOG = LogFactory.getLog(DocumentDBWritable.class);
26 |
27 | /**
28 | * Creates an instance of DocumentDBRecordReader
29 | */
30 | @Override
31 |     public RecordReader<LongWritable, DocumentDBWritable> createRecordReader(InputSplit split,
32 | TaskAttemptContext context) throws IOException, InterruptedException {
33 | return new DocumentDBRecordReader((DocumentDBInputSplit) split);
34 | }
35 |
36 | /**
37 | * Gets a list of DocumentDBInputSplit and validates all the required properties to read from documentdb.
38 | */
39 | @Override
40 |     public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
41 | Configuration conf = context.getConfiguration();
42 | final String endpoint = ConfigurationUtil.getDBEndpoint(conf);
43 | final String key = ConfigurationUtil.getDBKey(conf);
44 | final String dbName = ConfigurationUtil.getDBName(conf);
45 | final String[] collectionNames = ConfigurationUtil.getInputCollectionNames(conf);
46 | final String query = ConfigurationUtil.getQuery(conf);
47 |
48 | if (endpoint == null)
49 | throw new IOException("DB_HOST must be set for the jobconf");
50 | if (key == null)
51 | throw new IOException("DB_KEY must be set for the jobconf");
52 | if (dbName == null)
53 | throw new IOException("DB_NAME must be set for the jobconf");
54 | if (collectionNames.length < 1)
55 | throw new IOException("INPUT_COLLECTION_NAMES must be set for the jobconf as comma separated names");
56 | return DocumentDBInputSplit.getSplits(conf, endpoint, key, dbName, collectionNames, query);
57 | }
58 | }
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBInputSplit.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.DataInput;
7 | import java.io.DataOutput;
8 | import java.io.IOException;
9 | import java.util.Arrays;
10 | import java.util.Iterator;
11 | import java.util.LinkedList;
12 | import java.util.List;
13 |
14 | import org.apache.commons.logging.Log;
15 | import org.apache.commons.logging.LogFactory;
16 | import org.apache.hadoop.conf.Configuration;
17 | import org.apache.hadoop.io.Text;
18 | import org.apache.hadoop.io.Writable;
19 | import org.apache.hadoop.mapreduce.InputSplit;
20 |
21 | import com.microsoft.azure.documentdb.ConnectionPolicy;
22 | import com.microsoft.azure.documentdb.ConsistencyLevel;
23 | import com.microsoft.azure.documentdb.Database;
24 | import com.microsoft.azure.documentdb.Document;
25 | import com.microsoft.azure.documentdb.DocumentClient;
26 | import com.microsoft.azure.documentdb.DocumentCollection;
27 | import com.microsoft.azure.documentdb.FeedOptions;
28 | import com.microsoft.azure.documentdb.QueryIterable;
29 | import com.microsoft.azure.documentdb.SqlParameter;
30 | import com.microsoft.azure.documentdb.SqlParameterCollection;
31 | import com.microsoft.azure.documentdb.SqlQuerySpec;
32 |
33 | /**
34 | * An input split that represents one collection from documentdb. It reads data one page at a time and
35 | * sends one by one document to the mapper.
36 | * In order to be able to use it, you need to set the required configuration properties for the input split.
37 | */
38 | public class DocumentDBInputSplit extends InputSplit implements Writable, org.apache.hadoop.mapred.InputSplit {
39 |
40 | private static final Log LOG = LogFactory.getLog(DocumentDBWritable.class);
41 | private final int MAX_PAGE_SIZE = 700;
42 | private Text host, key, dbName, collName, query;
43 |     private Iterator<Document> documentIterator;
44 |
45 | public DocumentDBInputSplit() {
46 | this.host = new Text();
47 | this.key = new Text();
48 | this.dbName = new Text();
49 | this.collName = new Text();
50 | this.query = new Text();
51 | }
52 |
53 | public DocumentDBInputSplit(String host, String key, String dbName, String collName, String query) {
54 | this.host = new Text(host);
55 | this.key = new Text(key);
56 | this.dbName = new Text(dbName);
57 | this.collName = new Text(collName);
58 | if (query == null) {
59 | query = "";
60 | }
61 |
62 | this.query = new Text(query);
63 | }
64 |
65 | /**
66 | * Gets the list of DocumentDBInputSplit used.
67 | */
68 |     public static List<InputSplit> getSplits(Configuration conf, String dbHost, String dbKey, String dbName,
69 | String[] collNames, String query) {
70 | int internalNumSplits = collNames.length;
71 |         List<InputSplit> splits = new LinkedList<InputSplit>();
72 | for (int i = 0; i < internalNumSplits; i++) {
73 | splits.add(new DocumentDBInputSplit(dbHost, dbKey, dbName, collNames[i].trim(), query));
74 | }
75 |
76 | return splits;
77 | }
78 |
79 | /**
80 | * @inheritDoc
81 | */
82 | @Override
83 | public long getLength() {
84 | return Integer.MAX_VALUE;
85 | }
86 |
87 | /**
88 | * @inheritDoc
89 | */
90 | @Override
91 | public String[] getLocations() throws IOException {
92 | // Since we're pulling the data from DocumentDB, it's not localized
93 | // to any single node so just return localhost.
94 | return new String[] { "localhost" };
95 | }
96 |
97 | public String getCollectionName() {
98 | return this.collName.toString();
99 | }
100 |
101 | /**
102 |      * {@inheritDoc}
103 | */
104 | public void readFields(DataInput in) throws IOException {
105 | this.host.readFields(in);
106 | this.key.readFields(in);
107 | this.dbName.readFields(in);
108 | this.collName.readFields(in);
109 | this.query.readFields(in);
110 | }
111 |
112 | /**
113 |      * {@inheritDoc}
114 | */
115 | public void write(DataOutput out) throws IOException {
116 | this.host.write(out);
117 | this.key.write(out);
118 | this.dbName.write(out);
119 | this.collName.write(out);
120 | this.query.write(out);
121 | }
122 |
123 | /**
124 | *
125 | * @return an Iterator for documents in the collection wrapped by the split.
126 | * @throws IOException if a read operation fails on documentdb
127 | */
128 |     public Iterator<Document> getDocumentIterator() throws IOException {
129 | if (this.documentIterator != null)
130 | return this.documentIterator;
131 |
132 | Database db;
133 | DocumentCollection coll;
134 | DocumentClient client;
135 | try {
136 | LOG.debug("Connecting to " + this.host + " and reading from collection " + this.collName);
137 | ConnectionPolicy policy = ConnectionPolicy.GetDefault();
138 | policy.setUserAgentSuffix(DocumentDBConnectorUtil.UserAgentSuffix);
139 | client = new DocumentClient(this.host.toString(), this.key.toString(), policy,
140 | ConsistencyLevel.Session);
141 |
142 | db = DocumentDBConnectorUtil.GetDatabase(client, this.dbName.toString());
143 | if (db == null) {
144 | throw new IOException(String.format("Database %s doesn't exist", this.dbName));
145 | }
146 |
147 | coll = DocumentDBConnectorUtil.GetDocumentCollection(client, db.getSelfLink(), this.collName.toString());
148 | if (coll == null) {
149 | throw new IOException(String.format("collection %s doesn't exist", this.collName));
150 | }
151 |
152 |             String query = this.query.toString();
153 |             if (query.isEmpty()) {
154 |                 // No query configured; read the whole collection.
155 |                 query = "select * from root";
156 |             }
157 |
158 |
159 | FeedOptions options = new FeedOptions();
160 | options.setPageSize(MAX_PAGE_SIZE);
161 | this.documentIterator = client.queryDocuments(
162 | coll.getSelfLink(),
163 | query,
164 | options).getQueryIterator();
165 | } catch (Exception e) {
166 | throw new IOException(e);
167 | }
168 |
169 | return this.documentIterator;
170 | }
171 |
172 | public String toString() {
173 | return String.format("DocumentDBSplit(collection=%s)", this.collName);
174 | }
175 |
176 | }
--------------------------------------------------------------------------------
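A quick way to sanity-check the split above outside of a full MapReduce job is to construct it directly and drain its document iterator. The sketch below does exactly that; the account endpoint, key, database and collection names are placeholders, and the null query falls back to "select * from root" as getDocumentIterator shows.

// Minimal sketch, not part of the connector: drives a DocumentDBInputSplit
// directly and prints the id of every document it returns.
import java.util.Iterator;

import com.microsoft.azure.documentdb.Document;
import com.microsoft.azure.documentdb.hadoop.DocumentDBInputSplit;

public class SplitSmokeTest {
    public static void main(String[] args) throws Exception {
        DocumentDBInputSplit split = new DocumentDBInputSplit(
                "https://myaccount.documents.azure.com:443/", // placeholder endpoint
                "<account-key>",                              // placeholder key
                "mydb",                                       // placeholder database
                "mycollection",                               // placeholder collection
                null);                                        // null query -> "select * from root"

        Iterator<Document> docs = split.getDocumentIterator();
        while (docs.hasNext()) {
            System.out.println(docs.next().getId());
        }
    }
}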
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBOutputCommitter.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import org.apache.commons.logging.Log;
7 | import org.apache.commons.logging.LogFactory;
8 | import org.apache.hadoop.mapreduce.JobContext;
9 | import org.apache.hadoop.mapreduce.OutputCommitter;
10 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
11 |
12 | public class DocumentDBOutputCommitter extends OutputCommitter {
13 |
14 |     private static final Log LOG = LogFactory.getLog(DocumentDBOutputCommitter.class);
15 |
16 | /**
17 |      * {@inheritDoc}
18 | */
19 | @Override
20 | public void abortTask(final TaskAttemptContext taskContext) {
21 | LOG.info("Aborting task.");
22 | }
23 |
24 | /**
25 |      * {@inheritDoc}
26 | */
27 | @Override
28 | public void commitTask(final TaskAttemptContext taskContext) {
29 | LOG.info("Committing task.");
30 | }
31 |
32 | /**
33 |      * {@inheritDoc}
34 | */
35 | @Override
36 | public boolean needsTaskCommit(final TaskAttemptContext taskContext) {
37 | return true;
38 | }
39 |
40 | /**
41 |      * {@inheritDoc}
42 | */
43 | @Override
44 | public void setupJob(final JobContext jobContext) {
45 | LOG.info("Setting up job.");
46 | }
47 |
48 | /**
49 |      * {@inheritDoc}
50 | */
51 | @Override
52 | public void setupTask(final TaskAttemptContext taskContext) {
53 | LOG.info("Setting up task.");
54 | }
55 |
56 | }
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBOutputFormat.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.IOException;
7 |
8 | import org.apache.hadoop.conf.Configuration;
9 | import org.apache.hadoop.io.Writable;
10 | import org.apache.hadoop.mapreduce.JobContext;
11 | import org.apache.hadoop.mapreduce.OutputCommitter;
12 | import org.apache.hadoop.mapreduce.OutputFormat;
13 | import org.apache.hadoop.mapreduce.RecordWriter;
14 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
15 |
16 | /**
17 |  * An output format used to write data to DocumentDB.
18 | */
19 | public class DocumentDBOutputFormat extends OutputFormat<Writable, DocumentDBWritable> {
20 |
21 | /**
22 | * Validates the required properties needed to write to documentdb.
23 | */
24 | @Override
25 | public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
26 | Configuration conf = context.getConfiguration();
27 | final String endpoint = ConfigurationUtil.getDBEndpoint(conf);
28 | final String key = ConfigurationUtil.getDBKey(conf);
29 | final String dbName = ConfigurationUtil.getDBName(conf);
30 | final String[] collectionNames = ConfigurationUtil.getOutputCollectionNames(conf);
31 |
32 | if (endpoint == null)
33 | throw new IOException("DB_HOST must be set for the jobconf");
34 | if (key == null)
35 | throw new IOException("DB_KEY must be set for the jobconf");
36 | if (dbName == null)
37 | throw new IOException("DB_NAME must be set for the jobconf");
38 | if (collectionNames == null || collectionNames.length == 0)
39 | throw new IOException("OUTPUT_COLLECTION_NAMES must be set for the jobconf as comma separated names");
40 | }
41 |
42 | /**
43 | * {@inheritDoc}
44 | */
45 | @Override
46 | public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
47 | return new DocumentDBOutputCommitter();
48 | }
49 |
50 | /**
51 | * Creates an instance of DocumentDBRecordWriter.
52 | */
53 | @Override
54 |     public RecordWriter<Writable, DocumentDBWritable> getRecordWriter(TaskAttemptContext context) throws IOException,
55 | InterruptedException {
56 | Configuration conf = context.getConfiguration();
57 | return new DocumentDBRecordWriter(conf, ConfigurationUtil.getDBEndpoint(conf),
58 | ConfigurationUtil.getDBKey(conf), ConfigurationUtil.getDBName(conf),
59 | ConfigurationUtil.getOutputCollectionNames(conf),
60 | ConfigurationUtil.getOutputStringPrecision(conf),
61 | ConfigurationUtil.getUpsert(conf),
62 | ConfigurationUtil.getOutputCollectionsOffer(conf));
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
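Writing to DocumentDB from a job requires the endpoint, key, database name and output collection names validated by checkOutputSpecs above to be present in the job configuration. A minimal driver sketch follows; the configuration key strings are placeholders, since the real names are defined in ConfigurationUtil, which is not shown in this section.

// Minimal driver sketch, assuming hypothetical configuration key strings.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

import com.microsoft.azure.documentdb.hadoop.DocumentDBOutputFormat;
import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;

public class OutputJobSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder keys -- substitute the constants exposed by ConfigurationUtil.
        conf.set("DocumentDB.endpoint", "https://myaccount.documents.azure.com:443/");
        conf.set("DocumentDB.key", "<account-key>");
        conf.set("DocumentDB.db", "mydb");
        conf.set("DocumentDB.outputCollections", "coll1,coll2"); // comma-separated names

        Job job = Job.getInstance(conf, "write-to-documentdb");
        job.setJarByClass(OutputJobSketch.class);
        job.setOutputFormatClass(DocumentDBOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DocumentDBWritable.class);
        // Configure the mapper/reducer and the input side as usual, then submit:
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}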
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBRecordReader.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.IOException;
7 | import java.util.Iterator;
8 |
9 | import org.apache.commons.logging.Log;
10 | import org.apache.commons.logging.LogFactory;
11 | import org.apache.hadoop.io.LongWritable;
12 | import org.apache.hadoop.mapreduce.InputSplit;
13 | import org.apache.hadoop.mapreduce.RecordReader;
14 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
15 |
16 | import com.microsoft.azure.documentdb.Document;
17 |
18 | /**
19 |  * Reads documents from DocumentDB using the document iterator exposed by DocumentDBInputSplit.
20 | */
21 | public class DocumentDBRecordReader extends
22 |         RecordReader<LongWritable, DocumentDBWritable> {
23 |
24 | private DocumentDBInputSplit split;
25 |     private Iterator<Document> documentIterator;
26 |     private long documentsProcessed;
27 |     private DocumentDBWritable current;
28 |     private static final Log LOG = LogFactory.getLog(DocumentDBRecordReader.class);
29 |
30 | public DocumentDBRecordReader(DocumentDBInputSplit split) throws IOException {
31 | this.split = split;
32 | this.current = new DocumentDBWritable();
33 | this.documentIterator = this.split.getDocumentIterator();
34 | }
35 |
36 | public void close() throws IOException {
37 |
38 | }
39 |
40 | public float getProgress() throws IOException {
41 | if(this.documentIterator == null) return 0f;
42 | boolean hasNext = false;
43 | BackoffExponentialRetryPolicy policy = new BackoffExponentialRetryPolicy();
44 | while(policy.shouldRetry()) {
45 | try {
46 | hasNext = this.documentIterator.hasNext();
47 | break;
48 | }
49 | catch(Exception e) {
50 | policy.errorOccured(e);
51 | }
52 | }
53 |
54 | return hasNext ? 0f : 1f;
55 | }
56 |
57 | /**
58 | * {@inheritDoc}
59 | */
60 | @Override
61 | public LongWritable getCurrentKey() throws IOException,
62 | InterruptedException {
63 | return new LongWritable();
64 | }
65 |
66 | /**
67 | * {@inheritDoc}
68 | */
69 | @Override
70 | public DocumentDBWritable getCurrentValue() throws IOException,
71 | InterruptedException {
72 | return current;
73 | }
74 |
75 | /**
76 | * {@inheritDoc}
77 | */
78 | @Override
79 | public void initialize(InputSplit split, TaskAttemptContext context)
80 | throws IOException, InterruptedException {
81 | if(this.split == null) this.split = (DocumentDBInputSplit) split;
82 | }
83 |
84 | /**
85 | * {@inheritDoc}
86 | */
87 | @Override
88 | public boolean nextKeyValue() throws IOException, InterruptedException {
89 |
90 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
91 | while(retryPolicy.shouldRetry()) {
92 | try {
93 | if (this.documentIterator == null || !this.documentIterator.hasNext()) {
94 | LOG.info(String.format("processed %d documents of collection %s", this.documentsProcessed, this.split.getCollectionName()));
95 | return false;
96 | }
97 |
98 | if(documentsProcessed % 100 == 0) {
99 | LOG.info(String.format("processed %d documents of collection %s", this.documentsProcessed, this.split.getCollectionName()));
100 | }
101 |
102 | this.current.setDoc(this.documentIterator.next());
103 | this.documentsProcessed++;
104 | break;
105 | } catch(Exception e) {
106 | retryPolicy.errorOccured(e);
107 | }
108 | }
109 |
110 | return true;
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
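Both getProgress and nextKeyValue wrap calls into the document iterator in a shouldRetry()/errorOccured() loop. The shipped BackoffExponentialRetryPolicy is not reproduced in this section; the sketch below only illustrates the contract that loop relies on, with made-up retry limits and delays.

// Illustrative sketch only: a minimal exponential-backoff policy with the same
// shouldRetry()/errorOccured() surface the reader relies on. The real
// BackoffExponentialRetryPolicy may differ in limits and in which exceptions
// it treats as retriable.
public class SimpleBackoffPolicy {
    private static final int MAX_RETRIES = 5;
    private static final long BASE_DELAY_MS = 100;
    private int attempts = 0;

    public boolean shouldRetry() {
        return attempts <= MAX_RETRIES;
    }

    public void errorOccured(Exception e) {
        attempts++;
        if (attempts > MAX_RETRIES) {
            // Give up once the retry budget is exhausted.
            throw new IllegalStateException("retries exhausted", e);
        }
        try {
            // Double the wait on every failed attempt: 100ms, 200ms, 400ms, ...
            Thread.sleep(BASE_DELAY_MS << (attempts - 1));
        } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
        }
    }
}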
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBRecordWriter.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.IOException;
7 | import java.util.LinkedList;
8 | import java.util.List;
9 |
10 | import org.apache.commons.logging.Log;
11 | import org.apache.commons.logging.LogFactory;
12 | import org.apache.hadoop.conf.Configuration;
13 | import org.apache.hadoop.io.Writable;
14 | import org.apache.hadoop.mapreduce.RecordWriter;
15 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
16 |
17 | import com.microsoft.azure.documentdb.ConnectionPolicy;
18 | import com.microsoft.azure.documentdb.ConsistencyLevel;
19 | import com.microsoft.azure.documentdb.Database;
20 | import com.microsoft.azure.documentdb.Document;
21 | import com.microsoft.azure.documentdb.DocumentClient;
22 | import com.microsoft.azure.documentdb.DocumentCollection;
23 | import com.microsoft.azure.documentdb.StoredProcedure;
24 |
25 | /**
26 | * Writes data to DocumentDB in document batches using a stored procedure.
27 | */
28 | public class DocumentDBRecordWriter extends RecordWriter<Writable, DocumentDBWritable> {
29 |     private static final Log LOG = LogFactory.getLog(DocumentDBRecordWriter.class);
30 |     private static final int MAX_DOC_SIZE = 50;
31 |     private DocumentClient client;
32 |     private DocumentCollection[] collections;
33 |     private StoredProcedure[] sprocs;
34 |     private boolean enableUpsert;
35 |     private int documentsProcessed = 0;
36 |     private List<Document> cachedDocs;
37 | private int currentStoredProcedureIndex = 0;
38 |
39 | public DocumentDBRecordWriter(Configuration conf, String host, String key, String dbName, String[] collNames,
40 | int outputStringPrecision, boolean upsert, String offerType) throws IOException {
41 | try {
42 | ConnectionPolicy policy = ConnectionPolicy.GetDefault();
43 | policy.setUserAgentSuffix(DocumentDBConnectorUtil.UserAgentSuffix);
44 | DocumentClient client = new DocumentClient(host, key, policy,
45 | ConsistencyLevel.Session);
46 |
47 | Database db = DocumentDBConnectorUtil.GetDatabase(client, dbName);
48 | this.collections = new DocumentCollection[collNames.length];
49 | this.sprocs = new StoredProcedure[collNames.length];
50 | for (int i = 0; i < collNames.length; i++) {
51 | this.collections[i] = DocumentDBConnectorUtil.getOrCreateOutputCollection(client, db.getSelfLink(), collNames[i],
52 | outputStringPrecision, offerType);
53 | this.sprocs[i] = DocumentDBConnectorUtil.CreateBulkImportStoredProcedure(client, this.collections[i].getSelfLink());
54 | }
55 |
56 | this.client = client;
57 | this.enableUpsert = upsert;
58 |             this.cachedDocs = new LinkedList<Document>();
59 | } catch (Exception e) {
60 |             LOG.error("Failed to initialize DocumentDBRecordWriter", e);
61 | throw new IOException(e);
62 | }
63 | }
64 |
65 | /**
66 |      * Writes the document directly when the target collection is partitioned; otherwise caches it and flushes the batch once it reaches the maximum batch size.
67 | */
68 | public void write(Writable key, DocumentDBWritable value) throws IOException {
69 | Document doc = value.getDoc();
70 | DocumentCollection targetCollection = this.collections[this.currentStoredProcedureIndex];
71 | currentStoredProcedureIndex = (this.currentStoredProcedureIndex + 1) % this.collections.length;
72 | this.documentsProcessed++;
73 | if(targetCollection.getPartitionKey() != null) {
74 | DocumentDBConnectorUtil.createDocument(this.client, targetCollection.getSelfLink(), doc, this.enableUpsert);
75 | if (documentsProcessed % MAX_DOC_SIZE == 0) {
76 | LOG.info(String.format("wrote %d documents", this.documentsProcessed));
77 | }
78 | } else {
79 | DocumentDBConnectorUtil.addIdIfMissing(doc);
80 | this.cachedDocs.add(doc);
81 | if (this.documentsProcessed % MAX_DOC_SIZE == 0) {
82 | this.writeCurrentBatch();
83 | LOG.info(String.format("wrote %d documents", this.documentsProcessed));
84 | }
85 | }
86 | }
87 |
88 | /**
89 | * Writes the last batch of documents that are being cached.
90 | */
91 | @Override
92 | public void close(TaskAttemptContext context) throws IOException, InterruptedException {
93 | if (this.cachedDocs.size() > 0) {
94 | this.writeCurrentBatch();
95 | }
96 | }
97 |
98 | private void writeCurrentBatch() {
99 | // Writing to output collections is round robin for each batch.
100 | DocumentDBConnectorUtil.executeWriteStoredProcedure(this.client,
101 | this.collections[this.currentStoredProcedureIndex].getSelfLink(),
102 | this.sprocs[this.currentStoredProcedureIndex], this.cachedDocs,
103 | this.enableUpsert);
104 | this.cachedDocs.clear();
105 |
106 | // Do a round robin on the collections and execute the stored procedure once per each.
107 | this.currentStoredProcedureIndex = (this.currentStoredProcedureIndex + 1) % this.sprocs.length;
108 | }
109 |
110 | }
111 |
--------------------------------------------------------------------------------
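A reducer that feeds this writer only has to emit DocumentDBWritable values. The sketch below aggregates an illustrative word count and hands each result to the output format; the "word" and "count" field names and the JSON layout are made up for the example.

// Minimal reducer sketch that emits DocumentDBWritable values for the writer above.
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import com.microsoft.azure.documentdb.Document;
import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;

public class CountToDocumentReducer
        extends Reducer<Text, IntWritable, Text, DocumentDBWritable> {

    private final DocumentDBWritable outValue = new DocumentDBWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        // Build the output document from a JSON string; a real job would escape
        // the key properly before embedding it.
        Document doc = new Document(
                String.format("{\"word\":\"%s\",\"count\":%d}", key.toString(), sum));
        outValue.setDoc(doc);
        context.write(key, outValue);
    }
}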
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBWritable.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.DataInput;
7 | import java.io.DataOutput;
8 | import java.io.IOException;
9 |
10 | import org.apache.hadoop.io.Writable;
11 | import org.apache.hadoop.io.WritableComparable;
12 | import org.apache.hadoop.io.WritableComparator;
13 |
14 | import com.microsoft.azure.documentdb.Document;
15 |
16 | public class DocumentDBWritable implements WritableComparable