├── .classpath
├── .gitignore
├── .project
├── CONTRIBUTING.md
├── License
├── README.md
├── pom.xml
├── samples
│   ├── Hive_Tutorial.hql
│   ├── MapReduceTutorial.java
│   └── Pig_Tutorial.pig
└── src
    ├── BulkImportScript.js
    └── com
        └── microsoft
            └── azure
                └── documentdb
                    ├── hadoop
                    │   ├── BackoffExponentialRetryPolicy.java
                    │   ├── ConfigurationUtil.java
                    │   ├── DocumentDBConnectorUtil.java
                    │   ├── DocumentDBInputFormat.java
                    │   ├── DocumentDBInputSplit.java
                    │   ├── DocumentDBOutputCommitter.java
                    │   ├── DocumentDBOutputFormat.java
                    │   ├── DocumentDBRecordReader.java
                    │   ├── DocumentDBRecordWriter.java
                    │   ├── DocumentDBWritable.java
                    │   └── DocumentDBWritableComparator.java
                    ├── hive
                    │   ├── DocumentDBSerDe.java
                    │   └── DocumentDBStorageHandler.java
                    ├── mapred
                    │   └── hadoop
                    │       ├── DocumentDBInputFormat.java
                    │       ├── DocumentDBOutputFormat.java
                    │       ├── DocumentDBRecordReader.java
                    │       ├── DocumentDBRecordWriter.java
                    │       └── WrapperSplit.java
                    └── pig
                        ├── DocumentDBLoader.java
                        ├── DocumentDBStorage.java
                        └── SchemaHelper.java
/.classpath:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | /build/
3 | target/
4 |
5 | # Mobile Tools for Java (J2ME)
6 | .mtj.tmp/
7 |
8 | # Package Files #
9 | *.jar
10 | *.war
11 | *.ear
12 |
13 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
14 | hs_err_pid*
15 |
16 | # Eclipse
17 | .settings/
18 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 |   <name>DocumentDBHadoop</name>
 4 |   <comment></comment>
 5 |   <projects>
 6 |   </projects>
 7 |   <buildSpec>
 8 |     <buildCommand>
 9 |       <name>org.eclipse.jdt.core.javabuilder</name>
10 |       <arguments>
11 |       </arguments>
12 |     </buildCommand>
13 |     <buildCommand>
14 |       <name>org.eclipse.m2e.core.maven2Builder</name>
15 |       <arguments>
16 |       </arguments>
17 |     </buildCommand>
18 |   </buildSpec>
19 |   <natures>
20 |     <nature>org.eclipse.m2e.core.maven2Nature</nature>
21 |     <nature>org.eclipse.jdt.core.javanature</nature>
22 |   </natures>
23 | </projectDescription>
24 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | Please read the contributing guidelines from the [Azure Team](http://azure.github.io/guidelines.html "Azure Team")
--------------------------------------------------------------------------------
/License:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | Copyright (c) 2014 Microsoft Corporation
3 |
4 | Permission is hereby granted, free of charge, to any person obtaining a copy
5 | of this software and associated documentation files (the "Software"), to deal
6 | in the Software without restriction, including without limitation the rights
7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 | copies of the Software, and to permit persons to whom the Software is
9 | furnished to do so, subject to the following conditions:
10 |
11 | The above copyright notice and this permission notice shall be included in all
12 | copies or substantial portions of the Software.
13 |
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
20 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Microsoft Azure DocumentDB Hadoop Connector
2 |
6 |
7 | This project provides a client library in Java that allows Microsoft Azure DocumentDB to act as an input source or output sink for MapReduce, Hive and Pig jobs.
8 |
9 | ## Download
10 | ### Option 1: Via Github
11 |
12 | To get the binaries of this library as distributed by Microsoft, ready for use within your project, you can use [GitHub releases](https://github.com/Azure/azure-documentdb-hadoop/releases).
13 |
14 | ### Option 2: Source Via Git
15 |
16 | To get the source code of the connector via git just type:
17 |
18 | git clone git://github.com/Azure/azure-documentdb-hadoop.git
19 |
20 | ### Option 3: Source Zip
21 |
22 | To download a copy of the source code, click "Download ZIP" on the right side of the page or click [here](https://github.com/Azure/azure-documentdb-hadoop/archive/master.zip).
23 |
24 | ### Option 4: Via Maven
25 |
26 | To get the binaries of this library as distributed by Microsoft, ready for use within your project, you can use Maven.
27 | ```xml
28 | <dependency>
29 |   <groupId>com.microsoft.azure</groupId>
30 |   <artifactId>azure-documentdb-hadoop</artifactId>
31 |   <version>1.2.0</version>
32 | </dependency>
33 | ```
34 | ### Option 5: HDInsight
35 |
36 | Install the DocumentDB Hadoop Connector onto HDInsight clusters through custom action scripts. Full instructions can be found [here](https://azure.microsoft.com/documentation/articles/documentdb-run-hadoop-with-hdinsight/).
37 |
38 | ## Requirements
39 | * Java Development Kit 7
40 |
41 | ## Supported Versions
42 | * Apache Hadoop & YARN 2.4.0
43 | * Apache Pig 0.12.1
44 | * Apache Hive & HCatalog 0.13.1
45 | * Apache Hadoop & YARN 2.6.0
46 | * Apache Pig 0.14.0
47 | * Apache Hive & HCatalog 0.14.0
48 | * HDI 3.1 ([Getting started with HDInsight](https://azure.microsoft.com/documentation/articles/documentdb-run-hadoop-with-hdinsight/))
49 | * HDI 3.2
50 |
51 | ## Dependencies
52 | * Microsoft Azure DocumentDB Java SDK 1.6.0 (com.microsoft.azure / azure-documentdb / 1.6.0)
53 |
54 | When using Hive:
55 | * OpenX Technologies JsonSerde 1.3.1-SNAPSHOT (org.openx.data / json-serde-parent / 1.3.1-SNAPSHOT)
56 | GitHub repo can be found [here](https://github.com/rcongiu/Hive-JSON-Serde)
57 |
58 | Please download the jars and add them to your build path.
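
If you build with Maven, the same coordinates can be declared directly in your pom; a minimal sketch (versions copied from the list above; the JsonSerde snapshot may need to be built from the linked GitHub repo or resolved from a snapshot repository):

```xml
<dependency>
  <groupId>com.microsoft.azure</groupId>
  <artifactId>azure-documentdb</artifactId>
  <version>1.6.0</version>
</dependency>
<dependency>
  <groupId>org.openx.data</groupId>
  <artifactId>json-serde-parent</artifactId>
  <version>1.3.1-SNAPSHOT</version>
</dependency>
```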
59 |
60 | ## Usage
61 |
62 | To use this client library with Azure DocumentDB, you need to first [create an account](http://azure.microsoft.com/en-us/documentation/articles/documentdb-create-account/).
63 |
64 | ### MapReduce
65 |
66 | ##### Configuring input and output from DocumentDB Example
67 | ```Java
68 | // Import Hadoop Connector Classes
69 | import com.microsoft.azure.documentdb.Document;
70 | import com.microsoft.azure.documentdb.hadoop.ConfigurationUtil;
71 | import com.microsoft.azure.documentdb.hadoop.DocumentDBInputFormat;
72 | import com.microsoft.azure.documentdb.hadoop.DocumentDBOutputFormat;
73 | import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;
74 |
75 | // Set Configurations
76 | Configuration conf = new Configuration();
77 | final String host = "Your DocumentDB Endpoint";
78 | final String key = "Your DocumentDB Primary Key";
79 | final String dbName = "Your DocumentDB Database Name";
80 | final String inputCollNames = "Your DocumentDB Input Collection Name[s]";
81 | final String outputCollNames = "Your DocumentDB Output Collection Name[s]";
82 | final String query = "[Optional] Your DocumentDB Query";
83 | final String outputStringPrecision = "[Optional] Number of bytes to use for String indexes";
84 | final String offerType = "[Optional] Your performance level for Output Collection Creations";
85 | final String upsert = "[Optional] Bool to disable or enable document upsert";
86 |
87 | conf.set(ConfigurationUtil.DB_HOST, host);
88 | conf.set(ConfigurationUtil.DB_KEY, key);
89 | conf.set(ConfigurationUtil.DB_NAME, dbName);
90 | conf.set(ConfigurationUtil.INPUT_COLLECTION_NAMES, inputCollNames);
91 | conf.set(ConfigurationUtil.OUTPUT_COLLECTION_NAMES, outputCollNames);
92 | conf.set(ConfigurationUtil.QUERY, query);
93 | conf.set(ConfigurationUtil.OUTPUT_STRING_PRECISION, outputStringPrecision);
94 | conf.set(ConfigurationUtil.OUTPUT_COLLECTIONS_OFFER, offerType);
95 | conf.set(ConfigurationUtil.UPSERT, upsert);
96 | ```
97 |
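These settings are then handed to a standard Hadoop `Job` together with the connector's input and output formats. A minimal sketch follows (the mapper, reducer, and key/value classes are illustrative and mirror the full tutorial linked below):

```Java
Job job = Job.getInstance(conf, "DocumentDBJob");
job.setJarByClass(YourJob.class);

job.setMapperClass(YourMapper.class);
job.setReducerClass(YourReducer.class);

// Read documents from DocumentDB and write result documents back to DocumentDB
job.setInputFormatClass(DocumentDBInputFormat.class);
job.setOutputFormatClass(DocumentDBOutputFormat.class);

job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(DocumentDBWritable.class);

System.exit(job.waitForCompletion(true) ? 0 : 1);
```
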
98 | Full MapReduce sample can be found [here](https://github.com/Azure/azure-documentdb-hadoop/blob/master/samples/MapReduceTutorial.java).
99 |
100 | ### Hive
101 | ##### Loading data from DocumentDB Example
102 | ```sql
103 | CREATE EXTERNAL TABLE DocumentDB_Hive_Table( COLUMNS )
104 | STORED BY 'com.microsoft.azure.documentdb.hive.DocumentDBStorageHandler'
105 | tblproperties (
106 | 'DocumentDB.endpoint' = 'Your DocumentDB Endpoint',
107 | 'DocumentDB.key' = 'Your DocumentDB Primary Key',
108 | 'DocumentDB.db' = 'Your DocumentDB Database Name',
109 | 'DocumentDB.inputCollections' = 'Your DocumentDB Input Collection Name[s]',
110 | 'DocumentDB.query' = '[Optional] Your DocumentDB Query' );
111 | ```
112 |
113 | ##### Storing data to DocumentDB Example
114 | ```sql
115 | CREATE EXTERNAL TABLE Hive_DocumentDB_Table( COLUMNS )
116 | STORED BY 'com.microsoft.azure.documentdb.hive.DocumentDBStorageHandler'
117 | tblproperties (
118 | 'DocumentDB.endpoint' = 'Your DocumentDB Endpoint',
119 | 'DocumentDB.key' = 'Your DocumentDB Primary Key',
120 | 'DocumentDB.db' = 'Your DocumentDB Database Name',
121 | 'DocumentDB.outputCollections' = 'Your DocumentDB Output Collection Name[s]',
122 | '[Optional] DocumentDB.outputStringPrecision' = '[Optional] Number of bytes to use for String indexes',
123 | '[Optional] DocumentDB.outputCollectionsOffer' = '[Optional] Your performance level for Output Collection Creations',
124 | '[Optional] DocumentDB.upsert' = '[Optional] Bool to disable or enable document upsert');
125 | INSERT INTO TABLE Hive_DocumentDB_Table
126 | ```
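The table is then populated with an ordinary `INSERT ... SELECT`; a minimal sketch (table and column names are illustrative, the full tutorial linked below inserts an aggregation the same way):

```sql
INSERT INTO TABLE Hive_DocumentDB_Table
SELECT Column1, Column2, Column3
FROM Your_Source_Hive_Table;
```
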
127 | Full Hive sample can be found [here](https://github.com/Azure/azure-documentdb-hadoop/blob/master/samples/Hive_Tutorial.hql).
128 |
129 | ### Pig
130 | ##### Loading data from DocumentDB Example
131 | ```pig
132 | LOAD 'Your DocumentDB Endpoint'
133 | USING com.microsoft.azure.documentdb.hadoop.pig.DocumentDBLoader(
134 | 'Your DocumentDB Primary Key',
135 | 'Your DocumentDB Database Name',
136 | 'Your DocumentDB Input Collection Name[s]',
137 | '[Optional] Your DocumentDB SQL Query' );
138 | ```
139 |
140 | ##### Storing data to DocumentDB Example
141 | ```pig
142 | STORE data INTO 'DocumentDB Endpoint'
143 | USING com.microsoft.azure.documentdb.hadoop.pig.DocumentDBStorage(
144 | 'DocumentDB Primary Key',
145 | 'DocumentDB Database Name',
146 | 'DocumentDB Output Collection Name[s]',
147 | '[Optional] Your performance level for Output Collection Creations',
148 | '[Optional] Number of bytes to use for String indexes',
149 | '[Optional] Bool to disable or enable document upsert');
150 | ```
151 | Full Pig sample can be found [here](https://github.com/Azure/azure-documentdb-hadoop/blob/master/samples/Pig_Tutorial.pig).
152 |
153 | ## Remarks
154 | * When outputting to DocumentDB, your output collection will require capacity for an [additional stored procedure](http://azure.microsoft.com/en-us/documentation/articles/documentdb-limits/). The stored procedure will remain in your collection for reuse.
155 | * The Hadoop Connector automatically sets your indexes to range indexes with max precision on strings and numbers. More information can be found [here](http://azure.microsoft.com/en-us/documentation/articles/documentdb-indexing-policies/).
156 | * Connector supports a configurable *upsert* option. *Upsert* defaults to *true*, which overwrites documents in the target collection that have the same *id*.
157 | * Reads and writes to DocumentDB will be counted against your provisioned throughput for each collection.
158 | * Output to DocumentDB collections is written in batches, round-robin across the specified output collections.
159 | * Connector supports a configurable *offer* option. *Offer* lets users set the [performance tier](http://azure.microsoft.com/en-us/documentation/articles/documentdb-performance-levels/) of newly created output collections (this does not apply when outputting to an already existing collection); see the configuration sketch after this list.
160 | * Connector supports output to partitioned collections. Hadoop Connector **will not** automatically create partitioned collections for Hadoop job outputs.
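
The optional *upsert* and *offer* settings above (and the string index precision) map to plain configuration keys in `ConfigurationUtil`; a minimal MapReduce-style sketch with illustrative values:

```Java
// Disable upsert (it defaults to true)
conf.set(ConfigurationUtil.UPSERT, "false");

// Performance tier used only when the connector creates the output collection
conf.set(ConfigurationUtil.OUTPUT_COLLECTIONS_OFFER, "S2");

// Range index precision, in bytes, for strings in created output collections (-1 means maximum)
conf.set(ConfigurationUtil.OUTPUT_STRING_PRECISION, "-1");
```

In Hive and Pig the same options are passed through the `tblproperties` and constructor arguments shown in the Usage section above.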
161 |
162 | ## Need Help?
163 |
164 | Be sure to check out the Microsoft Azure [Developer Forums on MSDN](https://social.msdn.microsoft.com/forums/azure/en-US/home?forum=AzureDocumentDB) or the [Developer Forums on Stack Overflow](http://stackoverflow.com/questions/tagged/azure-documentdb) if you have trouble with the provided code. Also, check out our [tutorial](http://azure.microsoft.com/en-us/documentation/articles/documentdb-run-hadoop-with-hdinsight/) for more information.
165 |
166 | ## Contribute Code or Provide Feedback
167 |
168 | If you would like to become an active contributor to this project please follow the instructions provided in [Azure Projects Contribution Guidelines](http://azure.github.io/guidelines.html).
169 |
170 | If you encounter any bugs with the library please file an issue in the [Issues](https://github.com/Azure/azure-documentdb-hadoop/issues) section of the project.
171 |
172 | ## Learn More
173 | * [DocumentDB with HDInsight Tutorial](https://azure.microsoft.com/documentation/articles/documentdb-run-hadoop-with-hdinsight/)
174 | * [Official Hadoop Documentation](http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/ClusterSetup.html)
175 | * [Azure Developer Center](http://azure.microsoft.com/en-us/develop/java/)
176 | * [Azure DocumentDB Service](http://azure.microsoft.com/en-us/documentation/services/documentdb/)
177 | * [Azure DocumentDB Team Blog](http://blogs.msdn.com/b/documentdb/)
178 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 | com.microsoft.azure
5 | azure-documentdb-hadoop
6 | 1.2.0
7 | jar
8 |
9 | Azure-DocumentDB-Hadoop
10 | Hadoop Connector for Microsoft Azure DocumentDB
11 | http://azure.microsoft.com/en-us/services/documentdb/
12 |
13 |
14 | MIT License
15 | http://www.opensource.org/licenses/mit-license.php
16 |
17 |
18 |
19 |
20 | DocumentDB Team
21 | anhoh@microsoft.com
22 | Microsoft
23 | http://www.microsoft.com/
24 |
25 |
26 |
27 | scm:git:git@github.com:Azure/azure-documentdb-hadoop.git
28 | scm:git:git@github.com:Azure/azure-documentdb-hadoop.git
29 | git@github.com:Azure/azure-documentdb-hadoop.git
30 |
31 |
32 |
33 | UTF-8
34 |
35 |
36 | src
37 |
38 |
39 | maven-compiler-plugin
40 | 3.1
41 |
42 | 1.7
43 | 1.7
44 |
45 |
46 |
47 | maven-assembly-plugin
48 | 2.2
49 |
50 |
51 | bin
52 |
53 |
54 |
55 |
56 | make-assembly
57 | package
58 |
59 | single
60 |
61 |
62 |
63 |
64 |
65 | org.sonatype.plugins
66 | nexus-staging-maven-plugin
67 | 1.6.3
68 | true
69 |
70 | ossrh
71 | https://oss.sonatype.org/
72 | true
73 |
74 |
75 |
76 | org.apache.maven.plugins
77 | maven-source-plugin
78 | 2.2.1
79 |
80 |
81 | attach-sources
82 |
83 | jar-no-fork
84 |
85 |
86 |
87 |
88 |
89 | org.apache.maven.plugins
90 | maven-javadoc-plugin
91 | 2.9.1
92 |
93 |
94 | attach-javadocs
95 |
96 | jar
97 |
98 |
99 |
100 |
101 |
102 | org.apache.maven.plugins
103 | maven-gpg-plugin
104 | 1.5
105 |
106 |
107 | sign-artifacts
108 | verify
109 |
110 | sign
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 | src
119 |
120 | BulkImportScript.js
121 |
122 |
123 |
124 |
125 |
126 |
127 | commons-io
128 | commons-io
129 | 2.4
130 |
131 |
132 | org.apache.hadoop
133 | hadoop-mapreduce
134 | 2.5.1
135 | pom
136 | provided
137 |
138 |
139 | org.apache.commons
140 | commons-lang3
141 | 3.3.2
142 |
143 |
144 | org.apache.hadoop
145 | hadoop-hdfs
146 | 2.5.1
147 | pom
148 |
149 |
150 | org.apache.hadoop
151 | hadoop-common
152 | 2.5.1
153 |
154 |
155 | commons-codec
156 | commons-codec
157 |
158 |
159 |
160 |
161 | org.apache.hadoop
162 | hadoop-mapreduce-client-core
163 | 2.5.1
164 |
165 |
166 | commons-codec
167 | commons-codec
168 |
169 |
170 |
171 |
172 | org.apache.hadoop
173 | hadoop-mapreduce-client-shuffle
174 | 2.5.1
175 |
176 |
177 | commons-codec
178 | commons-codec
179 |
180 |
181 |
182 |
183 | org.apache.hadoop
184 | hadoop-hdfs
185 | 2.5.1
186 |
187 |
188 | commons-codec
189 | commons-codec
190 |
191 |
192 |
193 |
194 | com.google.guava
195 | guava
196 | 15.0
197 | jar
198 |
199 |
200 | org.apache.httpcomponents
201 | httpclient
202 | 4.3.5
203 |
204 |
205 | org.apache.httpcomponents
206 | httpcore
207 | 4.3.2
208 |
209 |
210 | org.apache.hive
211 | hive-serde
212 | 0.13.1
213 | jar
214 |
215 |
216 | org.apache.hive
217 | hive-exec
218 | 0.13.1
219 | jar
220 |
221 |
222 | org.apache.hive
223 | hive-metastore
224 | 0.13.1
225 | jar
226 |
227 |
228 | com.microsoft.azure
229 | azure-documentdb
230 | 1.6.0
231 |
232 |
233 | org.apache.pig
234 | pig
235 | 0.13.0
236 | provided
237 |
238 |
239 | org.openx.data
240 | json-serde-parent
241 | 1.3.1-SNAPSHOT
242 |
243 |
244 |
245 |
246 | ossrh
247 | https://oss.sonatype.org/content/repositories/snapshots
248 |
249 |
250 | ossrh
251 | https://oss.sonatype.org/service/local/staging/deploy/maven2/
252 |
253 |
254 |
255 |
--------------------------------------------------------------------------------
/samples/Hive_Tutorial.hql:
--------------------------------------------------------------------------------
1 | -- Count the total number of document modifications (creations or updates) by the minute using the system generated _ts
2 | -- Read from two input collections and store output in a separate collection
3 |
4 | -- Add dependencies
5 | add JAR ;
6 | add JAR ;
7 | add JAR ;
8 |
9 | -- Create a Hive Table from DocumentDB ids and timestamps
10 | drop table DocumentDB_timestamps;
11 | create external table DocumentDB_timestamps(id string, ts BIGINT)
12 | stored by 'com.microsoft.azure.documentdb.hive.DocumentDBStorageHandler'
13 | tblproperties (
14 | 'DocumentDB.endpoint' = 'DocumentDB Endpoint',
15 | 'DocumentDB.key' = 'DocumentDB Primary Key',
16 | 'DocumentDB.db' = 'DocumentDB Database Name',
17 | 'DocumentDB.inputCollections' = 'DocumentDB Input Collection Name 1,DocumentDB Input Collection Name 2',
18 | 'DocumentDB.query' = 'SELECT r._rid AS id, r._ts AS ts FROM root r' );
19 |
20 | -- Create a Hive Table for outputting to DocumentDB
21 | drop table DocumentDB_analytics;
22 | create external table DocumentDB_analytics(Month INT, Day INT, Hour INT, Minute INT, Total INT)
23 | stored by 'com.microsoft.azure.documentdb.hive.DocumentDBStorageHandler'
24 | tblproperties (
25 | 'DocumentDB.endpoint' = 'DocumentDB Endpoint',
26 | 'DocumentDB.key' = 'DocumentDB Primary Key',
27 | 'DocumentDB.db' = 'DocumentDB Database Name',
28 | 'DocumentDB.outputCollections' = 'DocumentDB Output Collection Name' );
29 |
30 | -- Insert aggregations to Output Hive Table
31 | INSERT INTO table DocumentDB_analytics
32 | SELECT month(from_unixtime(ts)) as Month, day(from_unixtime(ts)) as Day, hour(from_unixtime(ts)) as Hour, minute(from_unixtime(ts)) as Minute, COUNT(*) AS Total
33 | FROM DocumentDB_timestamps
34 | GROUP BY month(from_unixtime(ts)), day(from_unixtime(ts)), hour(from_unixtime(ts)) , minute(from_unixtime(ts));
35 |
--------------------------------------------------------------------------------
/samples/MapReduceTutorial.java:
--------------------------------------------------------------------------------
1 | import java.io.IOException;
2 | import java.util.Iterator;
3 | import java.util.Set;
4 |
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.LongWritable;
7 | import org.apache.hadoop.io.Text;
8 | import org.apache.hadoop.conf.*;
9 | import org.apache.hadoop.mapreduce.*;
10 |
11 | import com.microsoft.azure.documentdb.Document;
12 | import com.microsoft.azure.documentdb.hadoop.ConfigurationUtil;
13 | import com.microsoft.azure.documentdb.hadoop.DocumentDBInputFormat;
14 | import com.microsoft.azure.documentdb.hadoop.DocumentDBOutputFormat;
15 | import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;
16 |
17 | // Tally the number of property occurrences for all Documents in a collection
18 | public class MapReduceTutorial {
19 |     public static class Map extends Mapper<LongWritable, DocumentDBWritable, Text, IntWritable> {
20 | private final static IntWritable one = new IntWritable(1);
21 |
22 | @Override
23 | public void map(LongWritable key, DocumentDBWritable value,
24 | Context context)
25 | throws IOException, InterruptedException {
26 |
27 | // Retrieve all property names from Document
28 |             Set<String> properties = value.getDoc().getHashMap().keySet();
29 |
30 | for(String property : properties) {
31 | context.write(new Text(property), one);
32 | }
33 | }
34 | }
35 |
36 |     public static class Reduce extends Reducer<Text, IntWritable, Text, DocumentDBWritable> {
37 |
38 | @Override
39 |         protected void reduce(Text key, Iterable<IntWritable> values,
40 | Context context) throws IOException, InterruptedException {
41 | int sum = 0;
42 |             Iterator<IntWritable> itr = values.iterator();
43 |
44 | // Count the number of occurrences for a given property
45 | while (itr.hasNext()) {
46 | sum += itr.next().get();
47 | }
48 |
49 | // Write the property and frequency back into DocumentDB as a document
50 | Document d = new Document();
51 | d.set("id", key.toString());
52 | d.set("frequency", sum);
53 | context.write(key, new DocumentDBWritable(d));
54 | }
55 | }
56 |
57 |
58 | public static void main(String[] args) throws Exception {
59 | Configuration conf = new Configuration();
60 | final String host = "DocumentDB Endpoint";
61 | final String key = "DocumentDB Primary Key";
62 | final String dbName = "DocumentDB Database Name";
63 | final String inputCollName = "DocumentDB Input Collection Name";
64 | final String outputCollName = "DocumentDB Output Collection Name";
65 | conf.set(ConfigurationUtil.DB_HOST, host);
66 | conf.set(ConfigurationUtil.DB_KEY, key);
67 | conf.set(ConfigurationUtil.DB_NAME, dbName);
68 | conf.set(ConfigurationUtil.INPUT_COLLECTION_NAMES, inputCollName);
69 | conf.set(ConfigurationUtil.OUTPUT_COLLECTION_NAMES, outputCollName);
70 |
71 | Job job = Job.getInstance(conf, "MapReduceTutorial");
72 | job.setJobName("TallyProperties");
73 |
74 | job.setMapperClass(Map.class);
75 | job.setReducerClass(Reduce.class);
76 |
77 | job.setInputFormatClass(DocumentDBInputFormat.class);
78 | job.setOutputFormatClass(DocumentDBOutputFormat.class);
79 |
80 | job.setMapOutputKeyClass(Text.class);
81 | job.setMapOutputValueClass(IntWritable.class);
82 |
83 | job.setOutputKeyClass(Text.class);
84 | job.setOutputValueClass(DocumentDBWritable.class);
85 |
86 | job.setJarByClass(MapReduceTutorial.class);
87 |
88 | System.exit(job.waitForCompletion(true) ? 0 : 1);
89 | }
90 | }
--------------------------------------------------------------------------------
/samples/Pig_Tutorial.pig:
--------------------------------------------------------------------------------
1 | -- Count the total number of document modifications (creations or updates) by the minute using the system generated _ts
2 | -- Read from two input collections and store output in a separate collection
3 |
4 | -- Add dependencies
5 | REGISTER ;
6 | REGISTER ;
7 |
8 | -- Load DocumentDB ids and timestamps
9 | DocumentDB_timestamps = LOAD 'DocumentDB Endpoint' USING com.microsoft.azure.documentdb.pig.DocumentDBLoader(
10 | 'DocumentDB Primary Key', 'DocumentDB Database Name', 'DocumentDB Input Collection Name 1,DocumentDB Input Collection Name 2',
11 | 'SELECT r._rid AS id, r._ts AS ts FROM root r' );
12 |
13 | timestamp_record = FOREACH DocumentDB_timestamps GENERATE $0#'id' as id:int, ToDate((long)($0#'ts') * 1000) as timestamp:datetime;
14 |
15 | by_minute = GROUP timestamp_record BY (GetYear(timestamp), GetMonth(timestamp), GetDay(timestamp), GetHour(timestamp), GetMinute(timestamp));
16 | by_minute_count = FOREACH by_minute GENERATE FLATTEN(group) as (Year:int, Month:int, Day:int, Hour:int, Minute:int), COUNT(timestamp_record) as Total:int;
17 |
18 | -- Store results back into DocumentDB
19 | STORE by_minute_count INTO 'DocumentDB Endpoint'
20 | USING com.microsoft.azure.documentdb.pig.DocumentDBStorage(
21 | 'DocumentDB Primary Key', 'DocumentDB Database Name', 'DocumentDB Output Collection Name');
22 |
--------------------------------------------------------------------------------
/src/BulkImportScript.js:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 |
5 | function bulkImport(docs, upsert) {
6 | var collection = getContext().getCollection();
7 | var collectionLink = collection.getSelfLink();
8 |
9 | // The count of imported docs, also used as current doc index.
10 | var count = 0;
11 | var errorCodes = { CONFLICT: 409 };
12 |
13 | // Validate input.
14 | if (!docs) throw new Error("The array is undefined or null.");
15 |
16 | var docsLength = docs.length;
17 | if (docsLength == 0) {
18 | getContext().getResponse().setBody(0);
19 | return;
20 | }
21 |
22 | // Call the create API to create a document.
23 | tryCreate(docs[count], callback);
24 |
25 | // Note that there are 2 exit conditions:
26 | // 1) The createDocument request was not accepted.
27 | // In this case the callback will not be called, we just call
28 | // setBody and we are done.
29 | // 2) The callback was called docs.length times.
30 | // In this case all documents were created and we don’t need to call
31 | // tryCreate anymore. Just call setBody and we are done.
32 | function tryCreate(doc, callback) {
33 | var isAccepted = collection.createDocument(collectionLink, doc, { disableAutomaticIdGeneration : true}, callback);
34 |
35 | // If the request was accepted, callback will be called.
36 | // Otherwise report current count back to the client,
37 | // which will call the script again with remaining set of docs.
38 | if (!isAccepted) getContext().getResponse().setBody(count);
39 | }
40 |
41 | // To replace the document, first issue a query to find it and then call replace.
42 | function tryReplace(doc, callback) {
43 | var parsedDoc = JSON.parse(doc);
44 | retrieveDoc(parsedDoc, null, function(retrievedDocs){
45 | var isAccepted = collection.replaceDocument(retrievedDocs[0]._self, parsedDoc, callback);
46 | if (!isAccepted) getContext().getResponse().setBody(count);
47 | });
48 | }
49 |
50 | function retrieveDoc(doc, continuation, callback) {
51 | var query = "select * from root r where r.id = '" + doc.id + "'";
52 | var requestOptions = { continuation : continuation };
53 | var isAccepted = collection.queryDocuments(collectionLink, query, requestOptions, function(err, retrievedDocs, responseOptions) {
54 | if (err) throw err;
55 |
56 | if (retrievedDocs.length > 0) {
57 | callback(retrievedDocs);
58 | } else if (responseOptions.continuation) {
59 | retrieveDoc(doc, responseOptions.continuation, callback);
60 | } else {
61 | throw "Error in retrieving document: " + doc.id;
62 | }
63 | });
64 |
65 | if (!isAccepted) getContext().getResponse().setBody(count);
66 | }
67 |
68 | // This is called when collection.createDocument is done in order to
69 | // process the result.
70 | function callback(err, doc, options) {
71 | if (err) {
72 | // Replace the document if status code is 409 and upsert is enabled
73 | if(upsert && err.number == errorCodes.CONFLICT) {
74 | return tryReplace(docs[count], callback);
75 | } else {
76 | throw err;
77 | }
78 | }
79 |
80 | // One more document has been inserted, increment the count.
81 | count++;
82 | if (count >= docsLength) {
83 | // If we created all documents, we are done. Just set the response.
84 | getContext().getResponse().setBody(count);
85 | } else {
86 | // Create next document.
87 | tryCreate(docs[count], callback);
88 | }
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/BackoffExponentialRetryPolicy.java:
--------------------------------------------------------------------------------
1 | package com.microsoft.azure.documentdb.hadoop;
2 |
3 | import org.apache.commons.logging.Log;
4 | import org.apache.commons.logging.LogFactory;
5 |
6 | import com.microsoft.azure.documentdb.DocumentClientException;
7 |
8 | public class BackoffExponentialRetryPolicy {
9 | private static final int REQUEST_RATE_TOO_LARGE = 429;
10 |
11 | private static final Log LOG = LogFactory.getLog(BackoffExponentialRetryPolicy.class);
12 |
13 | private final long defaultRetryInSeconds = 3;
14 |
15 | private final int retryAdditiveMultiplier = 500;
16 |
17 | private int currentAttemptCount = 0;
18 |
19 | private long retryAfterInMilliseconds = 0;
20 |
21 | public BackoffExponentialRetryPolicy() {
22 |
23 | }
24 |
25 | public int getCurrentAttempt() {
26 | return this.currentAttemptCount;
27 | }
28 |
29 | public boolean shouldRetry(){
30 | return true;
31 | }
32 |
33 | /**
33 |      * Reports that an error has occurred and sleeps if the error is retriable
35 | * @param exception
36 | */
37 | public void errorOccured(Exception exception) {
38 | if (!isExceptionRetriable(exception)) {
39 | throw new IllegalStateException("Exception not retriable: " + exception.getMessage(), exception);
40 | }
41 |
42 | waitUntilNextTry();
43 | }
44 |
45 | private void waitUntilNextTry() {
46 | try {
47 | LOG.info("Trial number: " + this.currentAttemptCount + ", retrying after: " + this.getRetryAfterInMilliseconds());
48 | Thread.sleep(this.getRetryAfterInMilliseconds());
49 | } catch (InterruptedException ignored) {
50 | }
51 | }
52 |
53 | private long getRetryAfterInMilliseconds() {
54 | return this.retryAfterInMilliseconds;
55 | }
56 |
57 | private boolean isExceptionRetriable(Exception exception) {
58 | this.retryAfterInMilliseconds = 0;
59 |
60 | if (this.CheckIfRetryNeeded(exception)) {
61 | this.currentAttemptCount++;
62 | return true;
63 | } else {
64 | return false;
65 | }
66 | }
67 |
68 | private boolean CheckIfRetryNeeded(Exception exception) {
69 | this.retryAfterInMilliseconds = 0;
70 |
71 | if(exception instanceof IllegalStateException) {
72 | exception = (Exception) exception.getCause();
73 | }
74 |
75 | if (exception instanceof DocumentClientException) {
76 | DocumentClientException dce = (DocumentClientException) exception;
77 |
78 | if (dce.getStatusCode() == REQUEST_RATE_TOO_LARGE) {
79 | this.retryAfterInMilliseconds = dce.getRetryAfterInMilliseconds() + this.currentAttemptCount * this.retryAdditiveMultiplier;
80 |
81 | if (this.retryAfterInMilliseconds == 0) {
82 |                     // We should never reach here, as the backend should
83 |                     // return a non-zero retry delay.
84 | this.retryAfterInMilliseconds = this.defaultRetryInSeconds * 1000;
85 | }
86 |
87 | return true;
88 | }
89 | }
90 |
91 | return false;
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/ConfigurationUtil.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.util.Map;
7 | import java.util.Properties;
8 | import java.util.Set;
9 |
10 | import org.apache.commons.lang3.StringUtils;
11 | import org.apache.hadoop.conf.Configuration;
12 |
13 | import com.google.common.collect.ImmutableSet;
14 |
15 | /**
16 | *
17 | * Provides the configuration properties needed for running a hadoop job on documentdb.
18 | *
19 | */
20 | public class ConfigurationUtil {
21 | /**
22 | * The database Id used in the Map Reduce job.
23 | */
24 | public static final String DB_NAME = "DocumentDB.db";
25 |
26 | /**
27 |      * Comma separated input collection Ids used in the map reduce job.
28 | */
29 | public static final String INPUT_COLLECTION_NAMES = "DocumentDB.inputCollections";
30 |
31 | /**
32 |      * Comma separated output collection Ids used in the map reduce job.
33 | */
34 | public static final String OUTPUT_COLLECTION_NAMES = "DocumentDB.outputCollections";
35 |
36 | /**
37 | * The link for the documentdb endpoint
38 | */
39 | public static final String DB_HOST = "DocumentDB.endpoint";
40 |
41 | /**
42 | * The masterkey used for the documentdb account.
43 | */
44 | public static final String DB_KEY = "DocumentDB.key";
45 |
46 | /**
47 | * the documentdb query pushed down to the input collections when reading.
48 | */
49 | public static final String QUERY = "DocumentDB.query";
50 |
51 | /**
52 | * Precision of the output collections' string indexes .
53 | */
54 | public static final String OUTPUT_STRING_PRECISION = "DocumentDB.outputStringPrecision";
55 |
56 | /**
57 | * The offer type of the output collections.
58 | */
59 | public static final String OUTPUT_COLLECTIONS_OFFER = "DocumentDB.outputCollectionsOffer";
60 |
61 | /**
62 | * An upsert option, true by default. This can be disabled by setting it to "false"
63 | */
64 | public static final String UPSERT = "DocumentDB.upsert";
65 |
66 |     public static final int DEFAULT_STRING_PRECISION = -1; // Maximum precision.
67 |
68 | /**
69 | * Gets the DocumentDB.db from the Configuration object.
70 | * @param conf job configuration object
71 | * @return database Id
72 | */
73 | public final static String getDBName(Configuration conf) {
74 | return conf.get(DB_NAME);
75 | }
76 |
77 | /**
78 | * A set of all the configuration properties of the connector.
79 | */
80 |     private static final Set<String> ALL_PROPERTIES = ImmutableSet.of(DB_NAME,
81 | INPUT_COLLECTION_NAMES, OUTPUT_COLLECTION_NAMES, DB_HOST, DB_KEY,
82 | QUERY);
83 |
84 | /**
85 | * Gets the DocumentDB.inputCollections from the Configuration object.
86 | * @param conf job configuration object
87 | * @return Array of collection Ids
88 | */
89 | public final static String[] getInputCollectionNames(Configuration conf) {
90 | String[] collectionNames = conf.get(INPUT_COLLECTION_NAMES).split(",");
91 | return collectionNames;
92 | }
93 |
94 | /**
95 | * Gets the DocumentDB.outputCollections from the Configuration object.
96 | * @param conf job configuration object
97 | * @return Array of collection Ids
98 | */
99 | public final static String[] getOutputCollectionNames(Configuration conf) {
100 | String[] collectionNames = conf.get(OUTPUT_COLLECTION_NAMES).split(",");
101 | return collectionNames;
102 | }
103 |
104 | /**
105 | * Gets the DocumentDB.endpoint from the Configuration object.
106 | * @param conf job configuration object
107 | * @return The documentdb endpoint url
108 | */
109 | public final static String getDBEndpoint(Configuration conf) {
110 | return conf.get(DB_HOST);
111 | }
112 |
113 | /**
114 | * Gets the DocumentDB.key from the Configuration object.
115 | * @param conf job configuration object.
116 | * @return The masterkey for documentdb database account.
117 | */
118 | public final static String getDBKey(Configuration conf) {
119 | return conf.get(DB_KEY);
120 | }
121 |
122 | /**
123 | * Gets the DocumentDB.query from the Configuration object.
124 | * @param conf job configuration object
125 | * @return sql query used to read from input collections.
126 | */
127 | public final static String getQuery(Configuration conf) {
128 | return conf.get(QUERY);
129 | }
130 |
131 | /**
132 | * Gets the DocumentDB.outputStringPrecision from the Configuration object.
133 | * @param conf job configuration object
134 | * @return the string precision of the output collections.
135 | */
136 | public final static int getOutputStringPrecision(Configuration conf) {
137 | String value = conf.get(OUTPUT_STRING_PRECISION);
138 |
139 | Integer stringPrecision = new Integer(DEFAULT_STRING_PRECISION);
140 |
141 | if (StringUtils.isEmpty(value)) {
142 | return stringPrecision;
143 | }
144 |
145 | try {
146 | stringPrecision = Integer.valueOf(value);
147 | } catch (IllegalArgumentException e) {
148 | throw new IllegalArgumentException("outputStringPrecision is expected to be an integer.", e);
149 | }
150 |
151 | if (stringPrecision < -1 || stringPrecision == 0) {
152 | throw new IllegalArgumentException("outputStringPrecision can only be -1 or a positive number.");
153 | }
154 |
155 | return stringPrecision;
156 | }
157 |
158 | /**
159 | * Gets the DocumentDB.upsert from the Configuration object.
160 | * @param conf job configuration object
161 | * @return the value of upsert option
162 | */
163 | public final static boolean getUpsert(Configuration conf) {
164 | String upsert = conf.get(UPSERT);
165 | return (upsert != null && upsert.equalsIgnoreCase("false")) ? false : true;
166 | }
167 |
168 | /**
169 | * Gets the DocumentDB.outputCollectionsOffer from the Configuration object.
170 | * @param conf job configuration object
171 | * @return the value of documentdb.outputCollectionsOffer option
172 | */
173 | public final static String getOutputCollectionsOffer(Configuration conf) {
174 | String outputCollectionsOffer = conf.get(OUTPUT_COLLECTIONS_OFFER);
175 | return (outputCollectionsOffer != null) ? outputCollectionsOffer : "S3";
176 | }
177 |
178 | /**
179 | * Copies the configuration properties for the connector to a map.
180 | * @param from Properties object to copy from.
181 | * @param to Target map to copy properties to.
182 | */
183 |     public static void copyDocumentDBProperties(Properties from, Map<String, String> to) {
184 | for (String key : ALL_PROPERTIES) {
185 | String value = from.getProperty(key);
186 | if (value != null) {
187 | to.put(key, value);
188 | }
189 | }
190 | }
191 | }
192 |
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBConnectorUtil.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 |
5 | package com.microsoft.azure.documentdb.hadoop;
6 |
7 | import java.io.IOException;
8 | import java.io.InputStream;
9 | import java.util.ArrayList;
10 | import java.util.Iterator;
11 | import java.util.List;
12 | import java.util.UUID;
13 |
14 | import org.apache.commons.io.IOUtils;
15 | import org.apache.commons.logging.Log;
16 | import org.apache.commons.logging.LogFactory;
17 |
18 | import com.microsoft.azure.documentdb.DataType;
19 | import com.microsoft.azure.documentdb.Database;
20 | import com.microsoft.azure.documentdb.Document;
21 | import com.microsoft.azure.documentdb.DocumentClient;
22 | import com.microsoft.azure.documentdb.DocumentClientException;
23 | import com.microsoft.azure.documentdb.DocumentCollection;
24 | import com.microsoft.azure.documentdb.IncludedPath;
25 | import com.microsoft.azure.documentdb.IndexingPolicy;
26 | import com.microsoft.azure.documentdb.PartitionKey;
27 | import com.microsoft.azure.documentdb.QueryIterable;
28 | import com.microsoft.azure.documentdb.RangeIndex;
29 | import com.microsoft.azure.documentdb.RequestOptions;
30 | import com.microsoft.azure.documentdb.SqlParameter;
31 | import com.microsoft.azure.documentdb.SqlParameterCollection;
32 | import com.microsoft.azure.documentdb.SqlQuerySpec;
33 | import com.microsoft.azure.documentdb.StoredProcedure;
34 |
35 | /**
36 | *
37 | * Utils used by the connector for DocumentDBCrud
38 | *
39 | */
40 | public class DocumentDBConnectorUtil {
41 | private static final Log LOG = LogFactory.getLog(DocumentDBConnectorUtil.class);
42 | private final static int MAX_SCRIPT_DOCS = 50;
43 | private final static int MAX_SCRIPT_SIZE = 50000;
44 | private final static String BULK_IMPORT_ID = "HadoopBulkImportSprocV1";
45 | private final static String BULK_IMPORT_PATH = "/BulkImportScript.js";
46 | private final static int CONFLICT_ERROR = 409;
47 |
48 | public static String UserAgentSuffix = " HadoopConnector/1.1.0";
49 |
50 | /**
51 | * Creates a document and replaces it if it already exists when isUpsert is true. The function also retries on throttling
52 | * @param client The DocumentClient instance.
53 | * @param collectionSelfLink The self link of the passed collection.
54 | * @param isUpsert Specify if the document should be upserted.
55 | */
56 | public static Document createDocument(DocumentClient client, String collectionSelfLink, Document doc, boolean isUpsert) {
57 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
58 | while(retryPolicy.shouldRetry()){
59 | try {
60 | if(isUpsert) {
61 | return client.upsertDocument(collectionSelfLink, doc, null, false).getResource();
62 | } else {
63 | return client.createDocument(collectionSelfLink, doc, null, false).getResource();
64 | }
65 | } catch(DocumentClientException e){
66 | retryPolicy.errorOccured(e);
67 | }
68 | }
69 |
70 | return null;
71 | }
72 |
73 | /**
74 |      * Gets an output collection with the passed id (if the collection already exists, return it; otherwise create a new one).
75 | * @param client The DocumentClient instance.
76 | * @param databaseSelfLink the self link of the passed database.
77 | * @param collectionId The id of the output collection.
78 | * @param outputStringPrecision An optional parameter that contains the default string precision to be used to create an indexing policy.
79 | * @param offerType An optional parameter that contains the offer type of the output collection.
80 | */
81 | public static DocumentCollection getOrCreateOutputCollection(DocumentClient client, String databaseSelfLink,
82 | String collectionId, int outputStringPrecision, String offerType) throws DocumentClientException {
83 |
84 | DocumentCollection outputCollection = DocumentDBConnectorUtil.GetDocumentCollection(client, databaseSelfLink, collectionId);
85 |
86 | if (outputCollection == null) {
87 | DocumentCollection outputColl = new DocumentCollection("{ 'id':'" + collectionId + "' }");
88 |
89 | outputColl.setIndexingPolicy(DocumentDBConnectorUtil.getOutputIndexingPolicy(outputStringPrecision));
90 |
91 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
92 |
93 | while(retryPolicy.shouldRetry()) {
94 | try {
95 | RequestOptions options = new RequestOptions();
96 | options.setOfferType(offerType);
97 | outputCollection = client.createCollection(databaseSelfLink, outputColl, options).getResource();
98 | break;
99 | } catch (Exception e) {
100 | retryPolicy.errorOccured(e);
101 | }
102 | }
103 | }
104 |
105 | return outputCollection;
106 | }
107 |
108 | /**
109 |      * Gets the collection with the passed id if it exists; otherwise returns null.
110 | * @param client The DocumentClient instance.
111 | * @param databaseSelfLink the self link of the passed database.
112 | * @param collectionId The id of the output collection.
113 | */
114 | public static DocumentCollection GetDocumentCollection(DocumentClient client, String databaseSelfLink, String collectionId) {
115 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
116 |         QueryIterable<DocumentCollection> collIterable = client.queryCollections(
117 | databaseSelfLink,
118 | new SqlQuerySpec("SELECT * FROM root r WHERE r.id=@id",
119 | new SqlParameterCollection(new SqlParameter("@id", collectionId))),
120 | null).getQueryIterable();
121 |
122 |         List<DocumentCollection> collections = null;
123 | while(retryPolicy.shouldRetry()){
124 | try {
125 | collections = collIterable.toList();
126 | break;
127 | } catch (Exception e) {
128 | retryPolicy.errorOccured(e);
129 | }
130 | }
131 |
132 | if(collections.size() == 0) {
133 | return null;
134 | }
135 |
136 | return collections.get(0);
137 | }
138 |
139 | public static Database GetDatabase(DocumentClient client, String databaseId) {
140 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
141 |         QueryIterable<Database> dbIterable = client.queryDatabases(
142 | new SqlQuerySpec("SELECT * FROM root r WHERE r.id=@id",
143 | new SqlParameterCollection(new SqlParameter("@id", databaseId))),
144 | null).getQueryIterable();
145 |
146 |         List<Database> databases = null;
147 | while(retryPolicy.shouldRetry()){
148 | try {
149 | databases = dbIterable.toList();
150 | break;
151 | } catch (Exception e) {
152 | retryPolicy.errorOccured(e);
153 | }
154 | }
155 |
156 | if(databases.size() == 0) {
157 | return null;
158 | }
159 |
160 | return databases.get(0);
161 | }
162 |
163 | /**
164 |      * Gets the bulk import stored procedure that will be used for writing documents (if the sproc already exists, use it; otherwise create a new one).
165 | * @param client the DocumentClient instance for DocumentDB.
166 | * @param collectionLink the self-link of the collection to write to.
167 | * @return StoredProcedure instance that will be used for writing
168 | */
169 | public static StoredProcedure CreateBulkImportStoredProcedure(DocumentClient client, String collectionLink)
170 | throws DocumentClientException {
171 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
172 |         List<StoredProcedure> sprocs = null;
173 |
174 | while(retryPolicy.shouldRetry()){
175 | try {
176 | sprocs = client.queryStoredProcedures(collectionLink,
177 | new SqlQuerySpec("SELECT * FROM root r WHERE r.id=@id",
178 | new SqlParameterCollection(new SqlParameter("@id", BULK_IMPORT_ID))),
179 | null).getQueryIterable().toList();
180 | break;
181 | } catch (Exception e) {
182 | retryPolicy.errorOccured(e);
183 | }
184 | }
185 |
186 | if(sprocs.size() > 0) {
187 | return sprocs.get(0);
188 | }
189 |
190 | StoredProcedure sproc = new StoredProcedure();
191 | sproc.setId(BULK_IMPORT_ID);
192 | String sprocBody = getBulkImportBody(client);
193 | sproc.setBody(sprocBody);
194 | return client.createStoredProcedure(collectionLink, sproc, null).getResource();
195 | }
196 |
197 | /**
198 | * Executes the bulk import stored procedure for a list of documents.
199 | * The execution takes into consideration throttling and blacklisting of the stored procedure.
200 | * @param client The DocumentClient instance for DocumentDB
201 | * @param collectionSelfLink the self-link for the collection to write to.
202 | * @param sproc The stored procedure to execute
203 | * @param allDocs The list of documents to write
204 | * @param upsert Specifies whether to replace the document if exists or not. By default it's true.
205 | */
206 | public static void executeWriteStoredProcedure(final DocumentClient client, String collectionSelfLink, final StoredProcedure sproc,
207 |             List<Document> allDocs, final boolean upsert) {
208 |
209 | int currentCount = 0;
210 |
211 | while (currentCount < allDocs.size())
212 | {
213 | String []jsonArrayString = CreateBulkInsertScriptArguments(allDocs, currentCount, MAX_SCRIPT_SIZE);
214 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
215 | String response = null;
216 | while(retryPolicy.shouldRetry()){
217 | try {
218 | response = client.executeStoredProcedure(sproc.getSelfLink(), new Object[] { jsonArrayString, upsert })
219 | .getResponseAsString();
220 | break;
221 | } catch(Exception e){
222 | retryPolicy.errorOccured(e);
223 | }
224 | }
225 |
226 | int createdCount = Integer.parseInt(response);
227 | currentCount += createdCount;
228 | }
229 | }
230 |
231 | /**
232 | *
233 | * @param docs The list of documents to be created
234 | * @param currentIndex the current index in the list of docs to start with.
236 | * @param maxScriptSize the max size of the sproc that is used to avoid exceeding the max request size.
237 | * @return a string array for all documents to be created
238 | */
239 |     private static String[] CreateBulkInsertScriptArguments(List<Document> docs, int currentIndex, int maxScriptSize)
240 | {
241 | if (currentIndex >= docs.size()) return new String[]{};
242 |
243 |         ArrayList<String> jsonDocumentList = new ArrayList<String>();
244 | String stringifiedDoc;
245 | int scriptCapacityRemaining = maxScriptSize;
246 |
247 | int i = 0;
248 | while (scriptCapacityRemaining > 0 && i < MAX_SCRIPT_DOCS && currentIndex + i < docs.size())
249 | {
250 | stringifiedDoc = docs.get(currentIndex + i).toString();
251 | jsonDocumentList.add(stringifiedDoc);
252 | scriptCapacityRemaining-= stringifiedDoc.length();
253 | i++;
254 | }
255 |
256 | String[] jsonDocumentArray = new String[jsonDocumentList.size()];
257 | jsonDocumentList.toArray(jsonDocumentArray);
258 | return jsonDocumentArray;
259 | }
260 |
261 | /**
262 | * Reads the bulk import script body from the file.
263 | * @param client the DocumentClient instance.
264 | * @return a string that contains the stored procedure body.
265 | */
266 | private static String getBulkImportBody(DocumentClient client) {
267 | try {
268 | InputStream stream = DocumentDBConnectorUtil.class.getResourceAsStream(BULK_IMPORT_PATH);
269 |             List<String> scriptLines = IOUtils.readLines(stream);
270 | StringBuilder scriptBody = new StringBuilder();
271 |             for (Iterator<String> iterator = scriptLines.iterator(); iterator.hasNext();) {
272 |                 String line = iterator.next();
273 | scriptBody.append(line + "\n");
274 | }
275 |
276 | return scriptBody.toString();
277 | } catch (IOException e) {
278 | throw new IllegalStateException(e);
279 | }
280 | }
281 |
282 | /**
283 | * If no id is provided, replace it with an auto generated guid id.
284 | * @param doc The document to be checked for id.
285 | */
286 | public static void addIdIfMissing(Document doc) {
287 | if (doc.getId() == null) {
288 | doc.setId(UUID.randomUUID().toString());
289 | }
290 | }
291 |
292 | private static IndexingPolicy getOutputIndexingPolicy(int outputStringPrecision) {
293 | // Setup indexing policy.
294 | IndexingPolicy policy = new IndexingPolicy();
295 |         ArrayList<IncludedPath> includedPaths = new ArrayList<IncludedPath>();
296 |
297 | // All paths.
298 | IncludedPath path = new IncludedPath();
299 | RangeIndex stringIndex = new RangeIndex(DataType.String);
300 | stringIndex.setPrecision(outputStringPrecision);
301 | path.getIndexes().add(stringIndex);
302 | RangeIndex numberIndex = new RangeIndex(DataType.Number);
303 | numberIndex.setPrecision(-1); // Maximum precision
304 | path.getIndexes().add(numberIndex);
305 | path.setPath("/*");
306 | includedPaths.add(path);
307 | policy.setIncludedPaths(includedPaths);
308 | return policy;
309 | }
310 | }
311 |
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBInputFormat.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.IOException;
7 | import java.util.List;
8 |
9 | import org.apache.commons.logging.LogFactory;
10 | import org.apache.hadoop.conf.Configuration;
11 | import org.apache.hadoop.io.LongWritable;
12 | import org.apache.hadoop.mapreduce.InputFormat;
13 | import org.apache.hadoop.mapreduce.InputSplit;
14 | import org.apache.hadoop.mapreduce.JobContext;
15 | import org.apache.hadoop.mapreduce.RecordReader;
16 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
17 | import org.apache.commons.logging.Log;
18 |
19 | /**
20 | * An input format that can read data from Azure DocumentDB. It sends one Document
21 | * at a time to the mapper.
22 | */
23 | public class DocumentDBInputFormat extends InputFormat<LongWritable, DocumentDBWritable> {
24 |
25 | private static final Log LOG = LogFactory.getLog(DocumentDBWritable.class);
26 |
27 | /**
28 | * Creates an instance of DocumentDBRecordReader
29 | */
30 | @Override
31 |     public RecordReader<LongWritable, DocumentDBWritable> createRecordReader(InputSplit split,
32 | TaskAttemptContext context) throws IOException, InterruptedException {
33 | return new DocumentDBRecordReader((DocumentDBInputSplit) split);
34 | }
35 |
36 | /**
37 | * Gets a list of DocumentDBInputSplit and validates all the required properties to read from documentdb.
38 | */
39 | @Override
40 |     public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
41 | Configuration conf = context.getConfiguration();
42 | final String endpoint = ConfigurationUtil.getDBEndpoint(conf);
43 | final String key = ConfigurationUtil.getDBKey(conf);
44 | final String dbName = ConfigurationUtil.getDBName(conf);
45 | final String[] collectionNames = ConfigurationUtil.getInputCollectionNames(conf);
46 | final String query = ConfigurationUtil.getQuery(conf);
47 |
48 | if (endpoint == null)
49 | throw new IOException("DB_HOST must be set for the jobconf");
50 | if (key == null)
51 | throw new IOException("DB_KEY must be set for the jobconf");
52 | if (dbName == null)
53 | throw new IOException("DB_NAME must be set for the jobconf");
54 | if (collectionNames.length < 1)
55 | throw new IOException("INPUT_COLLECTION_NAMES must be set for the jobconf as comma separated names");
56 | return DocumentDBInputSplit.getSplits(conf, endpoint, key, dbName, collectionNames, query);
57 | }
58 | }
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBInputSplit.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.DataInput;
7 | import java.io.DataOutput;
8 | import java.io.IOException;
9 | import java.util.Arrays;
10 | import java.util.Iterator;
11 | import java.util.LinkedList;
12 | import java.util.List;
13 |
14 | import org.apache.commons.logging.Log;
15 | import org.apache.commons.logging.LogFactory;
16 | import org.apache.hadoop.conf.Configuration;
17 | import org.apache.hadoop.io.Text;
18 | import org.apache.hadoop.io.Writable;
19 | import org.apache.hadoop.mapreduce.InputSplit;
20 |
21 | import com.microsoft.azure.documentdb.ConnectionPolicy;
22 | import com.microsoft.azure.documentdb.ConsistencyLevel;
23 | import com.microsoft.azure.documentdb.Database;
24 | import com.microsoft.azure.documentdb.Document;
25 | import com.microsoft.azure.documentdb.DocumentClient;
26 | import com.microsoft.azure.documentdb.DocumentCollection;
27 | import com.microsoft.azure.documentdb.FeedOptions;
28 | import com.microsoft.azure.documentdb.QueryIterable;
29 | import com.microsoft.azure.documentdb.SqlParameter;
30 | import com.microsoft.azure.documentdb.SqlParameterCollection;
31 | import com.microsoft.azure.documentdb.SqlQuerySpec;
32 |
33 | /**
34 | * An input split that represents one collection from documentdb. It reads data one page at a time and
35 | * sends one by one document to the mapper.
36 | * In order to be able to use it, you need to set the required configuration properties for the input split.
37 | */
38 | public class DocumentDBInputSplit extends InputSplit implements Writable, org.apache.hadoop.mapred.InputSplit {
39 |
40 | private static final Log LOG = LogFactory.getLog(DocumentDBWritable.class);
41 | private final int MAX_PAGE_SIZE = 700;
42 | private Text host, key, dbName, collName, query;
43 |     private Iterator<Document> documentIterator;
44 |
45 | public DocumentDBInputSplit() {
46 | this.host = new Text();
47 | this.key = new Text();
48 | this.dbName = new Text();
49 | this.collName = new Text();
50 | this.query = new Text();
51 | }
52 |
53 | public DocumentDBInputSplit(String host, String key, String dbName, String collName, String query) {
54 | this.host = new Text(host);
55 | this.key = new Text(key);
56 | this.dbName = new Text(dbName);
57 | this.collName = new Text(collName);
58 | if (query == null) {
59 | query = "";
60 | }
61 |
62 | this.query = new Text(query);
63 | }
64 |
65 | /**
66 | * Gets the list of DocumentDBInputSplit used.
67 | */
68 |     public static List<InputSplit> getSplits(Configuration conf, String dbHost, String dbKey, String dbName,
69 | String[] collNames, String query) {
70 | int internalNumSplits = collNames.length;
71 |         List<InputSplit> splits = new LinkedList<InputSplit>();
72 | for (int i = 0; i < internalNumSplits; i++) {
73 | splits.add(new DocumentDBInputSplit(dbHost, dbKey, dbName, collNames[i].trim(), query));
74 | }
75 |
76 | return splits;
77 | }
78 |
79 | /**
80 | * @inheritDoc
81 | */
82 | @Override
83 | public long getLength() {
84 | return Integer.MAX_VALUE;
85 | }
86 |
87 | /**
88 | * @inheritDoc
89 | */
90 | @Override
91 | public String[] getLocations() throws IOException {
92 | // Since we're pulling the data from DocumentDB, it's not localized
93 | // to any single node so just return localhost.
94 | return new String[] { "localhost" };
95 | }
96 |
97 | public String getCollectionName() {
98 | return this.collName.toString();
99 | }
100 |
101 | /**
102 |      * {@inheritDoc}
103 | */
104 | public void readFields(DataInput in) throws IOException {
105 | this.host.readFields(in);
106 | this.key.readFields(in);
107 | this.dbName.readFields(in);
108 | this.collName.readFields(in);
109 | this.query.readFields(in);
110 | }
111 |
112 | /**
113 |      * {@inheritDoc}
114 | */
115 | public void write(DataOutput out) throws IOException {
116 | this.host.write(out);
117 | this.key.write(out);
118 | this.dbName.write(out);
119 | this.collName.write(out);
120 | this.query.write(out);
121 | }
122 |
123 | /**
124 | *
125 | * @return an Iterator for documents in the collection wrapped by the split.
126 | * @throws IOException if a read operation fails on documentdb
127 | */
128 |     public Iterator<Document> getDocumentIterator() throws IOException {
129 | if (this.documentIterator != null)
130 | return this.documentIterator;
131 |
132 | Database db;
133 | DocumentCollection coll;
134 | DocumentClient client;
135 | try {
136 | LOG.debug("Connecting to " + this.host + " and reading from collection " + this.collName);
137 | ConnectionPolicy policy = ConnectionPolicy.GetDefault();
138 | policy.setUserAgentSuffix(DocumentDBConnectorUtil.UserAgentSuffix);
139 | client = new DocumentClient(this.host.toString(), this.key.toString(), policy,
140 | ConsistencyLevel.Session);
141 |
142 | db = DocumentDBConnectorUtil.GetDatabase(client, this.dbName.toString());
143 | if (db == null) {
144 | throw new IOException(String.format("Database %s doesn't exist", this.dbName));
145 | }
146 |
147 | coll = DocumentDBConnectorUtil.GetDocumentCollection(client, db.getSelfLink(), this.collName.toString());
148 | if (coll == null) {
149 | throw new IOException(String.format("collection %s doesn't exist", this.collName));
150 | }
151 |
152 |             String query = this.query.toString();
153 |             if (query.isEmpty()) {
154 |                 // No query configured; read the whole collection.
155 |                 query = "select * from root";
156 |             }
157 |
158 |
159 | FeedOptions options = new FeedOptions();
160 | options.setPageSize(MAX_PAGE_SIZE);
161 | this.documentIterator = client.queryDocuments(
162 | coll.getSelfLink(),
163 | query,
164 | options).getQueryIterator();
165 | } catch (Exception e) {
166 | throw new IOException(e);
167 | }
168 |
169 | return this.documentIterator;
170 | }
171 |
172 | public String toString() {
173 | return String.format("DocumentDBSplit(collection=%s)", this.collName);
174 | }
175 |
176 | }
--------------------------------------------------------------------------------
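A quick way to sanity-check the split above outside of a full MapReduce job is to construct it directly and drain its document iterator. The sketch below does exactly that; the account endpoint, key, database and collection names are placeholders, and the null query falls back to "select * from root" as getDocumentIterator shows.

// Minimal sketch, not part of the connector: drives a DocumentDBInputSplit
// directly and prints the id of every document it returns.
import java.util.Iterator;

import com.microsoft.azure.documentdb.Document;
import com.microsoft.azure.documentdb.hadoop.DocumentDBInputSplit;

public class SplitSmokeTest {
    public static void main(String[] args) throws Exception {
        DocumentDBInputSplit split = new DocumentDBInputSplit(
                "https://myaccount.documents.azure.com:443/", // placeholder endpoint
                "<account-key>",                              // placeholder key
                "mydb",                                       // placeholder database
                "mycollection",                               // placeholder collection
                null);                                        // null query -> "select * from root"

        Iterator<Document> docs = split.getDocumentIterator();
        while (docs.hasNext()) {
            System.out.println(docs.next().getId());
        }
    }
}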
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBOutputCommitter.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import org.apache.commons.logging.Log;
7 | import org.apache.commons.logging.LogFactory;
8 | import org.apache.hadoop.mapreduce.JobContext;
9 | import org.apache.hadoop.mapreduce.OutputCommitter;
10 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
11 |
12 | public class DocumentDBOutputCommitter extends OutputCommitter {
13 |
14 |     private static final Log LOG = LogFactory.getLog(DocumentDBOutputCommitter.class);
15 |
16 | /**
17 |      * {@inheritDoc}
18 | */
19 | @Override
20 | public void abortTask(final TaskAttemptContext taskContext) {
21 | LOG.info("Aborting task.");
22 | }
23 |
24 | /**
25 |      * {@inheritDoc}
26 | */
27 | @Override
28 | public void commitTask(final TaskAttemptContext taskContext) {
29 | LOG.info("Committing task.");
30 | }
31 |
32 | /**
33 |      * {@inheritDoc}
34 | */
35 | @Override
36 | public boolean needsTaskCommit(final TaskAttemptContext taskContext) {
37 | return true;
38 | }
39 |
40 | /**
41 |      * {@inheritDoc}
42 | */
43 | @Override
44 | public void setupJob(final JobContext jobContext) {
45 | LOG.info("Setting up job.");
46 | }
47 |
48 | /**
49 |      * {@inheritDoc}
50 | */
51 | @Override
52 | public void setupTask(final TaskAttemptContext taskContext) {
53 | LOG.info("Setting up task.");
54 | }
55 |
56 | }
--------------------------------------------------------------------------------
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBOutputFormat.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.IOException;
7 |
8 | import org.apache.hadoop.conf.Configuration;
9 | import org.apache.hadoop.io.Writable;
10 | import org.apache.hadoop.mapreduce.JobContext;
11 | import org.apache.hadoop.mapreduce.OutputCommitter;
12 | import org.apache.hadoop.mapreduce.OutputFormat;
13 | import org.apache.hadoop.mapreduce.RecordWriter;
14 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
15 |
16 | /**
17 |  * An output format used to write data to DocumentDB.
18 | */
19 | public class DocumentDBOutputFormat extends OutputFormat<Writable, DocumentDBWritable> {
20 |
21 | /**
22 | * Validates the required properties needed to write to documentdb.
23 | */
24 | @Override
25 | public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
26 | Configuration conf = context.getConfiguration();
27 | final String endpoint = ConfigurationUtil.getDBEndpoint(conf);
28 | final String key = ConfigurationUtil.getDBKey(conf);
29 | final String dbName = ConfigurationUtil.getDBName(conf);
30 | final String[] collectionNames = ConfigurationUtil.getOutputCollectionNames(conf);
31 |
32 | if (endpoint == null)
33 | throw new IOException("DB_HOST must be set for the jobconf");
34 | if (key == null)
35 | throw new IOException("DB_KEY must be set for the jobconf");
36 | if (dbName == null)
37 | throw new IOException("DB_NAME must be set for the jobconf");
38 | if (collectionNames == null || collectionNames.length == 0)
39 | throw new IOException("OUTPUT_COLLECTION_NAMES must be set for the jobconf as comma separated names");
40 | }
41 |
42 | /**
43 | * {@inheritDoc}
44 | */
45 | @Override
46 | public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
47 | return new DocumentDBOutputCommitter();
48 | }
49 |
50 | /**
51 | * Creates an instance of DocumentDBRecordWriter.
52 | */
53 | @Override
54 |     public RecordWriter<Writable, DocumentDBWritable> getRecordWriter(TaskAttemptContext context) throws IOException,
55 | InterruptedException {
56 | Configuration conf = context.getConfiguration();
57 | return new DocumentDBRecordWriter(conf, ConfigurationUtil.getDBEndpoint(conf),
58 | ConfigurationUtil.getDBKey(conf), ConfigurationUtil.getDBName(conf),
59 | ConfigurationUtil.getOutputCollectionNames(conf),
60 | ConfigurationUtil.getOutputStringPrecision(conf),
61 | ConfigurationUtil.getUpsert(conf),
62 | ConfigurationUtil.getOutputCollectionsOffer(conf));
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
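Writing to DocumentDB from a job requires the endpoint, key, database name and output collection names validated by checkOutputSpecs above to be present in the job configuration. A minimal driver sketch follows; the configuration key strings are placeholders, since the real names are defined in ConfigurationUtil, which is not shown in this section.

// Minimal driver sketch, assuming hypothetical configuration key strings.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

import com.microsoft.azure.documentdb.hadoop.DocumentDBOutputFormat;
import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;

public class OutputJobSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder keys -- substitute the constants exposed by ConfigurationUtil.
        conf.set("DocumentDB.endpoint", "https://myaccount.documents.azure.com:443/");
        conf.set("DocumentDB.key", "<account-key>");
        conf.set("DocumentDB.db", "mydb");
        conf.set("DocumentDB.outputCollections", "coll1,coll2"); // comma-separated names

        Job job = Job.getInstance(conf, "write-to-documentdb");
        job.setJarByClass(OutputJobSketch.class);
        job.setOutputFormatClass(DocumentDBOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DocumentDBWritable.class);
        // Configure the mapper/reducer and the input side as usual, then submit:
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}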
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBRecordReader.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.IOException;
7 | import java.util.Iterator;
8 |
9 | import org.apache.commons.logging.Log;
10 | import org.apache.commons.logging.LogFactory;
11 | import org.apache.hadoop.io.LongWritable;
12 | import org.apache.hadoop.mapreduce.InputSplit;
13 | import org.apache.hadoop.mapreduce.RecordReader;
14 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
15 |
16 | import com.microsoft.azure.documentdb.Document;
17 |
18 | /**
19 |  * Reads documents from DocumentDB using the document iterator exposed by DocumentDBInputSplit.
20 | */
21 | public class DocumentDBRecordReader extends
22 |         RecordReader<LongWritable, DocumentDBWritable> {
23 |
24 | private DocumentDBInputSplit split;
25 |     private Iterator<Document> documentIterator;
26 |     private long documentsProcessed;
27 |     private DocumentDBWritable current;
28 |     private static final Log LOG = LogFactory.getLog(DocumentDBRecordReader.class);
29 |
30 | public DocumentDBRecordReader(DocumentDBInputSplit split) throws IOException {
31 | this.split = split;
32 | this.current = new DocumentDBWritable();
33 | this.documentIterator = this.split.getDocumentIterator();
34 | }
35 |
36 | public void close() throws IOException {
37 |
38 | }
39 |
40 | public float getProgress() throws IOException {
41 | if(this.documentIterator == null) return 0f;
42 | boolean hasNext = false;
43 | BackoffExponentialRetryPolicy policy = new BackoffExponentialRetryPolicy();
44 | while(policy.shouldRetry()) {
45 | try {
46 | hasNext = this.documentIterator.hasNext();
47 | break;
48 | }
49 | catch(Exception e) {
50 | policy.errorOccured(e);
51 | }
52 | }
53 |
54 | return hasNext ? 0f : 1f;
55 | }
56 |
57 | /**
58 | * {@inheritDoc}
59 | */
60 | @Override
61 | public LongWritable getCurrentKey() throws IOException,
62 | InterruptedException {
63 | return new LongWritable();
64 | }
65 |
66 | /**
67 | * {@inheritDoc}
68 | */
69 | @Override
70 | public DocumentDBWritable getCurrentValue() throws IOException,
71 | InterruptedException {
72 | return current;
73 | }
74 |
75 | /**
76 | * {@inheritDoc}
77 | */
78 | @Override
79 | public void initialize(InputSplit split, TaskAttemptContext context)
80 | throws IOException, InterruptedException {
81 | if(this.split == null) this.split = (DocumentDBInputSplit) split;
82 | }
83 |
84 | /**
85 | * {@inheritDoc}
86 | */
87 | @Override
88 | public boolean nextKeyValue() throws IOException, InterruptedException {
89 |
90 | BackoffExponentialRetryPolicy retryPolicy = new BackoffExponentialRetryPolicy();
91 | while(retryPolicy.shouldRetry()) {
92 | try {
93 | if (this.documentIterator == null || !this.documentIterator.hasNext()) {
94 | LOG.info(String.format("processed %d documents of collection %s", this.documentsProcessed, this.split.getCollectionName()));
95 | return false;
96 | }
97 |
98 | if(documentsProcessed % 100 == 0) {
99 | LOG.info(String.format("processed %d documents of collection %s", this.documentsProcessed, this.split.getCollectionName()));
100 | }
101 |
102 | this.current.setDoc(this.documentIterator.next());
103 | this.documentsProcessed++;
104 | break;
105 | } catch(Exception e) {
106 | retryPolicy.errorOccured(e);
107 | }
108 | }
109 |
110 | return true;
111 | }
112 | }
113 |
--------------------------------------------------------------------------------
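Both getProgress and nextKeyValue wrap calls into the document iterator in a shouldRetry()/errorOccured() loop. The shipped BackoffExponentialRetryPolicy is not reproduced in this section; the sketch below only illustrates the contract that loop relies on, with made-up retry limits and delays.

// Illustrative sketch only: a minimal exponential-backoff policy with the same
// shouldRetry()/errorOccured() surface the reader relies on. The real
// BackoffExponentialRetryPolicy may differ in limits and in which exceptions
// it treats as retriable.
public class SimpleBackoffPolicy {
    private static final int MAX_RETRIES = 5;
    private static final long BASE_DELAY_MS = 100;
    private int attempts = 0;

    public boolean shouldRetry() {
        return attempts <= MAX_RETRIES;
    }

    public void errorOccured(Exception e) {
        attempts++;
        if (attempts > MAX_RETRIES) {
            // Give up once the retry budget is exhausted.
            throw new IllegalStateException("retries exhausted", e);
        }
        try {
            // Double the wait on every failed attempt: 100ms, 200ms, 400ms, ...
            Thread.sleep(BASE_DELAY_MS << (attempts - 1));
        } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
        }
    }
}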
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBRecordWriter.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.IOException;
7 | import java.util.LinkedList;
8 | import java.util.List;
9 |
10 | import org.apache.commons.logging.Log;
11 | import org.apache.commons.logging.LogFactory;
12 | import org.apache.hadoop.conf.Configuration;
13 | import org.apache.hadoop.io.Writable;
14 | import org.apache.hadoop.mapreduce.RecordWriter;
15 | import org.apache.hadoop.mapreduce.TaskAttemptContext;
16 |
17 | import com.microsoft.azure.documentdb.ConnectionPolicy;
18 | import com.microsoft.azure.documentdb.ConsistencyLevel;
19 | import com.microsoft.azure.documentdb.Database;
20 | import com.microsoft.azure.documentdb.Document;
21 | import com.microsoft.azure.documentdb.DocumentClient;
22 | import com.microsoft.azure.documentdb.DocumentCollection;
23 | import com.microsoft.azure.documentdb.StoredProcedure;
24 |
25 | /**
26 | * Writes data to DocumentDB in document batches using a stored procedure.
27 | */
28 | public class DocumentDBRecordWriter extends RecordWriter<Writable, DocumentDBWritable> {
29 |     private static final Log LOG = LogFactory.getLog(DocumentDBRecordWriter.class);
30 |     private static final int MAX_DOC_SIZE = 50;
31 |     private DocumentClient client;
32 |     private DocumentCollection[] collections;
33 |     private StoredProcedure[] sprocs;
34 |     private boolean enableUpsert;
35 |     private int documentsProcessed = 0;
36 |     private List<Document> cachedDocs;
37 | private int currentStoredProcedureIndex = 0;
38 |
39 | public DocumentDBRecordWriter(Configuration conf, String host, String key, String dbName, String[] collNames,
40 | int outputStringPrecision, boolean upsert, String offerType) throws IOException {
41 | try {
42 | ConnectionPolicy policy = ConnectionPolicy.GetDefault();
43 | policy.setUserAgentSuffix(DocumentDBConnectorUtil.UserAgentSuffix);
44 | DocumentClient client = new DocumentClient(host, key, policy,
45 | ConsistencyLevel.Session);
46 |
47 | Database db = DocumentDBConnectorUtil.GetDatabase(client, dbName);
48 | this.collections = new DocumentCollection[collNames.length];
49 | this.sprocs = new StoredProcedure[collNames.length];
50 | for (int i = 0; i < collNames.length; i++) {
51 | this.collections[i] = DocumentDBConnectorUtil.getOrCreateOutputCollection(client, db.getSelfLink(), collNames[i],
52 | outputStringPrecision, offerType);
53 | this.sprocs[i] = DocumentDBConnectorUtil.CreateBulkImportStoredProcedure(client, this.collections[i].getSelfLink());
54 | }
55 |
56 | this.client = client;
57 | this.enableUpsert = upsert;
58 |             this.cachedDocs = new LinkedList<Document>();
59 | } catch (Exception e) {
60 |             LOG.error("Failed to initialize DocumentDBRecordWriter", e);
61 | throw new IOException(e);
62 | }
63 | }
64 |
65 | /**
66 |      * Writes the document directly when the target collection is partitioned; otherwise caches it and flushes the batch once it reaches the maximum batch size.
67 | */
68 | public void write(Writable key, DocumentDBWritable value) throws IOException {
69 | Document doc = value.getDoc();
70 | DocumentCollection targetCollection = this.collections[this.currentStoredProcedureIndex];
71 | currentStoredProcedureIndex = (this.currentStoredProcedureIndex + 1) % this.collections.length;
72 | this.documentsProcessed++;
73 | if(targetCollection.getPartitionKey() != null) {
74 | DocumentDBConnectorUtil.createDocument(this.client, targetCollection.getSelfLink(), doc, this.enableUpsert);
75 | if (documentsProcessed % MAX_DOC_SIZE == 0) {
76 | LOG.info(String.format("wrote %d documents", this.documentsProcessed));
77 | }
78 | } else {
79 | DocumentDBConnectorUtil.addIdIfMissing(doc);
80 | this.cachedDocs.add(doc);
81 | if (this.documentsProcessed % MAX_DOC_SIZE == 0) {
82 | this.writeCurrentBatch();
83 | LOG.info(String.format("wrote %d documents", this.documentsProcessed));
84 | }
85 | }
86 | }
87 |
88 | /**
89 | * Writes the last batch of documents that are being cached.
90 | */
91 | @Override
92 | public void close(TaskAttemptContext context) throws IOException, InterruptedException {
93 | if (this.cachedDocs.size() > 0) {
94 | this.writeCurrentBatch();
95 | }
96 | }
97 |
98 | private void writeCurrentBatch() {
99 | // Writing to output collections is round robin for each batch.
100 | DocumentDBConnectorUtil.executeWriteStoredProcedure(this.client,
101 | this.collections[this.currentStoredProcedureIndex].getSelfLink(),
102 | this.sprocs[this.currentStoredProcedureIndex], this.cachedDocs,
103 | this.enableUpsert);
104 | this.cachedDocs.clear();
105 |
106 | // Do a round robin on the collections and execute the stored procedure once per each.
107 | this.currentStoredProcedureIndex = (this.currentStoredProcedureIndex + 1) % this.sprocs.length;
108 | }
109 |
110 | }
111 |
--------------------------------------------------------------------------------
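A reducer that feeds this writer only has to emit DocumentDBWritable values. The sketch below aggregates an illustrative word count and hands each result to the output format; the "word" and "count" field names and the JSON layout are made up for the example.

// Minimal reducer sketch that emits DocumentDBWritable values for the writer above.
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import com.microsoft.azure.documentdb.Document;
import com.microsoft.azure.documentdb.hadoop.DocumentDBWritable;

public class CountToDocumentReducer
        extends Reducer<Text, IntWritable, Text, DocumentDBWritable> {

    private final DocumentDBWritable outValue = new DocumentDBWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        // Build the output document from a JSON string; a real job would escape
        // the key properly before embedding it.
        Document doc = new Document(
                String.format("{\"word\":\"%s\",\"count\":%d}", key.toString(), sum));
        outValue.setDoc(doc);
        context.write(key, outValue);
    }
}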
/src/com/microsoft/azure/documentdb/hadoop/DocumentDBWritable.java:
--------------------------------------------------------------------------------
1 | //------------------------------------------------------------
2 | // Copyright (c) Microsoft Corporation. All rights reserved.
3 | //------------------------------------------------------------
4 | package com.microsoft.azure.documentdb.hadoop;
5 |
6 | import java.io.DataInput;
7 | import java.io.DataOutput;
8 | import java.io.IOException;
9 |
10 | import org.apache.hadoop.io.Writable;
11 | import org.apache.hadoop.io.WritableComparable;
12 | import org.apache.hadoop.io.WritableComparator;
13 |
14 | import com.microsoft.azure.documentdb.Document;
15 |
16 | public class DocumentDBWritable implements WritableComparable