├── .gitignore ├── java ├── example │ ├── default │ │ ├── WEB-INF │ │ │ ├── logging.properties │ │ │ ├── dispatch.xml │ │ │ ├── queue.xml │ │ │ └── appengine-web.xml │ │ └── index.html │ ├── mapreduce │ │ └── WEB-INF │ │ │ ├── logging.properties │ │ │ ├── appengine-web.xml │ │ │ └── web.xml │ ├── shuffler │ │ └── WEB-INF │ │ │ ├── logging.properties │ │ │ ├── appengine-web.xml │ │ │ └── web.xml │ ├── META-INF │ │ ├── appengine-application.xml │ │ └── application.xml │ └── src │ │ └── com │ │ └── google │ │ └── appengine │ │ └── demos │ │ └── mapreduce │ │ ├── bigqueryload │ │ ├── SampleNestedRecord.java │ │ ├── SampleTable.java │ │ └── RandomBigQueryDataCreator.java │ │ ├── randomcollisions │ │ ├── SeedToRandomMapper.java │ │ └── CollisionFindingReducer.java │ │ └── entitycount │ │ ├── CountReducer.java │ │ ├── DeleteEntityMapper.java │ │ ├── EntityCreator.java │ │ └── CountMapper.java ├── README ├── src │ ├── test │ │ └── java │ │ │ └── com │ │ │ └── google │ │ │ └── appengine │ │ │ └── tools │ │ │ └── mapreduce │ │ │ ├── testmodels │ │ │ ├── ParameterizedClass.java │ │ │ ├── AbstractClassSample.java │ │ │ ├── Child.java │ │ │ ├── PhoneNumber.java │ │ │ ├── SampleClassWithNestedCollection.java │ │ │ ├── SampleClassWithNonParametricList.java │ │ │ ├── SimpleJsonWithWrapperTypes.java │ │ │ ├── ClassExtendingAbstract.java │ │ │ ├── ClassWithArray.java │ │ │ ├── Father.java │ │ │ ├── Man.java │ │ │ ├── SimpleJson.java │ │ │ ├── SimplAnnotatedJson.java │ │ │ └── Person.java │ │ │ ├── inputs │ │ │ ├── GoogleCloudStorageLineInputTestCase.java │ │ │ ├── ConcatenatingInputReaderTest.java │ │ │ └── BlobstoreInputTest.java │ │ │ ├── impl │ │ │ ├── shardedjob │ │ │ │ ├── TestController.java │ │ │ │ └── TestTask.java │ │ │ └── handlers │ │ │ │ └── MemoryLimiterTest.java │ │ │ └── outputs │ │ │ └── BigQueryStoreResultTest.java │ └── main │ │ ├── java │ │ └── com │ │ │ └── google │ │ │ └── appengine │ │ │ └── tools │ │ │ └── mapreduce │ │ │ ├── Context.java │ │ │ ├── 
MapOnlyMapperContext.java │ │ │ ├── ReducerContext.java │ │ │ ├── impl │ │ │ ├── IncrementalTaskWithContext.java │ │ │ ├── shardedjob │ │ │ │ ├── RecoverableException.java │ │ │ │ ├── ShardedJobServiceFactory.java │ │ │ │ ├── RejectRequestException.java │ │ │ │ ├── pipeline │ │ │ │ │ ├── DeleteShardedJob.java │ │ │ │ │ ├── FinalizeShardedJob.java │ │ │ │ │ └── AbstractShardedJob.java │ │ │ │ ├── ShardFailureException.java │ │ │ │ ├── JobFailureException.java │ │ │ │ ├── ShardedJobController.java │ │ │ │ ├── ShardedJobHandler.java │ │ │ │ ├── InProcessShardedJobRunner.java │ │ │ │ ├── ShardedJobState.java │ │ │ │ ├── ShardedJobServiceImpl.java │ │ │ │ ├── IncrementalTask.java │ │ │ │ └── ShardedJobService.java │ │ │ ├── BaseContext.java │ │ │ ├── BigqueryFieldMarshaller.java │ │ │ ├── MapOnlyMapperContextImpl.java │ │ │ ├── ReducerContextImpl.java │ │ │ ├── sort │ │ │ │ ├── MergeContext.java │ │ │ │ ├── SortContext.java │ │ │ │ └── LexicographicalComparator.java │ │ │ ├── MapperContextImpl.java │ │ │ ├── pipeline │ │ │ │ ├── ResultAndStatus.java │ │ │ │ ├── ExamineStatusAndReturnResult.java │ │ │ │ ├── DeleteFilesJob.java │ │ │ │ ├── ShardedJob.java │ │ │ │ └── CleanupPipelineJob.java │ │ │ ├── util │ │ │ │ ├── BigQueryDataTypeUtil.java │ │ │ │ ├── SplitUtil.java │ │ │ │ └── SerializableValue.java │ │ │ ├── HashingSharder.java │ │ │ ├── ReducerInputs.java │ │ │ ├── BigQueryConstants.java │ │ │ ├── CountersImpl.java │ │ │ ├── BaseShardContext.java │ │ │ └── KeyValueMarshaller.java │ │ │ ├── WorkerContext.java │ │ │ ├── BigQueryFieldMode.java │ │ │ ├── BigQueryIgnore.java │ │ │ ├── MapperContext.java │ │ │ ├── BigQueryMarshaller.java │ │ │ ├── MapReduceResult.java │ │ │ ├── ReducerInput.java │ │ │ ├── CorruptDataException.java │ │ │ ├── Counters.java │ │ │ ├── Counter.java │ │ │ ├── Sharder.java │ │ │ ├── inputs │ │ │ ├── DatastoreInputReader.java │ │ │ ├── DatastoreKeyInputReader.java │ │ │ ├── DatastoreInput.java │ │ │ ├── DatastoreKeyInput.java │ │ │ ├── 
UnmarshallingInput.java │ │ │ ├── NoInput.java │ │ │ ├── ForwardingInputReader.java │ │ │ ├── UnmarshallingInputReader.java │ │ │ ├── GoogleCloudStorageLevelDbInput.java │ │ │ ├── InMemoryInput.java │ │ │ └── BlobstoreInput.java │ │ │ ├── BaseMapper.java │ │ │ ├── MapReduceJobException.java │ │ │ ├── reducers │ │ │ ├── ValueProjectionReducer.java │ │ │ ├── KeyProjectionReducer.java │ │ │ └── NoReducer.java │ │ │ ├── ShardContext.java │ │ │ ├── bigqueryjobs │ │ │ └── BigQueryLoadJobReference.java │ │ │ ├── BigQueryDataField.java │ │ │ ├── Mapper.java │ │ │ ├── mappers │ │ │ ├── KeyProjectionMapper.java │ │ │ └── IdentityMapper.java │ │ │ ├── outputs │ │ │ ├── GoogleCloudStorageLevelDbOutputWriter.java │ │ │ ├── MarshallingOutputWriter.java │ │ │ ├── InMemoryOutputWriter.java │ │ │ ├── ForwardingOutputWriter.java │ │ │ ├── BigQueryStoreResult.java │ │ │ ├── SizeSegmentingOutputWriter.java │ │ │ ├── InMemoryOutput.java │ │ │ ├── SliceSegmentingOutputWriter.java │ │ │ ├── NoOutput.java │ │ │ ├── LevelDbOutput.java │ │ │ ├── ItemSegmentingOutputWriter.java │ │ │ └── MarshallingOutput.java │ │ │ ├── KeyValue.java │ │ │ ├── Marshaller.java │ │ │ ├── Reducer.java │ │ │ └── Input.java │ │ └── resources │ │ └── ui │ │ ├── overview.html │ │ ├── detail.html │ │ ├── base.css │ │ └── jquery.json-2.2.min.js └── NOTICE ├── img ├── detail.png └── overview.png ├── python ├── demo │ ├── queue.yaml │ ├── static │ │ ├── images │ │ │ └── favicon.ico │ │ └── js │ │ │ └── custom.js │ ├── app.yaml │ └── mapreduce.yaml ├── src │ ├── MANIFEST.in │ ├── requirements.txt │ ├── todelete.txt │ ├── mapreduce │ │ ├── include.yaml │ │ ├── third_party │ │ │ └── __init__.py │ │ ├── api │ │ │ ├── __init__.py │ │ │ └── map_job │ │ │ │ ├── __init__.py │ │ │ │ └── datastore_input_reader.py │ │ ├── lib │ │ │ ├── __init__.py │ │ │ └── input_reader │ │ │ │ └── __init__.py │ │ ├── tools │ │ │ └── __init__.py │ │ ├── operation │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── counters.py │ │ │ └── db.py │ │ 
├── __init__.py │ │ ├── pipeline_base.py │ │ └── static │ │ │ ├── overview.html │ │ │ ├── detail.html │ │ │ ├── base.css │ │ │ └── jquery.json-2.2.min.js │ ├── README │ └── setup.py └── test │ ├── mapreduce │ ├── test_data │ │ └── appengine_config.py │ ├── api │ │ └── map_job │ │ │ ├── input_reader_test.py │ │ │ ├── output_writer_test.py │ │ │ └── map_job_config_test.py │ └── operation │ │ ├── counters_test.py │ │ └── db_test.py │ └── testlib │ └── __init__.py ├── .travis.yml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | *.pyc 3 | .idea 4 | *.iml 5 | -------------------------------------------------------------------------------- /java/example/default/WEB-INF/logging.properties: -------------------------------------------------------------------------------- 1 | .level = INFO 2 | -------------------------------------------------------------------------------- /java/example/mapreduce/WEB-INF/logging.properties: -------------------------------------------------------------------------------- 1 | .level = INFO 2 | -------------------------------------------------------------------------------- /java/example/shuffler/WEB-INF/logging.properties: -------------------------------------------------------------------------------- 1 | .level = INFO 2 | -------------------------------------------------------------------------------- /img/detail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/appengine-mapreduce/HEAD/img/detail.png -------------------------------------------------------------------------------- /img/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/appengine-mapreduce/HEAD/img/overview.png -------------------------------------------------------------------------------- 
/java/example/default/WEB-INF/dispatch.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /python/demo/queue.yaml: -------------------------------------------------------------------------------- 1 | queue: 2 | - name: default 3 | rate: 50/s 4 | bucket_size: 100 5 | max_concurrent_requests: 100 6 | 7 | -------------------------------------------------------------------------------- /python/src/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include distribute_setup.py 2 | include mapreduce/include.yaml 3 | recursive-include mapreduce/static *.html *.css *.js 4 | -------------------------------------------------------------------------------- /python/src/requirements.txt: -------------------------------------------------------------------------------- 1 | GoogleAppEngineCloudStorageClient 2 | GoogleAppEnginePipeline 3 | Graphy 4 | simplejson 5 | mock 6 | mox 7 | pg8000 8 | -------------------------------------------------------------------------------- /python/demo/static/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudPlatform/appengine-mapreduce/HEAD/python/demo/static/images/favicon.ico -------------------------------------------------------------------------------- /java/example/default/WEB-INF/queue.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | mapreduce-workers 4 | 30/s 5 | 6 | 7 | -------------------------------------------------------------------------------- /python/src/todelete.txt: -------------------------------------------------------------------------------- 1 | GoogleAppEngineCloudStorageClient 2 | cloudstorage 3 | GoogleAppEnginePipeline 4 | pipeline 5 | Graphy 6 | graphy 7 | simplejson 8 | mock 9 | mox 10 | pg8000 11 | six 12 | 
-------------------------------------------------------------------------------- /python/src/mapreduce/include.yaml: -------------------------------------------------------------------------------- 1 | handlers: 2 | - url: /mapreduce/pipeline/images 3 | static_dir: pipeline/ui/images 4 | 5 | - url: /mapreduce(/.*)? 6 | script: mapreduce.main.APP 7 | login: admin 8 | 9 | -------------------------------------------------------------------------------- /java/example/META-INF/appengine-application.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | mapreduce-example 4 | -------------------------------------------------------------------------------- /python/test/mapreduce/test_data/appengine_config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Sample user config for test.""" 3 | 4 | # pylint: disable=g-bad-name 5 | mapreduce_SHARD_MAX_ATTEMPTS = 5 6 | mapreduce_QUEUE_NAME = "foo" 7 | mapreduce_BASE_PATH = "/my-mapreduce" 8 | -------------------------------------------------------------------------------- /java/README: -------------------------------------------------------------------------------- 1 | To build the library: 2 | $ ant 3 | Output files will be in dist/lib. 
4 | 5 | To run the test suite: 6 | $ ant test 7 | 8 | To build the example mapreduce: 9 | $ ant compile_example 10 | Output files will be in example/war, which you can then upload with appcfg.sh 11 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/ParameterizedClass.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | public class ParameterizedClass { 4 | T id; 5 | /** 6 | * @param id 7 | */ 8 | public ParameterizedClass(T id) { 9 | this.id = id; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/Context.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | 6 | /** 7 | * MapReduce context. 8 | */ 9 | public interface Context { 10 | 11 | /** 12 | * Returns the Id for the job. 13 | */ 14 | String getJobId(); 15 | } 16 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/MapOnlyMapperContext.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | 6 | /** 7 | * Context for {@link MapOnlyMapper} execution. 
8 | * 9 | * @param type of output values produced by the mapper 10 | */ 11 | public interface MapOnlyMapperContext extends WorkerContext { 12 | } 13 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/AbstractClassSample.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | public abstract class AbstractClassSample { 4 | int id; 5 | String name; 6 | 7 | /** 8 | * @param id 9 | * @param name 10 | */ 11 | public AbstractClassSample(int id, String name) { 12 | this.id = id; 13 | this.name = name; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/Child.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | 4 | public class Child { 5 | public String fullName; 6 | public int age; 7 | /** 8 | * @param fullName 9 | * @param age 10 | */ 11 | public Child(String fullName, int age) { 12 | this.fullName = fullName; 13 | this.age = age; 14 | } 15 | 16 | 17 | } 18 | -------------------------------------------------------------------------------- /java/example/src/com/google/appengine/demos/mapreduce/bigqueryload/SampleNestedRecord.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.demos.mapreduce.bigqueryload; 2 | 3 | public class SampleNestedRecord { 4 | int col11; 5 | String col12; 6 | /** 7 | * @param col11 8 | * @param col12 9 | */ 10 | public SampleNestedRecord(int col11, String col12) { 11 | this.col11 = col11; 12 | this.col12 = col12; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- 
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/PhoneNumber.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | public class PhoneNumber { 4 | public int areaCode; 5 | public int number; 6 | /** 7 | * @param areaCode 8 | * @param number 9 | */ 10 | public PhoneNumber(int areaCode, int number) { 11 | this.areaCode = areaCode; 12 | this.number = number; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/SampleClassWithNestedCollection.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | import java.util.List; 4 | 5 | public class SampleClassWithNestedCollection { 6 | List> ll; 7 | 8 | /** 9 | * @param ll 10 | */ 11 | public SampleClassWithNestedCollection(List> ll) { 12 | this.ll = ll; 13 | } 14 | 15 | 16 | } 17 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/ReducerContext.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | 6 | /** 7 | * Context for {@link Reducer} execution. 
8 | * 9 | * @author ohler@google.com (Christian Ohler) 10 | * 11 | * @param type of output values produced by the reducer 12 | */ 13 | public interface ReducerContext extends WorkerContext { 14 | } 15 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/IncrementalTaskWithContext.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl; 2 | 3 | import com.google.appengine.tools.mapreduce.impl.shardedjob.IncrementalTask; 4 | 5 | /** 6 | * A simple extension of {@link IncrementalTask} to add information for display in a UI. 7 | */ 8 | public interface IncrementalTaskWithContext extends IncrementalTask { 9 | 10 | IncrementalTaskContext getContext(); 11 | } 12 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/SampleClassWithNonParametricList.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | import java.util.List; 4 | 5 | public class SampleClassWithNonParametricList { 6 | @SuppressWarnings("rawtypes") 7 | List l; 8 | 9 | /** 10 | * @param l 11 | */ 12 | @SuppressWarnings("rawtypes") 13 | public SampleClassWithNonParametricList(List l) { 14 | this.l = l; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/WorkerContext.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | 6 | /** 7 | * Context for each worker (mapper or reducer) shard. 
8 | * 9 | * @param type of output values produced by the worker 10 | */ 11 | public interface WorkerContext extends ShardContext { 12 | 13 | /** 14 | * Emits a value to the output. 15 | */ 16 | void emit(O value); 17 | } 18 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/BigQueryFieldMode.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce; 2 | 3 | /** 4 | * The supported bigquery field modes. 5 | */ 6 | public enum BigQueryFieldMode { 7 | 8 | REPEATED("repeated"), NULLABLE("nullable"), REQUIRED("required"); 9 | 10 | private final String value; 11 | 12 | private BigQueryFieldMode(String value) { 13 | this.value = value; 14 | } 15 | 16 | public String getValue() { 17 | return value; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/BigQueryIgnore.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce; 2 | 3 | import java.lang.annotation.ElementType; 4 | import java.lang.annotation.Retention; 5 | import java.lang.annotation.RetentionPolicy; 6 | import java.lang.annotation.Target; 7 | 8 | /** 9 | * A annotation for fields that should not be part of the bigquery output. 10 | */ 11 | @Retention(RetentionPolicy.RUNTIME) 12 | @Target(ElementType.FIELD) 13 | public @interface BigQueryIgnore { 14 | } 15 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/RecoverableException.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 2 | 3 | /** 4 | * An exception that indicates it is safe to restart a slice. 
5 | */ 6 | public class RecoverableException extends RuntimeException { 7 | 8 | private static final long serialVersionUID = -1527377663569164133L; 9 | 10 | public RecoverableException(String message, Throwable rootCause) { 11 | super(message, rootCause); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/SimpleJsonWithWrapperTypes.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | 4 | public class SimpleJsonWithWrapperTypes { 5 | Integer id; 6 | String name; 7 | Float value; 8 | /** 9 | * @param id 10 | * @param name 11 | * @param value 12 | */ 13 | public SimpleJsonWithWrapperTypes(Integer id, String name, Float value) { 14 | this.id = id; 15 | this.name = name; 16 | this.value = value; 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/BaseContext.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl; 2 | 3 | import com.google.appengine.tools.mapreduce.Context; 4 | 5 | 6 | /** 7 | * Base class for all Context implementations. 
8 | */ 9 | public class BaseContext implements Context { 10 | 11 | private final String jobId; 12 | 13 | public BaseContext(String jobId) { 14 | this.jobId = jobId; 15 | } 16 | 17 | @Override 18 | public String getJobId() { 19 | return jobId; 20 | } 21 | } -------------------------------------------------------------------------------- /java/example/default/WEB-INF/appengine-web.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | mapreduce-example 4 | default 5 | one 6 | 7 | true 8 | 9 | 10 | 11 | 12 | 13 | F2 14 | 15 | 16 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/ClassExtendingAbstract.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | /** 4 | * TODO: Insert description here. (generated by kumaranj) 5 | */ 6 | public class ClassExtendingAbstract extends AbstractClassSample { 7 | int value; 8 | 9 | /** 10 | * @param id 11 | * @param name 12 | * @param value 13 | */ 14 | public ClassExtendingAbstract(int id, String name, int value) { 15 | super(id, name); 16 | this.value = value; 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobServiceFactory.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 4 | 5 | /** 6 | * Provides {@link ShardedJobService} implementations. 
7 | * 8 | * @author ohler@google.com (Christian Ohler) 9 | */ 10 | public class ShardedJobServiceFactory { 11 | 12 | private ShardedJobServiceFactory() {} 13 | 14 | public static ShardedJobService getShardedJobService() { 15 | return new ShardedJobServiceImpl(); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/BigqueryFieldMarshaller.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl; 2 | 3 | import com.google.api.services.bigquery.model.TableFieldSchema; 4 | 5 | import java.lang.reflect.Field; 6 | 7 | 8 | /** 9 | * Defines how a {@link Field}s should be interpreted and marshalled while generating its 10 | * {@link TableFieldSchema} for loading data into bigquery. 11 | */ 12 | public interface BigqueryFieldMarshaller { 13 | Object getFieldValue(Field field, Object object); 14 | 15 | Class getSchemaType(); 16 | } 17 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/MapOnlyMapperContextImpl.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Google Inc. All Rights Reserved. 
2 | 3 | package com.google.appengine.tools.mapreduce.impl; 4 | 5 | import com.google.appengine.tools.mapreduce.MapOnlyMapperContext; 6 | import com.google.appengine.tools.mapreduce.OutputWriter; 7 | 8 | /** 9 | */ 10 | class MapOnlyMapperContextImpl extends BaseShardContext implements MapOnlyMapperContext { 11 | 12 | MapOnlyMapperContextImpl(IncrementalTaskContext c, OutputWriter output) { 13 | super(c, output); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/ClassWithArray.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | /** 4 | * TODO: Insert description here. (generated by kumaranj) 5 | */ 6 | public class ClassWithArray { 7 | public int id; 8 | public String name; 9 | public String[] values; 10 | /** 11 | * @param id 12 | * @param name 13 | * @param values 14 | */ 15 | public ClassWithArray(int id, String name, String[] values) { 16 | this.id = id; 17 | this.name = name; 18 | this.values = values; 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /python/demo/app.yaml: -------------------------------------------------------------------------------- 1 | application: mapreduce-demo 2 | version: 1 3 | runtime: python27 4 | api_version: 1 5 | threadsafe: no 6 | 7 | handlers: 8 | - url: /favicon.ico 9 | static_files: static/images/favicon.ico 10 | upload: static/images/favicon.ico 11 | 12 | - url: /static/js/custom.js 13 | static_files: static/js/custom.js 14 | upload: static/js/custom.js 15 | 16 | - url: .* 17 | script: main.app 18 | login: required 19 | 20 | includes: 21 | - mapreduce/include.yaml 22 | 23 | libraries: 24 | - name: webapp2 25 | version: "2.5.1" 26 | - name: jinja2 27 | version: "2.6" 28 | -------------------------------------------------------------------------------- 
/java/src/main/java/com/google/appengine/tools/mapreduce/MapperContext.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | package com.google.appengine.tools.mapreduce; 3 | 4 | 5 | /** 6 | * A context for mapper execution. Provides everything that might be needed by a mapper function. 7 | * 8 | * 9 | * @param type of keys produced by the mapper 10 | * @param type of values produced by the mapper 11 | */ 12 | public interface MapperContext extends WorkerContext> { 13 | 14 | /** 15 | * Emits a key and a value to the output. 16 | */ 17 | void emit(K key, V value); 18 | } 19 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/Father.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * Test class for testing BigQueryDataMarshaller 7 | */ 8 | public class Father { 9 | public boolean married; 10 | public String name; 11 | public List sons; 12 | /** 13 | * @param married 14 | * @param name 15 | * @param sons 16 | */ 17 | public Father(boolean married, String name, List sons) { 18 | this.married = married; 19 | this.name = name; 20 | this.sons = sons; 21 | } 22 | 23 | 24 | } 25 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/Man.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | import com.google.appengine.tools.mapreduce.BigQueryIgnore; 4 | 5 | /** 6 | * Test class for BigQueryMarshaller testing 7 | */ 8 | 9 | public class Man { 10 | @BigQueryIgnore 11 | public int id; 12 | public String name; 13 | public String gender; 14 | 15 | /** 16 | 
* @param id 17 | * @param name 18 | * @param gender 19 | */ 20 | public Man(int id, String name, String gender) { 21 | this.id = id; 22 | this.name = name; 23 | this.gender = gender; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/SimpleJson.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | import com.google.appengine.tools.mapreduce.BigQueryDataField; 4 | import com.google.appengine.tools.mapreduce.BigQueryFieldMode; 5 | 6 | /** 7 | * Simple class for testing 8 | */ 9 | public class SimpleJson { 10 | @BigQueryDataField(mode = BigQueryFieldMode.REQUIRED) 11 | public String name; 12 | public int id; 13 | /** 14 | * @param name 15 | * @param id 16 | */ 17 | public SimpleJson(String name, int id) { 18 | this.name = name; 19 | this.id = id; 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/SimplAnnotatedJson.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | import com.google.appengine.tools.mapreduce.BigQueryDataField; 4 | 5 | public class SimplAnnotatedJson { 6 | @BigQueryDataField(name = "niceName") 7 | public String nameRandom; 8 | public String id; 9 | public int intField; 10 | /** 11 | * @param nameRandom 12 | * @param id 13 | * @param intField 14 | */ 15 | public SimplAnnotatedJson(String nameRandom, String id, int intField) { 16 | this.nameRandom = nameRandom; 17 | this.id = id; 18 | this.intField = intField; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/ReducerContextImpl.java: 
-------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.impl; 4 | 5 | import com.google.appengine.tools.mapreduce.OutputWriter; 6 | import com.google.appengine.tools.mapreduce.ReducerContext; 7 | 8 | /** 9 | * @author ohler@google.com (Christian Ohler) 10 | * 11 | * @param type of output values produced by the reducer 12 | */ 13 | class ReducerContextImpl extends BaseShardContext implements ReducerContext { 14 | 15 | ReducerContextImpl(IncrementalTaskContext c, OutputWriter output) { 16 | super(c, output); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /python/test/testlib/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2010 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /python/src/mapreduce/third_party/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2010 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | -------------------------------------------------------------------------------- /python/src/mapreduce/api/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2015 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /python/src/mapreduce/lib/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2015 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | -------------------------------------------------------------------------------- /python/src/mapreduce/tools/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2015 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | -------------------------------------------------------------------------------- /python/demo/mapreduce.yaml: -------------------------------------------------------------------------------- 1 | mapreduce: 2 | - name: Make messages lowercase 3 | params: 4 | - name: done_callback 5 | value: /done 6 | mapper: 7 | handler: main.lower_case_posts 8 | input_reader: mapreduce.input_readers.DatastoreInputReader 9 | params: 10 | - name: entity_kind 11 | default: main.Post 12 | - name: shard_count 13 | default: 4 14 | - name: Make messages upper case 15 | params: 16 | - name: done_callback 17 | value: /done 18 | mapper: 19 | handler: main.upper_case_posts 20 | input_reader: mapreduce.input_readers.DatastoreInputReader 21 | params: 22 | - name: entity_kind 23 | default: main.Post 24 | - name: shard_count 25 | default: 4 26 | -------------------------------------------------------------------------------- /python/src/README: -------------------------------------------------------------------------------- 1 | AppEngine Mapreduce library 2 | =========================== 3 | 4 | Official site: https://github.com/GoogleCloudPlatform/appengine-mapreduce 5 | 6 | Check the site for up to date status, latest version, getting started & user 7 | guides and other documentation. 8 | 9 | Archive contents: 10 | - python : python version of the library resides here 11 | - build.sh : use this to run tests for python library, build and run demo app 12 | - src : python source code for mapreduce library 13 | - tests : tests for mapreduce library 14 | - demo : a demo application that uses the map reduce. 15 | - java : java version of the library 16 | - build.xml : ant build file 17 | -------------------------------------------------------------------------------- /python/test/mapreduce/api/map_job/input_reader_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | # testutil must be imported before mock. 
5 | # pylint: disable=unused-import 6 | # pylint: disable=g-bad-import-order 7 | from testlib import testutil 8 | 9 | import mock 10 | import unittest 11 | 12 | from mapreduce.api import map_job 13 | 14 | 15 | class InputReaderTest(unittest.TestCase): 16 | 17 | def testBeginEndSlice(self): 18 | reader = map_job.InputReader() 19 | slice_ctx = mock.Mock() 20 | reader.begin_slice(slice_ctx) 21 | self.assertEqual(slice_ctx, reader._slice_ctx) 22 | reader.end_slice(slice_ctx) 23 | self.assertEqual(None, reader._slice_ctx) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/BigQueryMarshaller.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce; 2 | 3 | import com.google.api.services.bigquery.model.TableSchema; 4 | 5 | 6 | /** 7 | * An implementation of this class should serialize the objects of type T into newline separated 8 | * json as expected by the bigquery load jobs. It should also provide an implementation for 9 | * generating the schema({@link TableSchema}) of the bigquery table. 10 | * 11 | * @param type of the object to be marshalled 12 | */ 13 | public abstract class BigQueryMarshaller extends Marshaller { 14 | private static final long serialVersionUID = 5170161329883029808L; 15 | 16 | public abstract TableSchema getSchema(); 17 | } 18 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/MapReduceResult.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | import java.io.Serializable; 6 | 7 | /** 8 | * Result of a {@link MapReduceJob} or {@link MapJob}. 
9 | * 10 | * @author ohler@google.com (Christian Ohler) 11 | * 12 | * @param type of result produced by the {@link Output} 13 | */ 14 | public interface MapReduceResult extends Serializable { 15 | 16 | /** 17 | * Returns the result from {@link Output#finish} or {@code null} if completed unsuccessfully. 18 | */ 19 | R getOutputResult(); 20 | 21 | /** 22 | * Returns the counter values at the end of the job. 23 | */ 24 | Counters getCounters(); 25 | } 26 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/ReducerInput.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | import java.util.Iterator; 6 | 7 | /** 8 | * Enumerates the reducer's input values for a given key. 9 | * 10 | * @author ohler@google.com (Christian Ohler) 11 | * 12 | * @param type of values provided by this input 13 | */ 14 | // Not serializable; since reduce() receives an iterator, we will never end a 15 | // slice while an iterator is active. 16 | public abstract class ReducerInput implements Iterator { 17 | 18 | @Override 19 | public void remove() { 20 | throw new UnsupportedOperationException("Can't remove() on ReducerInput: " + this); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /python/test/mapreduce/api/map_job/output_writer_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | # testutil must be imported before mock. 
5 | # pylint: disable=unused-import 6 | # pylint: disable=g-bad-import-order 7 | from testlib import testutil 8 | 9 | import mock 10 | import unittest 11 | 12 | from mapreduce.api.map_job import output_writer 13 | 14 | 15 | class OutputWriterTest(unittest.TestCase): 16 | 17 | def testBeginEndSlice(self): 18 | writer = output_writer.OutputWriter() 19 | slice_ctx = mock.Mock() 20 | writer.begin_slice(slice_ctx) 21 | self.assertEqual(slice_ctx, writer._slice_ctx) 22 | writer.end_slice(slice_ctx) 23 | self.assertEqual(None, writer._slice_ctx) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/CorruptDataException.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce; 2 | 3 | /** 4 | * The exception generated if any of the data appears to be corrupt. This should cause the MapReduce 5 | * to fail. 
6 | */ 7 | public class CorruptDataException extends RuntimeException { 8 | 9 | private static final long serialVersionUID = 5053922369001406602L; 10 | 11 | public CorruptDataException() { 12 | super(); 13 | } 14 | 15 | public CorruptDataException(String message) { 16 | super(message); 17 | } 18 | 19 | public CorruptDataException(String message, Throwable cause) { 20 | super(message, cause); 21 | } 22 | 23 | public CorruptDataException(Throwable cause) { 24 | super(cause); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/RejectRequestException.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 2 | 3 | 4 | /** 5 | * An exception thrown to reject the current request with an error code (50X) This will usually 6 | * cause taskqueue to retry the request on another instance. 7 | * 8 | * For internal use only. User code cannot safely depend on this class. 9 | */ 10 | public class RejectRequestException extends RuntimeException { 11 | 12 | private static final long serialVersionUID = 5938529235133524752L; 13 | 14 | public RejectRequestException(String reason) { 15 | super(reason); 16 | } 17 | 18 | public RejectRequestException(String reason, Exception e) { 19 | super(reason, e); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /java/example/src/com/google/appengine/demos/mapreduce/randomcollisions/SeedToRandomMapper.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.demos.mapreduce.randomcollisions; 2 | 3 | import com.google.appengine.tools.mapreduce.Mapper; 4 | import com.google.common.primitives.Ints; 5 | 6 | import java.util.Random; 7 | 8 | /** 9 | * Maps each incoming seed using Java's Random to the first generated number. 
10 | */ 11 | public final class SeedToRandomMapper extends Mapper { 12 | 13 | private static final long serialVersionUID = -3070710020513042698L; 14 | @Override 15 | // [START map_example] 16 | public void map(Long sequence) { 17 | Random r = new Random(sequence); 18 | emit(r.nextInt(), Ints.checkedCast(sequence)); 19 | } 20 | // [END map_example] 21 | } 22 | -------------------------------------------------------------------------------- /java/example/src/com/google/appengine/demos/mapreduce/bigqueryload/SampleTable.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.demos.mapreduce.bigqueryload; 2 | 3 | import java.util.Date; 4 | import java.util.List; 5 | 6 | public class SampleTable { 7 | Long colNum; 8 | int col1; 9 | String col2; 10 | List col3; 11 | String[] col4; 12 | SampleNestedRecord col5; 13 | Date col6; 14 | 15 | public SampleTable(Long colNum, 16 | int col1, 17 | String col2, 18 | List col3, 19 | String[] col4, 20 | SampleNestedRecord col5, 21 | Date col6) { 22 | this.colNum = colNum; 23 | this.col1 = col1; 24 | this.col2 = col2; 25 | this.col3 = col3; 26 | this.col4 = col4; 27 | this.col5 = col5; 28 | this.col6 = col6; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/sort/MergeContext.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.sort; 2 | 3 | import com.google.appengine.tools.mapreduce.KeyValue; 4 | import com.google.appengine.tools.mapreduce.OutputWriter; 5 | import com.google.appengine.tools.mapreduce.impl.BaseShardContext; 6 | import com.google.appengine.tools.mapreduce.impl.IncrementalTaskContext; 7 | 8 | import java.nio.ByteBuffer; 9 | import java.util.List; 10 | 11 | /** 12 | * Provides a context for merging. 
13 | * 14 | */ 15 | public class MergeContext extends BaseShardContext>> { 16 | 17 | MergeContext(IncrementalTaskContext c, 18 | OutputWriter>> output) { 19 | super(c, output); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /java/example/META-INF/application.xml: -------------------------------------------------------------------------------- 1 | 2 | 8 | 9 | Test Application 10 | My Test Map Reduce Java App 11 | 12 | 13 | 14 | default 15 | default 16 | 17 | 18 | 19 | 20 | 21 | mapreduce 22 | mapreduce 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/Counters.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | package com.google.appengine.tools.mapreduce; 3 | 4 | import java.io.Serializable; 5 | 6 | /** 7 | * Collection of all counters. 8 | * 9 | */ 10 | public interface Counters extends Serializable { 11 | 12 | /** 13 | * @param name counter name 14 | * @return counter with a given name. Creates new counter with 0 value if it doesn't exist. 15 | */ 16 | Counter getCounter(String name); 17 | 18 | /** 19 | * @return iterable over all created counters. 20 | */ 21 | Iterable getCounters(); 22 | 23 | /** 24 | * @param other Another counter object who's counters should all be added to this one. 25 | */ 26 | void addAll(Counters other); 27 | } 28 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/MapperContextImpl.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 
2 | 3 | package com.google.appengine.tools.mapreduce.impl; 4 | 5 | import com.google.appengine.tools.mapreduce.KeyValue; 6 | import com.google.appengine.tools.mapreduce.MapperContext; 7 | import com.google.appengine.tools.mapreduce.OutputWriter; 8 | 9 | /** 10 | * @author ohler@google.com (Christian Ohler) 11 | */ 12 | class MapperContextImpl extends BaseShardContext> 13 | implements MapperContext { 14 | 15 | MapperContextImpl(IncrementalTaskContext c, OutputWriter> output) { 16 | super(c, output); 17 | } 18 | 19 | @Override 20 | public void emit(K key, V value) { 21 | emit(KeyValue.of(key, value)); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/sort/SortContext.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.sort; 2 | 3 | import com.google.appengine.tools.mapreduce.KeyValue; 4 | import com.google.appengine.tools.mapreduce.OutputWriter; 5 | import com.google.appengine.tools.mapreduce.impl.BaseShardContext; 6 | import com.google.appengine.tools.mapreduce.impl.IncrementalTaskContext; 7 | 8 | import java.nio.ByteBuffer; 9 | import java.util.List; 10 | 11 | /** 12 | * Provides a context for the in memory sort. 
13 | * 14 | */ 15 | public class SortContext extends 16 | BaseShardContext>> { 17 | 18 | SortContext(IncrementalTaskContext c, 19 | OutputWriter>> output) { 20 | super(c, output); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: java 3 | jdk: 4 | - openjdk7 5 | before_install: 6 | - cd java 7 | - mvn clean 8 | - git clone -b travis `git config --get remote.origin.url` target/travis 9 | - cp target/travis/settings.xml ~/.m2/settings.xml 10 | install: mvn install -DskipTests=true 11 | script: mvn compile 12 | branches: 13 | only: 14 | - master 15 | after_success: 16 | - mvn site --settings target/travis/settings.xml 17 | env: 18 | global: 19 | - secure: DCU0tg/VgvF4Vln9wRslycxxaNZ+oPh2L3s0bMvheVZ8HLb3VoczY6CX9e75uHlLQqkXwYk1QtDTr2RkszYKJwnJgz7Zu7uAjHlM3KLXoGlDS9rIkX+S3EwMLcQcxBmU1jTmx8l9TzaHHGOSI245TBrwJ736l6UK2FKWmNb5f9A= 20 | - secure: d1XtFIq0YtqzbNMT/HKL0k2HUYAbU5GmT+KGNEC0axmXApNTQBxgkz2t5KNJ348JI+uq1O1OLb17i6eE7VmyXJNiWET8/h6RpBgMdbKxgqZaK8YcZdKRJjxHDRtVkPlOv1U1jq/r0MLm91L6srIom+RpJ3XXk92bKiPXa5EugLE= 21 | -------------------------------------------------------------------------------- /java/NOTICE: -------------------------------------------------------------------------------- 1 | The following libraries are included under the terms of their respective 2 | licenses and can be found at their respective websites: 3 | 4 | commons logging (Apache) - http://commons.apache.org/logging/ 5 | guava (Apache) - https://github.com/google/guava/ 6 | hadoop (Apache) - http://hadoop.apache.org/ 7 | charts4j (MIT) - https://github.com/julienchastang/charts4j/ 8 | json (MIT + good/evil clause) - http://www.json.org/license.html 9 | 10 | The following libraries are included under the terms of their respective 11 | licenses, but are only used for testing, and do not need to be uploaded 12 | with your 
code: 13 | 14 | cglib (Apache) - http://cglib.sourceforge.net/ 15 | easymock and classextension (Apache) - http://easymock.org/ 16 | junit (CPL) - http://junit.org/ 17 | objenesis (Apache) - http://objenesis.org/ 18 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/Counter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | package com.google.appengine.tools.mapreduce; 3 | 4 | /** 5 | * Counter is an integer variable that is aggregated across multiple shards. Can be used to do 6 | * statistical calculations. 7 | * 8 | */ 9 | public interface Counter { 10 | 11 | /** 12 | * @return counter name. 13 | */ 14 | String getName(); 15 | 16 | /** 17 | * @return counter value. This is the value only in the current shard. It doesn't include 18 | * contributions from other shards, if accessed from within mapper/reducer. 19 | */ 20 | long getValue(); 21 | 22 | /** 23 | * Increment counter. 24 | * 25 | * @param delta increment delta. Can be negative. 26 | */ 27 | void increment(long delta); 28 | } 29 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/pipeline/DeleteShardedJob.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.shardedjob.pipeline; 2 | 3 | import com.google.appengine.tools.pipeline.Job; 4 | 5 | /** 6 | * A pipeline job to delete persistent data for a sharded job. 
7 | */ 8 | public class DeleteShardedJob extends AbstractShardedJob { 9 | 10 | private static final long serialVersionUID = -6850669259843382958L; 11 | 12 | public DeleteShardedJob(String jobId, int taskCount) { 13 | super(jobId, taskCount); 14 | } 15 | 16 | @Override 17 | protected Job createShardsJob(int start, int end) { 18 | return new DeleteShardsInfos(getJobId(), start, end); 19 | } 20 | 21 | @Override 22 | public String getJobDisplayName() { 23 | return "DeleteShardedJob: " + getJobId(); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardFailureException.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 2 | 3 | /** 4 | * An exception thrown when there should be no more attempts to continue processing the shard. 5 | */ 6 | public class ShardFailureException extends RuntimeException { 7 | 8 | private static final long serialVersionUID = -1082842736486563617L; 9 | 10 | public ShardFailureException(String errorMessage) { 11 | super(errorMessage); 12 | } 13 | 14 | public ShardFailureException(int shardNumber, Throwable rootCause) { 15 | super("Shard " + shardNumber + " failed.", rootCause); 16 | } 17 | 18 | public ShardFailureException(int shardNumber, String message, Throwable rootCause) { 19 | super("Shard " + shardNumber + " failed: " + message, rootCause); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/JobFailureException.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 2 | 3 | /** 4 | * An exception thrown when there should be no more attempts to continue processing the job. 
5 | */ 6 | public class JobFailureException extends RuntimeException { 7 | 8 | private static final long serialVersionUID = -4481817785472768342L; 9 | 10 | public JobFailureException(String errorMessage) { 11 | super(errorMessage); 12 | } 13 | 14 | public JobFailureException(int shardNumber, Throwable rootCause) { 15 | super("Shard " + shardNumber + " failed the job", rootCause); 16 | } 17 | 18 | public JobFailureException(int shardNumber, String message, Throwable rootCause) { 19 | super("Shard " + shardNumber + " failed the job: " + message, rootCause); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/Person.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.testmodels; 2 | 3 | 4 | public class Person { 5 | public String fullName; 6 | public int age; 7 | public double height; 8 | public float weight; 9 | public String gender; 10 | public PhoneNumber phoneNumber; 11 | /** 12 | * @param fullName 13 | * @param age 14 | * @param height 15 | * @param weight 16 | * @param gender 17 | * @param phoneNumber 18 | */ 19 | public Person(String fullName, 20 | int age, 21 | double height, 22 | float weight, 23 | String gender, 24 | PhoneNumber phoneNumber) { 25 | this.fullName = fullName; 26 | this.age = age; 27 | this.height = height; 28 | this.weight = weight; 29 | this.gender = gender; 30 | this.phoneNumber = phoneNumber; 31 | } 32 | } -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/Sharder.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce; 2 | 3 | import java.io.Serializable; 4 | import java.nio.ByteBuffer; 5 | 6 | /** 7 | * Used to determine which shard an item belongs to. 
8 | * This is used when emitting data from Map to specify which reduce shard it should go to. 9 | * The only criteria that is required is that the same key always map to the same shard. 10 | * 11 | */ 12 | public interface Sharder extends Serializable { 13 | 14 | /** 15 | * @return the number of shards that are items may be assigned to. 16 | */ 17 | public int getNumShards(); 18 | 19 | /** 20 | * @param key The serialized key. (The ByteBuffer should be unmodified by the implementation) 21 | * @return a number between 0 and numShards-1 inclusive 22 | */ 23 | public int getShardForKey(ByteBuffer key); 24 | } 25 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/DatastoreInputReader.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.inputs; 4 | 5 | import com.google.appengine.api.datastore.Entity; 6 | import com.google.appengine.api.datastore.Query; 7 | import com.google.common.base.Functions; 8 | 9 | /** 10 | * An InputReader for Datastore entities. 11 | */ 12 | public class DatastoreInputReader extends BaseDatastoreInputReader { 13 | 14 | private static final long serialVersionUID = -2164845668646485549L; 15 | private static final long AVERAGE_ENTITY_SIZE = 100 * 1024; 16 | 17 | public DatastoreInputReader(Query query) { 18 | super(query, Functions.identity()); 19 | } 20 | 21 | @Override 22 | protected long getAvgElementSize() { 23 | return AVERAGE_ENTITY_SIZE; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/BaseMapper.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Google Inc. All Rights Reserved. 
2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | /** 6 | * Abstract class for Map function. 7 | * 8 | * @param type of input received 9 | * @param type of intermediate values produced 10 | */ 11 | abstract class BaseMapper> extends Worker { 12 | 13 | private static final long serialVersionUID = -6551234158528563026L; 14 | 15 | /** 16 | * Processes a single input value, emitting output through the context returned by 17 | * {@link Worker#getContext} or {@link #emit}. 18 | */ 19 | public abstract void map(I value); 20 | 21 | /** 22 | * Syntactic sugar for {@code getContext().emit(value)} 23 | */ 24 | protected void emit(O value) { 25 | getContext().emit(value); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | AppEngine Mapreduce library 2 | =========================== 3 | 4 | [![Build Status](https://travis-ci.org/GoogleCloudPlatform/appengine-mapreduce.svg?branch=master)](https://travis-ci.org/GoogleCloudPlatform/appengine-mapreduce) 5 | 6 | Official site: https://github.com/GoogleCloudPlatform/appengine-mapreduce 7 | 8 | Check the site for up to date status, latest version, getting started & user 9 | guides and other documentation. 10 | 11 | Archive contents: 12 | - python : python version of the library resides here 13 | - build.sh : use this to run tests for python library, build and run demo app 14 | - src : python source code for mapreduce library 15 | - tests : tests for mapreduce library 16 | - demo : a demo application that uses the map reduce. 
17 | - java : java version of the library 18 | - build.xml : ant build file 19 | -------------------------------------------------------------------------------- /java/example/src/com/google/appengine/demos/mapreduce/entitycount/CountReducer.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.demos.mapreduce.entitycount; 2 | 3 | import com.google.appengine.tools.mapreduce.KeyValue; 4 | import com.google.appengine.tools.mapreduce.Reducer; 5 | import com.google.appengine.tools.mapreduce.ReducerInput; 6 | 7 | /** 8 | * Sums a list of numbers. The key identifies the counter, the output value is the sum of all input 9 | * values for the given key. 10 | * 11 | * @author ohler@google.com (Christian Ohler) 12 | */ 13 | class CountReducer extends Reducer> { 14 | 15 | private static final long serialVersionUID = 1316637485625852869L; 16 | 17 | @Override 18 | public void reduce(String key, ReducerInput values) { 19 | long total = 0; 20 | while (values.hasNext()) { 21 | total += values.next(); 22 | } 23 | emit(KeyValue.of(key, total)); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /python/src/mapreduce/lib/input_reader/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2015 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Various libraries regarding input readers and input in general.""" 16 | from mapreduce.lib.input_reader._gcs import GCSInputReader 17 | from mapreduce.lib.input_reader._gcs import GCSRecordInputReader 18 | from mapreduce.lib.input_reader._gcs import PathFilter 19 | 20 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/pipeline/ResultAndStatus.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.pipeline; 2 | 3 | import com.google.appengine.tools.mapreduce.MapReduceResult; 4 | import com.google.appengine.tools.mapreduce.impl.shardedjob.Status; 5 | 6 | import java.io.Serializable; 7 | 8 | /** 9 | * A holder for MR result and its status. 10 | * 11 | * @param the type of {@code MapReduceResult} content. 
12 | */ 13 | public final class ResultAndStatus implements Serializable { 14 | 15 | private static final long serialVersionUID = 7867829838406777714L; 16 | 17 | private final MapReduceResult result; 18 | private final Status status; 19 | 20 | public ResultAndStatus(MapReduceResult result, Status status) { 21 | this.result = result; 22 | this.status = status; 23 | } 24 | 25 | public MapReduceResult getResult() { 26 | return result; 27 | } 28 | 29 | public Status getStatus() { 30 | return status; 31 | } 32 | } -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/pipeline/FinalizeShardedJob.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.shardedjob.pipeline; 2 | 3 | import com.google.appengine.tools.mapreduce.impl.shardedjob.Status; 4 | import com.google.appengine.tools.pipeline.Job; 5 | 6 | /** 7 | * A pipeline job for finalizing the job and cleaning up unnecessary state. 
8 | */ 9 | public class FinalizeShardedJob extends AbstractShardedJob { 10 | 11 | private static final long serialVersionUID = -6850669259843382958L; 12 | private final Status status; 13 | 14 | public FinalizeShardedJob(String jobId, int taskCount, Status status) { 15 | super(jobId, taskCount); 16 | this.status = status; 17 | } 18 | 19 | @Override 20 | protected Job createShardsJob(int start, int end) { 21 | return new FinalizeShardsInfos(getJobId(), status, start, end); 22 | } 23 | 24 | @Override 25 | public String getJobDisplayName() { 26 | return "FinalizeShardedJob: " + getJobId(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /java/example/src/com/google/appengine/demos/mapreduce/entitycount/DeleteEntityMapper.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.demos.mapreduce.entitycount; 2 | 3 | import com.google.appengine.api.datastore.Key; 4 | import com.google.appengine.tools.mapreduce.DatastoreMutationPool; 5 | import com.google.appengine.tools.mapreduce.MapOnlyMapper; 6 | 7 | /** 8 | * Deletes datastore entities. 
9 | */ 10 | public class DeleteEntityMapper extends MapOnlyMapper { 11 | 12 | private static final long serialVersionUID = -6485226450501339416L; 13 | 14 | // [START datastoreMutationPool] 15 | private transient DatastoreMutationPool batcher; 16 | // [END datastoreMutationPool] 17 | 18 | // [START begin_and_endSlice] 19 | @Override 20 | public void beginSlice() { 21 | batcher = DatastoreMutationPool.create(); 22 | } 23 | 24 | @Override 25 | public void endSlice() { 26 | batcher.flush(); 27 | } 28 | // [END begin_and_endSlice] 29 | 30 | @Override 31 | public void map(Key key) { 32 | batcher.delete(key); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobController.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 4 | 5 | import java.io.Serializable; 6 | import java.util.Iterator; 7 | 8 | /** 9 | * Aggregates results from {@link IncrementalTask}s and receives notification 10 | * when the job completes. 11 | * 12 | * @author ohler@google.com (Christian Ohler) 13 | * 14 | * @param the type of the incremental task 15 | */ 16 | public abstract class ShardedJobController implements Serializable { 17 | 18 | private static final long serialVersionUID = 6209078163062384156L; 19 | 20 | /** 21 | * Called when the sharded job has completed successfully. 22 | */ 23 | public abstract void completed(Iterator completedTasks); 24 | 25 | /** 26 | * Called when the sharded job has failed to complete successfully. 
27 | * @param status 28 | */ 29 | public abstract void failed(Status status); 30 | } 31 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/MapReduceJobException.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Google Inc. All Rights Reserved. 2 | package com.google.appengine.tools.mapreduce; 3 | 4 | import com.google.appengine.tools.mapreduce.impl.shardedjob.Status; 5 | 6 | /** 7 | * An exception that is thrown upon MapReduceJob failure. 8 | */ 9 | public final class MapReduceJobException extends RuntimeException { 10 | 11 | private static final long serialVersionUID = 2875093254119004898L; 12 | private final String stage; 13 | 14 | public MapReduceJobException(String stage, Status status) { 15 | super("Stage " + stage + " was not completed successfuly (status=" + status.getStatusCode() 16 | + ", message=" + status.getException() + ")", status.getException()); 17 | this.stage = stage; 18 | } 19 | 20 | /** 21 | * Returns a string representing the MapReduce stage that failed. 22 | * The exception propagated from the stage can be fetched by {@link #getCause()}. 23 | */ 24 | public String getFailedStage() { 25 | return stage; 26 | } 27 | } -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/reducers/ValueProjectionReducer.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.reducers; 4 | 5 | import com.google.appengine.tools.mapreduce.Reducer; 6 | import com.google.appengine.tools.mapreduce.ReducerInput; 7 | 8 | /** 9 | * Reducer that emits the values that occur in its input, discarding the keys. 
10 | * 11 | * @author ohler@google.com (Christian Ohler) 12 | * 13 | * @param type of keys (discarded) 14 | * @param type of values 15 | */ 16 | public class ValueProjectionReducer extends Reducer { 17 | 18 | private static final long serialVersionUID = 990027274731447358L; 19 | 20 | public static ValueProjectionReducer create() { 21 | return new ValueProjectionReducer<>(); 22 | } 23 | 24 | public ValueProjectionReducer() { 25 | } 26 | 27 | @Override 28 | public void reduce(K key, ReducerInput values) { 29 | while (values.hasNext()) { 30 | emit(values.next()); 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/ShardContext.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | 6 | /** 7 | * Context for each shard. 8 | */ 9 | public interface ShardContext extends Context { 10 | 11 | /** 12 | * Returns the total number of shards. 13 | */ 14 | int getShardCount(); 15 | 16 | /** 17 | * Returns the number of this mapper or reducer shard (zero-based). 18 | */ 19 | int getShardNumber(); 20 | 21 | /** 22 | * Returns a {@link Counters} object for doing simple aggregate calculations. 23 | */ 24 | Counters getCounters(); 25 | 26 | /** 27 | * Returns the {@link Counter} with the given name. 28 | */ 29 | Counter getCounter(String name); 30 | 31 | /** 32 | * Increments the {@link Counter} with the given name by {@code delta}. 33 | */ 34 | void incrementCounter(String name, long delta); 35 | 36 | /** 37 | * Increments the {@link Counter} with the given name by 1. 
38 | */ 39 | void incrementCounter(String name); 40 | } 41 | -------------------------------------------------------------------------------- /python/src/mapreduce/operation/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2010 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """Operations which can be yielded from mappers. 18 | 19 | Operation is callable that takes context.Context as a parameter. 20 | Operations are called during mapper execution immediately 21 | on recieving from handler function. 22 | """ 23 | 24 | 25 | 26 | # These are all relative imports. 27 | import db 28 | import counters 29 | from base import Operation 30 | 31 | __all__ = ['db', 'counters', 'Operation'] 32 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/reducers/KeyProjectionReducer.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.reducers; 4 | 5 | import com.google.appengine.tools.mapreduce.Reducer; 6 | import com.google.appengine.tools.mapreduce.ReducerInput; 7 | import com.google.common.base.Preconditions; 8 | 9 | /** 10 | * Reducer that emits the keys that occur in its input, discarding the values. 
11 | * 12 | * @author ohler@google.com (Christian Ohler) 13 | * 14 | * @param type of keys 15 | * @param type of values (discarded) 16 | */ 17 | public class KeyProjectionReducer extends Reducer { 18 | 19 | private static final long serialVersionUID = 466599637876532403L; 20 | 21 | public static KeyProjectionReducer create() { 22 | return new KeyProjectionReducer<>(); 23 | } 24 | 25 | public KeyProjectionReducer() { 26 | } 27 | 28 | @Override 29 | public void reduce(K key, ReducerInput values) { 30 | Preconditions.checkState(values.hasNext(), "%s: No values: %s", this, key); 31 | emit(key); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /python/src/mapreduce/operation/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2010 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """Base operation class.""" 18 | 19 | 20 | 21 | __all__ = ['Operation'] 22 | 23 | 24 | class Operation(object): 25 | """Base class for all mapper operations. 26 | 27 | All operations should implement __call__(self, ctx) function, which will be 28 | called upon operation yield. 
29 | """ 30 | 31 | def __call__(self, ctx): 32 | raise NotImplementedError("__call__() not implemented in %s" % 33 | self.__class__) 34 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/bigqueryjobs/BigQueryLoadJobReference.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.bigqueryjobs; 2 | 3 | import com.google.api.services.bigquery.model.JobReference; 4 | import com.google.appengine.tools.mapreduce.Marshallers; 5 | import com.google.appengine.tools.mapreduce.impl.util.SerializableValue; 6 | 7 | import java.io.Serializable; 8 | 9 | /** 10 | * Result of the bigquery load files pipeline job. 11 | */ 12 | public class BigQueryLoadJobReference implements Serializable { 13 | 14 | private static final long serialVersionUID = -5045977572520245900L; 15 | private final String status; 16 | private final SerializableValue jobReference; 17 | 18 | public BigQueryLoadJobReference(String status, JobReference jobReference) { 19 | this.status = status; 20 | this.jobReference = SerializableValue.of( 21 | Marshallers.getGenericJsonMarshaller(JobReference.class), jobReference); 22 | } 23 | 24 | public String getStatus() { 25 | return status; 26 | } 27 | 28 | public JobReference getJobReference() { 29 | return jobReference.getValue(); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobHandler.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 2 | 3 | /** 4 | * As part of its operation, the {@code ShardedJobService} will enqueue task 5 | * queue tasks that send requests to the URLs specified in 6 | * {@link ShardedJobSettings}. 
/**
 * As part of its operation, the {@code ShardedJobService} will enqueue task queue tasks that send
 * requests to the URLs specified in {@link ShardedJobSettings}. It is the user's responsibility to
 * arrange for these requests to be passed back into {@link #completeShard} and {@link #runTask}.
 */
public interface ShardedJobHandler {

  // Request parameter names used by the servlets that invoke these callbacks.
  // (Interface fields are implicitly public static final.)
  String JOB_ID_PARAM = "job";
  String TASK_ID_PARAM = "task";
  String SEQUENCE_NUMBER_PARAM = "seq";

  /**
   * Invoked by the servlet that handles {@link ShardedJobSettings#getControllerPath} when a shard
   * has completed.
   */
  void completeShard(String jobId, String taskId);

  /**
   * Invoked by the servlet that handles {@link ShardedJobSettings#getWorkerPath} to run a task.
   */
  void runTask(String jobId, String taskId, int sequenceNumber);
}
A 28 | * required column must not be left null or empty which loading data. By default it is nullable. 29 | */ 30 | BigQueryFieldMode mode() default BigQueryFieldMode.NULLABLE; 31 | } 32 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/Mapper.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | /** 6 | * Map function for MapReduce computations. A map function processes input 7 | * values one at a time and generates zero or more output key-value pairs for 8 | * each. It emits the generated pairs to the {@link Reducer} through the 9 | * {@link MapperContext}. 10 | * 11 | *

This class is really an interface that might be evolving. In order to avoid breaking 12 | * users when we change the interface, we made it an abstract class.

13 | * 14 | * 15 | * @param type of input received 16 | * @param type of intermediate keys produced 17 | * @param type of intermediate values produced 18 | */ 19 | public abstract class Mapper extends BaseMapper, MapperContext> { 20 | 21 | private static final long serialVersionUID = 1966174340710715049L; 22 | 23 | /** 24 | * Syntactic sugar for {@code emit(KeyValue.of(key, value))} 25 | */ 26 | protected void emit(K key, V value) { 27 | getContext().emit(key, value); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/DatastoreKeyInputReader.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.inputs; 4 | 5 | import com.google.appengine.api.datastore.Entity; 6 | import com.google.appengine.api.datastore.Key; 7 | import com.google.appengine.api.datastore.Query; 8 | import com.google.common.base.Function; 9 | import com.google.common.base.Preconditions; 10 | 11 | /** 12 | * An InputReader for Datastore entity keys. 
13 | */ 14 | public class DatastoreKeyInputReader extends BaseDatastoreInputReader { 15 | 16 | private static final long serialVersionUID = 846982034548442467L; 17 | private static final long AVERAGE_KEY_SIZE = 256; 18 | 19 | private enum EntityToKeyFunction implements Function { 20 | INSTANCE; 21 | 22 | @Override 23 | public Key apply(Entity entity) { 24 | return entity.getKey(); 25 | } 26 | } 27 | 28 | public DatastoreKeyInputReader(Query query) { 29 | super(query, EntityToKeyFunction.INSTANCE); 30 | Preconditions.checkArgument(query.isKeysOnly()); 31 | } 32 | 33 | @Override 34 | protected long getAvgElementSize() { 35 | return AVERAGE_KEY_SIZE; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/InProcessShardedJobRunner.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 4 | 5 | import com.google.common.base.Preconditions; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * Runs a sharded job in the current process. Only for very small jobs. Easier 12 | * to debug than a parallel execution. 13 | * 14 | * @author ohler@google.com (Christian Ohler) 15 | */ 16 | public class InProcessShardedJobRunner { 17 | 18 | private InProcessShardedJobRunner() { 19 | } 20 | 21 | /** 22 | * Runs the given job and returns its result. 
23 | */ 24 | public static void runJob( 25 | List initialTasks, ShardedJobController controller) { 26 | List results = new ArrayList<>(); 27 | for (T task : initialTasks) { 28 | Preconditions.checkNotNull(task, "Null initial task: %s", initialTasks); 29 | task.prepare(); 30 | do { 31 | task.run(); 32 | } while (!task.isDone()); 33 | task.cleanup(); 34 | results.add(task); 35 | } 36 | controller.completed(results.iterator()); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/mappers/KeyProjectionMapper.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.appengine.tools.mapreduce.mappers; 16 | 17 | import com.google.appengine.tools.mapreduce.Mapper; 18 | 19 | /** 20 | * A pass through mapper that passes the input to the output key. 
21 | * 22 | * @param type input that is passed on as the key 23 | */ 24 | public class KeyProjectionMapper extends Mapper { 25 | 26 | private static final long serialVersionUID = -3998292521173820259L; 27 | 28 | @Override 29 | public void map(T value) { 30 | emit(value, null); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /python/src/mapreduce/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2010 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import logging 18 | import os 19 | 20 | version = os.environ.get('CURRENT_VERSION_ID', '').split('.')[0] 21 | 22 | if (__name__ == 'google.appengine.ext.mapreduce' 23 | and version != 'ah-builtin-python-bundle'): 24 | msg = ('You should not use the mapreduce library that is bundled with the' 25 | ' SDK. 
You can use the PyPi package at' 26 | ' https://pypi.python.org/pypi/GoogleAppEngineMapReduce or use the ' 27 | 'source at https://github.com/GoogleCloudPlatform/appengine-mapreduce ' 28 | 'instead.') 29 | logging.warn(msg) 30 | 31 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/util/BigQueryDataTypeUtil.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.util; 2 | 3 | import com.google.common.collect.ImmutableMap; 4 | 5 | import java.util.Calendar; 6 | import java.util.Date; 7 | import java.util.Map; 8 | 9 | /** 10 | * Utility class for converting java types to BigQuery data types 11 | */ 12 | public final class BigQueryDataTypeUtil { 13 | 14 | private static final Map, String> javaTypeToBigQueryType = 15 | new ImmutableMap.Builder, String>() 16 | .put(Integer.class, "integer") 17 | .put(Float.class, "float") 18 | .put(Boolean.class, "boolean") 19 | .put(String.class, "string") 20 | .put(Date.class, "timestamp") 21 | .put(Calendar.class, "timestamp") 22 | .build(); 23 | 24 | /** 25 | * @param parameterType java primitive, wrapper or String types 26 | * @return BigQuery data type 27 | * */ 28 | public static String getBigQueryType(Class parameterType) { 29 | return javaTypeToBigQueryType.get(parameterType); 30 | } 31 | 32 | public static boolean isSimpleBigQueryType(Class parameterType) { 33 | return javaTypeToBigQueryType.containsKey(parameterType); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /java/example/shuffler/WEB-INF/appengine-web.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | mapreduce-example 4 | shuffler 5 | one 6 | true 7 | 8 | 9 | F4 10 | 11 | 1000ms 12 | 13 | 19 | 4 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 
-------------------------------------------------------------------------------- /java/example/mapreduce/WEB-INF/appengine-web.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | mapreduce-example 4 | mapreduce 5 | one 6 | true 7 | 8 | 9 | F4 10 | 11 | 1000ms 12 | 13 | 19 | 4 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /python/src/mapreduce/api/map_job/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2015 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Map job package.""" 16 | 17 | # All the public API should be imported here. 18 | # 1. Seasoned Python user should simply import this package. 19 | # 2. Other users may import individual files so filenames should still have 20 | # "map_job" prefix. But adding the prefix won't mandate the first type 21 | # of user to type more. 22 | # 3. Class names should not have "map_job" prefix. 
/**
 * Convenience methods related to splitting lists into shards.
 */
public class SplitUtil {

  // Static utility class; not instantiable.
  private SplitUtil() {
  }

  /**
   * Partitions {@code input} into at most {@code numSplits} lists whose sizes differ by at most
   * one, preserving relative order (unless randomized). Empty splits are omitted from the result,
   * so fewer than {@code numSplits} lists are returned when the input is small.
   *
   * @param input the items to distribute
   * @param numSplits the desired number of splits; must be positive
   * @param randomize if true, items are shuffled first (with a fixed seed, for determinism)
   * @return the non-empty splits, in order
   */
  public static <T> List<List<T>> split(List<T> input, int numSplits, boolean randomize) {
    ArrayList<T> toSplit = new ArrayList<>(input);
    if (randomize) {
      Collections.shuffle(toSplit, new Random(0L)); // Fixing seed for determinism
    }
    int minItemsPerShard = input.size() / numSplits;
    int remainder = input.size() % numSplits;
    ArrayList<List<T>> result = new ArrayList<>();
    int posInList = 0;
    for (int shard = 0; shard < numSplits; shard++) {
      // The first `remainder` shards each take one extra item.
      int numItems = shard < remainder ? minItemsPerShard + 1 : minItemsPerShard;
      if (numItems > 0) {
        result.add(new ArrayList<>(toSplit.subList(posInList, posInList + numItems)));
        posInList += numItems;
      }
    }
    if (posInList != toSplit.size()) {
      throw new IllegalStateException(); // Impossible
    }
    return result;
  }
}
The seeds: " + collidingSeeds 24 | + " all generaged the value " + valueGenerated); 25 | emit(collidingSeeds); 26 | } 27 | } 28 | // [END reduce_example] 29 | } 30 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/mappers/IdentityMapper.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package com.google.appengine.tools.mapreduce.mappers; 16 | 17 | import com.google.appengine.tools.mapreduce.KeyValue; 18 | import com.google.appengine.tools.mapreduce.Mapper; 19 | 20 | /** 21 | * A mapper that passes an incoming KeyValue to it's output. 22 | * 23 | * @param the type of the key 24 | * @param the type of the value 25 | */ 26 | public class IdentityMapper extends Mapper, K, V> { 27 | 28 | private static final long serialVersionUID = -8243493999040989299L; 29 | 30 | @Override 31 | public void map(KeyValue input) { 32 | emit(input.getKey(), input.getValue()); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobState.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 
2 | 3 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 4 | 5 | 6 | 7 | /** 8 | * Information about execution and progress of a sharded job. 9 | * 10 | * Undefined behavior results if any of the values (such as the return value of 11 | * getSettings()) are mutated. 12 | * 13 | * @author ohler@google.com (Christian Ohler) 14 | */ 15 | public interface ShardedJobState { 16 | 17 | /** 18 | * Returns the ID of this job. 19 | */ 20 | String getJobId(); 21 | 22 | /** 23 | * Returns the execution settings of this job. 24 | */ 25 | ShardedJobSettings getSettings(); 26 | 27 | /** 28 | * Returns the total number of tasks (not including follow-up tasks) that this 29 | * job consists of. 30 | */ 31 | int getTotalTaskCount(); 32 | 33 | /** 34 | * Returns the number of tasks or follow-up tasks that are currently active. 35 | */ 36 | int getActiveTaskCount(); 37 | 38 | /** 39 | * Returns the time this job was started. 40 | */ 41 | long getStartTimeMillis(); 42 | 43 | /** 44 | * Returns the time this job's state was last updated. 45 | */ 46 | long getMostRecentUpdateTimeMillis(); 47 | 48 | /** 49 | * Returns whether this job is running, finished, etc. 50 | */ 51 | Status getStatus(); 52 | } 53 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/HashingSharder.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl; 2 | 3 | import static com.google.common.base.Preconditions.checkArgument; 4 | 5 | import com.google.appengine.tools.mapreduce.Sharder; 6 | import com.google.appengine.tools.mapreduce.impl.util.SerializationUtil; 7 | import com.google.common.hash.HashFunction; 8 | import com.google.common.hash.Hashing; 9 | 10 | import java.nio.ByteBuffer; 11 | 12 | /** 13 | * Splits input by hashing the key. 
14 | * 15 | */ 16 | public class HashingSharder implements Sharder { 17 | 18 | private static final long serialVersionUID = 7967187256546710108L; 19 | private static final HashFunction HASH = Hashing.murmur3_32(); 20 | private final int numShards; 21 | 22 | public HashingSharder(int numShards) { 23 | this.numShards = numShards; 24 | checkArgument(numShards > 0); 25 | } 26 | 27 | @Override 28 | public int getNumShards() { 29 | return numShards; 30 | } 31 | 32 | @Override 33 | public int getShardForKey(ByteBuffer key) { 34 | byte[] bytes = SerializationUtil.getBytes(key); 35 | int hash = (HASH.hashBytes(bytes).asInt()) & Integer.MAX_VALUE; // Keeping positive 36 | // Dividing integer range rather than using modulo so as to avoid rewriting entries if they are 37 | // re-hashed. 38 | return hash / (Integer.MAX_VALUE / numShards + 1); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/GoogleCloudStorageLevelDbOutputWriter.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.outputs; 2 | 3 | import static com.google.appengine.tools.mapreduce.impl.MapReduceConstants.GCS_IO_BLOCK_SIZE; 4 | import static com.google.appengine.tools.mapreduce.impl.util.LevelDbConstants.BLOCK_SIZE; 5 | 6 | import com.google.appengine.tools.mapreduce.OutputWriter; 7 | 8 | import java.io.IOException; 9 | import java.nio.ByteBuffer; 10 | 11 | /** 12 | * A decorator for LevelDbOutputWriter that delegates to {@link GoogleCloudStorageFileOutputWriter} 13 | * and pads blocks to GCS write boundaries on end of slice. 14 | * This is needed because GCS requires data to be passed in in 256kb but LevelDb uses 32kb 15 | * blocks this class provides a way get this class to pad the output by writing empty blocks. 
16 | * 17 | */ 18 | public class GoogleCloudStorageLevelDbOutputWriter extends LevelDbOutputWriter { 19 | private static final long serialVersionUID = 6507809614070157553L; 20 | 21 | public GoogleCloudStorageLevelDbOutputWriter(OutputWriter delegate) { 22 | super(delegate); 23 | } 24 | 25 | @Override 26 | public void endSlice() throws IOException { 27 | padAndWriteBlock(false); 28 | while ((getNumBlocksWritten() * BLOCK_SIZE) % GCS_IO_BLOCK_SIZE != 0) { 29 | padAndWriteBlock(true); 30 | } 31 | getDelegate().endSlice(); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /java/example/src/com/google/appengine/demos/mapreduce/bigqueryload/RandomBigQueryDataCreator.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.demos.mapreduce.bigqueryload; 2 | 3 | import com.google.appengine.tools.mapreduce.MapOnlyMapper; 4 | import com.google.common.collect.Lists; 5 | 6 | import java.util.Date; 7 | import java.util.Random; 8 | 9 | public class RandomBigQueryDataCreator extends MapOnlyMapper { 10 | 11 | private static final long serialVersionUID = -4247519870584497230L; 12 | private static Random r; 13 | 14 | @Override 15 | public void map(Long value) { 16 | SampleTable toWrite = getSampleTableData(value); 17 | 18 | emit(toWrite); 19 | } 20 | 21 | public static SampleTable getSampleTableData(Long value) { 22 | r = new Random(value); 23 | SampleTable toWrite = new SampleTable(value, 24 | randomInt(), 25 | String.format("colvalue %d", randomInt()), 26 | Lists.newArrayList(String.format("column value %d", randomInt()), 27 | String.format("colvalue %d", randomInt())), 28 | new String[] {String.format("column value %d", randomInt()), 29 | String.format("column value %d", randomInt())}, 30 | new SampleNestedRecord(randomInt(), String.format("column value %d", randomInt())), 31 | new Date(randomInt())); 32 | return toWrite; 33 | } 34 | 35 | private static int randomInt() 
{ 36 | return r.nextInt(); 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobServiceImpl.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 4 | 5 | import java.util.Iterator; 6 | import java.util.List; 7 | 8 | /** 9 | * Implementation of {@link ShardedJobService}. 10 | * 11 | * @author ohler@google.com (Christian Ohler) 12 | */ 13 | class ShardedJobServiceImpl implements ShardedJobService { 14 | 15 | @Override 16 | public void startJob( 17 | String jobId, 18 | List initialTasks, 19 | ShardedJobController controller, 20 | ShardedJobSettings settings) { 21 | new ShardedJobRunner().startJob(jobId, initialTasks, controller, settings); 22 | } 23 | 24 | @Override 25 | public ShardedJobState getJobState(String jobId) { 26 | return new ShardedJobRunner<>().getJobState(jobId); 27 | } 28 | 29 | @Override 30 | public Iterator> lookupTasks(ShardedJobState state) { 31 | return new ShardedJobRunner<>().lookupTasks(state.getJobId(), state.getTotalTaskCount(), true); 32 | } 33 | 34 | @Override 35 | public void abortJob(String jobId) { 36 | new ShardedJobRunner<>().abortJob(jobId); 37 | } 38 | 39 | @Override 40 | public boolean cleanupJob(String jobId) { 41 | return new ShardedJobRunner<>().cleanupJob(jobId); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/MarshallingOutputWriter.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.outputs; 2 | 3 | import static com.google.common.base.Preconditions.checkNotNull; 4 | 5 | import com.google.appengine.tools.mapreduce.Marshaller; 6 | import 
com.google.appengine.tools.mapreduce.OutputWriter; 7 | 8 | import java.io.IOException; 9 | import java.nio.ByteBuffer; 10 | 11 | /** 12 | * An {@link OutputWriter} that marshalls records. 13 | * 14 | * @param the type of OutputWriter that this will become. (The type of the values that will be 15 | * written to this class) 16 | */ 17 | public class MarshallingOutputWriter extends ForwardingOutputWriter { 18 | 19 | private static final long serialVersionUID = -1441650908652534613L; 20 | 21 | private final Marshaller marshaller; 22 | private final OutputWriter writer; 23 | 24 | public MarshallingOutputWriter(OutputWriter writer, 25 | Marshaller marshaller) { 26 | this.writer = checkNotNull(writer, "No writer"); 27 | this.marshaller = checkNotNull(marshaller, "No marshaller"); 28 | } 29 | 30 | @Override 31 | protected OutputWriter getDelegate() { 32 | return writer; 33 | } 34 | 35 | @Override 36 | public void write(O value) throws IOException { 37 | ByteBuffer bytes = marshaller.toBytes(value); 38 | writer.write(bytes); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/ReducerInputs.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.impl; 4 | 5 | import com.google.appengine.tools.mapreduce.ReducerInput; 6 | 7 | import java.util.Iterator; 8 | 9 | /** 10 | * Utilities related to {@link ReducerInput}. 
11 | * 12 | * @author ohler@google.com (Christian Ohler) 13 | */ 14 | public class ReducerInputs { 15 | 16 | private ReducerInputs() {} 17 | 18 | private static class IteratorReducerInput extends ReducerInput { 19 | 20 | private final Iterator i; 21 | 22 | public IteratorReducerInput(Iterator i) { 23 | this.i = i; 24 | } 25 | 26 | @Override 27 | public boolean hasNext() { 28 | return i.hasNext(); 29 | } 30 | 31 | @Override 32 | public V next() { 33 | return i.next(); 34 | } 35 | 36 | @Override 37 | public String toString() { 38 | return "ReducerInputs.fromIterator(" + i + ")"; 39 | } 40 | } 41 | 42 | public static ReducerInput fromIterator(Iterator i) { 43 | return new IteratorReducerInput<>(i); 44 | } 45 | 46 | public static ReducerInput fromIterable(final Iterable x) { 47 | return new IteratorReducerInput(x.iterator()) { 48 | @Override 49 | public String toString() { 50 | return "ReducerInputs.fromIterable(" + x + ")"; 51 | } 52 | }; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /python/test/mapreduce/api/map_job/map_job_config_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import unittest 3 | 4 | from mapreduce import parameters 5 | from mapreduce.api import map_job 6 | from mapreduce.api.map_job import sample_input_reader 7 | 8 | 9 | class MapJobConfigTest(unittest.TestCase): 10 | """Test for MapJobConfig. 11 | 12 | MapJobConfig is declarative. Thus most functional tests are already 13 | done by its parent class. 
/**
 * Shared constants for the BigQuery load pipeline.
 */
public final class BigQueryConstants {

  // Static constants holder; not instantiable.
  private BigQueryConstants() {}

  /**
   * BigQuery does not allow GCS files larger than 1 TB, so the per-file size
   * is capped at 500 GB.
   */
  public static final Long MAX_BIG_QUERY_GCS_FILE_SIZE = 500L * 1024 * 1024 * 1024;

  /**
   * BigQuery limit for total size across files per load.
   * NOTE(review): the original comment says to keep this greater than
   * {@link #MAX_BIG_QUERY_GCS_FILE_SIZE}, but the value is equal to it —
   * confirm the intended value.
   */
  public static final Long BIGQUERY_LOAD_DATA_SIZE_LIMIT = 500L * 1024 * 1024 * 1024;

  /** OAuth scope required for BigQuery API calls. */
  public static final String BQ_SCOPE = "https://www.googleapis.com/auth/bigquery";

  /** GCS object-name template: job id, then shard and file counters. */
  public static final String GCS_FILE_NAME_FORMAT =
      "BigQueryFilesToLoad/Job-%s/Shard-%%04d/file-%%04d";

  /** BigQuery type name for nested records. */
  public static final String RECORD_TYPE = "record";

  /** Maximum wait before the next status poll, in seconds. */
  public static final double MAX_TIME_BEFORE_NEXT_POLL = 30;

  /** Minimum wait before the next status poll, in seconds. */
  public static final double MIN_TIME_BEFORE_NEXT_POLL = 10;

  /** MIME type of the staged newline-delimited JSON files. */
  public static final String MIME_TYPE = "application/json";

  /** Record separator for newline-delimited JSON. */
  public static final String NEWLINE_CHARACTER = "\n";

  /** Maximum number of retry attempts. */
  public static final Integer MAX_RETRIES = 5;

  /** Default slice duration in milliseconds. */
  public static final int DEFAULT_MILLIS_PER_SLICE = 30_000;

  /** Default number of retries for a failed shard. */
  public static final int DEFAULT_SHARD_RETRIES = 4;

  /** Default number of retries for a failed slice. */
  public static final int DEFAULT_SLICE_RETRIES = 20;
}
12 | * 13 | * @author ohler@google.com (Christian Ohler) 14 | * 15 | * @param type of keys formally (but not actually) accepted by this reducer 16 | * @param type of values formally (but not actually) accepted by this reducer 17 | * @param type of output formally (but not actually) emitted by this reducer\ 18 | * 19 | * @deprecated Consider using {@link com.google.appengine.tools.mapreduce.MapJob} instead. 20 | */ 21 | @Deprecated 22 | public class NoReducer extends Reducer { 23 | 24 | private static final long serialVersionUID = 904068928342205092L; 25 | 26 | public static NoReducer create() { 27 | return new NoReducer<>(); 28 | } 29 | 30 | public NoReducer() { 31 | } 32 | 33 | @Override 34 | public void reduce(K key, ReducerInput values) { 35 | throw new JobFailureException( 36 | getClass().getSimpleName() + ": reduce function was called for " + key); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/KeyValue.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | 6 | import java.io.Serializable; 7 | import java.util.Objects; 8 | 9 | /** 10 | * Key-value pair. 
11 | * 12 | * 13 | * @param key type 14 | * @param value type 15 | */ 16 | public class KeyValue implements Serializable { 17 | 18 | private static final long serialVersionUID = -2687854533615172943L; 19 | 20 | private final K key; 21 | private final V value; 22 | 23 | public KeyValue(K key, V value) { 24 | this.key = key; 25 | this.value = value; 26 | } 27 | 28 | public K getKey() { 29 | return key; 30 | } 31 | 32 | public V getValue() { 33 | return value; 34 | } 35 | 36 | @Override 37 | public String toString() { 38 | return "KeyValue(" + key + ", " + value + ")"; 39 | } 40 | 41 | @Override 42 | public final boolean equals(Object o) { 43 | if (o == this) { 44 | return true; 45 | } 46 | if (!(o instanceof KeyValue)) { 47 | return false; 48 | } 49 | KeyValue other = (KeyValue) o; 50 | return Objects.equals(key, other.key) && Objects.equals(value, other.value); 51 | } 52 | 53 | @Override 54 | public final int hashCode() { 55 | return Objects.hash(key, value); 56 | } 57 | 58 | public static KeyValue of(K k, V v) { 59 | return new KeyValue<>(k, v); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /python/test/mapreduce/operation/counters_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2010 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
/**
 * Turns objects of type {@code T} into bytes and back.
 *
 * <p>NOTE(review): the type parameter {@code <T>} was stripped from this
 * rendering of the source (the javadoc still documents it); restored below.
 *
 * @author ohler@google.com (Christian Ohler)
 *
 * @param <T> type to be marshalled or unmarshalled
 */
public abstract class Marshaller<T> implements Serializable {
  private static final long serialVersionUID = 183874105234660517L;

  /**
   * Returns a new {@code ByteBuffer} {@code b} with a serialized representation
   * of {@code object} between {@code b.position()} and {@code b.limit()}.
   * {@code b.order()} is undefined.
   */
  public abstract ByteBuffer toBytes(T object);

  /**
   * Returns the object whose serialized representation is in {@code b} between
   * {@code b.position()} and {@code b.limit()}. The value of {@code b.order()}
   * when the method is called is undefined, and this method may modify it as
   * well as {@code b.position()} and {@code b.limit()}.
   *
   * <p>The method may throw a {@link RuntimeException} if it determines that
   * the sequence of bytes in {@code b} was not generated by {@link #toBytes}.
   * This includes corrupted data as well as trailing bytes.
   */
  public abstract T fromBytes(ByteBuffer b);
}
class Increment(base.Operation):
  """Operation that increments a named mapreduce counter by a fixed delta.

  Deprecated per the module header: use map_job_context.SliceContext.count
  instead.
  """

  def __init__(self, counter_name, delta=1):
    """Constructor.

    Args:
      counter_name: name of the counter as string.
      delta: increment delta as int; defaults to 1.
    """
    self.counter_name = counter_name
    self.delta = delta

  def __call__(self, context):
    """Execute operation: apply the increment to the context's counters.

    Args:
      context: mapreduce context as context.Context.
    """
    # Reaches into the context's private counter aggregator; the module-level
    # "pylint: disable=protected-access" covers this access.
    context._counters.increment(self.counter_name, self.delta)
", closed" : " so far") 20 | + ")"; 21 | } 22 | 23 | @Override 24 | public void beginShard() { 25 | closed = false; 26 | accu.clear(); 27 | } 28 | 29 | @Override 30 | public void beginSlice() { 31 | slice = Lists.newArrayList(); 32 | } 33 | 34 | @Override 35 | public void write(O value) { 36 | Preconditions.checkState(!closed, "%s: Already closed", this); 37 | slice.add(value); 38 | } 39 | 40 | @Override 41 | public void endSlice() { 42 | accu.addAll(slice); 43 | slice = null; 44 | } 45 | 46 | @Override 47 | public void endShard() { 48 | closed = true; 49 | } 50 | 51 | @Override 52 | public boolean allowSliceRetry() { 53 | return true; 54 | } 55 | 56 | public List getResult() { 57 | return accu; 58 | } 59 | } -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/Reducer.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce; 4 | 5 | /** 6 | * Reduce function for use in MapReduce. Called once for each key, together 7 | * with the sequence of all values for that key that the map phase produced. 8 | * Can emit output values through the context. 9 | * 10 | *

This class is really an interface that might be evolving. In order to 11 | * avoid breaking users when we change the interface, we made it an abstract 12 | * class. 13 | * 14 | * @author ohler@google.com (Christian Ohler) 15 | * 16 | * @param type of intermediate keys received 17 | * @param type of intermediate values received 18 | * @param type of output values produced 19 | */ 20 | public abstract class Reducer extends Worker> { 21 | private static final long serialVersionUID = 1622389951004432376L; 22 | 23 | /** 24 | * Processes the values for a given key, using the context returned by 25 | * {@link Worker#getContext} to emit output to the {@link Output} of the MapReduce. 26 | * 27 | * {@code values} enumerates all values that the map phase produced for the 28 | * key {@code key}. It will always contain at least one value. 29 | */ 30 | public abstract void reduce(K key, ReducerInput values); 31 | 32 | /** 33 | * Syntactic sugar for {@code getContext().emit(value)} 34 | */ 35 | protected void emit(O value) { 36 | getContext().emit(value); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/Input.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | package com.google.appengine.tools.mapreduce; 3 | 4 | import java.io.IOException; 5 | import java.io.Serializable; 6 | import java.util.List; 7 | 8 | /** 9 | * Input is the data source specification for the job. Input simply defines data source, while 10 | * {@link InputReader} handles reading itself. 11 | * 12 | *

This class is really an interface that might be evolving. In order to avoid breaking 13 | * users when we change the interface, we made it an abstract class.

14 | * 15 | * 16 | * @param type of values produced by this input 17 | */ 18 | public abstract class Input implements Serializable { 19 | 20 | private static final long serialVersionUID = 8796820298129705263L; 21 | 22 | private transient Context context; 23 | 24 | void setContext(Context context) { 25 | this.context = context; 26 | } 27 | 28 | /** 29 | * Returns the current context, or null if none. 30 | */ 31 | public Context getContext() { 32 | return context; 33 | } 34 | 35 | /** 36 | * Returns a list of readers for this input. It is the {@code Input}'s 37 | * responsibility to determine an appropriate number of readers to split into. 38 | * This could be specified by the user or determined algorithmically. 39 | * 40 | * The number of input readers returned determines the number of map shards. 41 | */ 42 | public abstract List> createReaders() throws IOException; 43 | } 44 | -------------------------------------------------------------------------------- /python/src/mapreduce/pipeline_base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2015 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Base pipelines.""" 16 | 17 | 18 | import pipeline 19 | 20 | from mapreduce import parameters 21 | 22 | # pylint: disable=g-bad-name 23 | 24 | 25 | class PipelineBase(pipeline.Pipeline): 26 | """Base class for all pipelines within mapreduce framework. 27 | 28 | Rewrites base path to use pipeline library bundled with mapreduce. 29 | """ 30 | 31 | def start(self, **kwargs): 32 | if "base_path" not in kwargs: 33 | kwargs["base_path"] = parameters._DEFAULT_PIPELINE_BASE_PATH 34 | return pipeline.Pipeline.start(self, **kwargs) 35 | 36 | 37 | class _OutputSlotsMixin(object): 38 | """Defines common output slots for all MR user facing pipelines. 39 | 40 | result_status: one of model.MapreduceState._RESULTS. When a MR pipeline 41 | finishes, user should check this for the status of the MR job. 42 | """ 43 | 44 | output_names = ["result_status"] 45 | -------------------------------------------------------------------------------- /python/demo/static/js/custom.js: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | /** 4 | * @fileoverview A JavaScript helper file that performs miscellaneous 5 | * functions - right now, it just keeps the form that runs MR jobs in sync with 6 | * the user's selection outside of the form. 7 | */ 8 | 9 | /* 10 | * Updates the form that runs MapReduce jobs once the user selects their input 11 | * data from the list of input files. Exists because we have two separate forms 12 | * on our HTML - one that allows users to upload new input files, and one that 13 | * allows users to run MapReduce jobs given a certain input file. Since the 14 | * latter form cannot see which input file has been selected (that button is 15 | * out of this form's scope), we throw some quick JavaScript in to sync the 16 | * value of the user's choice with a hidden field in the form as well as a 17 | * visible label displaying the input file's name for the user to see. 
18 | * @param {string} filekey The internal key that the Datastore uses to reference 19 | * this input file. 20 | * @param {string} blobkey The Blobstore key associated with the input file 21 | * whose key is filekey. 22 | * @param {string} filename The name that the user has chosen to give this input 23 | * file upon uploading it. 24 | */ 25 | function updateForm(filekey, blobkey, filename) { 26 | $('#jobName').text(filename); 27 | $('#filekey').val(filekey); 28 | $('#blobkey').val(blobkey); 29 | 30 | $('#word_count').removeAttr('disabled'); 31 | $('#index').removeAttr('disabled'); 32 | $('#phrases').removeAttr('disabled'); 33 | } 34 | 35 | -------------------------------------------------------------------------------- /java/example/mapreduce/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 2 | 8 | 9 | 10 | 11 | PipelineServlet 12 | 13 | com.google.appengine.tools.pipeline.impl.servlets.PipelineServlet 14 | 15 | 16 | 17 | PipelineServlet 18 | /_ah/pipeline/* 19 | 20 | 21 | 22 | mapreduce 23 | 24 | com.google.appengine.tools.mapreduce.MapReduceServlet 25 | 26 | 27 | 28 | mapreduce 29 | /mapreduce/* 30 | 31 | 32 | 34 | 35 | 36 | /* 37 | 38 | 39 | admin 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /java/example/src/com/google/appengine/demos/mapreduce/entitycount/EntityCreator.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.demos.mapreduce.entitycount; 4 | 5 | import static com.google.common.base.Preconditions.checkNotNull; 6 | 7 | import com.google.appengine.api.datastore.Entity; 8 | import com.google.appengine.api.datastore.Text; 9 | import com.google.appengine.tools.mapreduce.MapOnlyMapper; 10 | 11 | import java.util.Random; 12 | 13 | /** 14 | * Creates random entities. 
15 | * 16 | * @author ohler@google.com (Christian Ohler) 17 | */ 18 | class EntityCreator extends MapOnlyMapper { 19 | 20 | private static final long serialVersionUID = 409204195454478863L; 21 | 22 | private final String kind; 23 | private final int payloadBytesPerEntity; 24 | private final Random random = new Random(); 25 | 26 | public EntityCreator(String kind, int payloadBytesPerEntity) { 27 | this.kind = checkNotNull(kind, "Null kind"); 28 | this.payloadBytesPerEntity = payloadBytesPerEntity; 29 | } 30 | 31 | private String randomString(int length) { 32 | StringBuilder out = new StringBuilder(length); 33 | for (int i = 0; i < length; i++) { 34 | out.append((char) ('a' + random.nextInt(26))); 35 | } 36 | return out.toString(); 37 | } 38 | 39 | @Override 40 | public void map(Long value) { 41 | String name = getContext().getShardNumber() + "_" + value; 42 | Entity entity = new Entity(kind, name); 43 | entity.setProperty("foo", "bar"); 44 | entity.setProperty("payload", new Text(randomString(payloadBytesPerEntity))); 45 | emit(entity); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/DatastoreInput.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | package com.google.appengine.tools.mapreduce.inputs; 3 | 4 | import com.google.appengine.api.datastore.Entity; 5 | import com.google.appengine.api.datastore.Query; 6 | 7 | /** 8 | * An input to read entities of a specified kind from the datastore. 9 | * 10 | */ 11 | public final class DatastoreInput extends BaseDatastoreInput { 12 | 13 | private static final long serialVersionUID = -106587199386345409L; 14 | 15 | /** 16 | * @param entityKind entity kind to read from the datastore. 17 | * @param shardCount number of parallel shards for the input. 
18 | */ 19 | public DatastoreInput(String entityKind, int shardCount) { 20 | this(entityKind, shardCount, null); 21 | } 22 | 23 | /** 24 | * @param entityKind entity kind to read from the datastore. 25 | * @param shardCount number of parallel shards for the input. 26 | * @param namespace the namespace of the entities (if null will use current). 27 | */ 28 | public DatastoreInput(String entityKind, int shardCount, String namespace) { 29 | this(createQuery(namespace, entityKind), shardCount); 30 | } 31 | 32 | /** 33 | * @param query the query to read from the datastore. 34 | * @param shardCount the number for parallel shards for the input. 35 | */ 36 | public DatastoreInput(Query query, int shardCount) { 37 | super(query, shardCount); 38 | } 39 | 40 | @Override 41 | protected DatastoreInputReader createReader(Query query) { 42 | return new DatastoreInputReader(query); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/pipeline/DeleteFilesJob.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.pipeline; 2 | 3 | import static com.google.appengine.tools.mapreduce.impl.MapReduceConstants.GCS_RETRY_PARAMETERS; 4 | 5 | import com.google.appengine.tools.cloudstorage.GcsFilename; 6 | import com.google.appengine.tools.cloudstorage.GcsService; 7 | import com.google.appengine.tools.cloudstorage.GcsServiceFactory; 8 | import com.google.appengine.tools.cloudstorage.RetriesExhaustedException; 9 | import com.google.appengine.tools.pipeline.Job1; 10 | import com.google.appengine.tools.pipeline.Value; 11 | 12 | import java.io.IOException; 13 | import java.util.List; 14 | import java.util.logging.Level; 15 | import java.util.logging.Logger; 16 | 17 | /** 18 | * A job which deletes all the files in the provided GoogleCloudStorageFileSet 19 | */ 20 | public class DeleteFilesJob extends 
Job1> { 21 | 22 | private static final long serialVersionUID = 4821135390816992131L; 23 | private static final GcsService gcs = GcsServiceFactory.createGcsService(GCS_RETRY_PARAMETERS); 24 | private static final Logger log = Logger.getLogger(DeleteFilesJob.class.getName()); 25 | 26 | /** 27 | * Deletes the files in the provided GoogleCloudStorageFileSet 28 | */ 29 | @Override 30 | public Value run(List files) throws Exception { 31 | for (GcsFilename file : files) { 32 | try { 33 | gcs.delete(file); 34 | } catch (RetriesExhaustedException | IOException e) { 35 | log.log(Level.WARNING, "Failed to cleanup file: " + file, e); 36 | } 37 | } 38 | return null; 39 | } 40 | } -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/ForwardingOutputWriter.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.outputs; 4 | 5 | import com.google.appengine.tools.mapreduce.OutputWriter; 6 | import com.google.appengine.tools.mapreduce.ShardContext; 7 | 8 | import java.io.IOException; 9 | 10 | /** 11 | * @author ohler@google.com (Christian Ohler) 12 | * 13 | * @param type of values accepted by this output 14 | */ 15 | public abstract class ForwardingOutputWriter extends OutputWriter { 16 | private static final long serialVersionUID = 738487653896786084L; 17 | 18 | protected abstract OutputWriter getDelegate(); 19 | 20 | @Override 21 | public void beginShard() throws IOException { 22 | getDelegate().beginShard(); 23 | } 24 | 25 | @Override 26 | public void beginSlice() throws IOException { 27 | getDelegate().beginSlice(); 28 | } 29 | 30 | @Override 31 | public void endSlice() throws IOException { 32 | getDelegate().endSlice(); 33 | } 34 | 35 | @Override 36 | public void endShard() throws IOException { 37 | getDelegate().endShard(); 38 | } 39 | 40 | 
@Override 41 | public long estimateMemoryRequirement() { 42 | return getDelegate().estimateMemoryRequirement(); 43 | } 44 | 45 | @Override 46 | public void setContext(ShardContext context) { 47 | getDelegate().setContext(context); 48 | } 49 | 50 | @Override 51 | public ShardContext getContext() { 52 | return getDelegate().getContext(); 53 | } 54 | 55 | @Override 56 | public boolean allowSliceRetry() { 57 | return getDelegate().allowSliceRetry(); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/BigQueryStoreResult.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.outputs; 2 | 3 | import com.google.api.services.bigquery.model.TableSchema; 4 | import com.google.appengine.tools.mapreduce.GoogleCloudStorageFileSet; 5 | import com.google.appengine.tools.mapreduce.Marshallers; 6 | import com.google.appengine.tools.mapreduce.Output; 7 | import com.google.appengine.tools.mapreduce.impl.util.SerializableValue; 8 | 9 | import java.io.Serializable; 10 | 11 | /** 12 | * Result of bigQuery staging process. For e.g. currently bigquery can only load data from files 13 | * stored in Google cloud storage(GCS). So Google Cloud Storage(GCS) is the staging area. R for GCS 14 | * is {@link GoogleCloudStorageFileSet}. 15 | * 16 | * @param type of result produced by the staging process {@link Output}. 17 | */ 18 | public final class BigQueryStoreResult implements Serializable { 19 | 20 | private static final long serialVersionUID = 3843348927621484947L; 21 | private final R result; 22 | private final SerializableValue serializableSchema; 23 | 24 | /** 25 | * @param result of writing data to the staging area. 26 | * @param schema a wrapper around {@link TableSchema} to make it serializable. 
27 | */ 28 | public BigQueryStoreResult(R result, TableSchema schema) { 29 | this.result = result; 30 | this.serializableSchema = 31 | SerializableValue.of(Marshallers.getGenericJsonMarshaller(TableSchema.class), schema); 32 | } 33 | 34 | public R getResult() { 35 | return result; 36 | } 37 | 38 | public TableSchema getSchema() { 39 | return serializableSchema.getValue(); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/pipeline/ShardedJob.java: -------------------------------------------------------------------------------- 1 | // Copyright 2013 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.impl.pipeline; 4 | 5 | import com.google.appengine.tools.mapreduce.impl.shardedjob.IncrementalTask; 6 | import com.google.appengine.tools.mapreduce.impl.shardedjob.ShardedJobController; 7 | import com.google.appengine.tools.mapreduce.impl.shardedjob.ShardedJobServiceFactory; 8 | import com.google.appengine.tools.mapreduce.impl.shardedjob.ShardedJobSettings; 9 | import com.google.appengine.tools.pipeline.Job0; 10 | import com.google.appengine.tools.pipeline.Value; 11 | 12 | import java.util.List; 13 | 14 | /** 15 | * ShardedJob pipeline. 
16 | * 17 | * 18 | * @param type of task 19 | */ 20 | public class ShardedJob extends Job0 { 21 | 22 | private static final long serialVersionUID = -6595147973116356334L; 23 | 24 | private final String jobId; 25 | private final List workers; 26 | private final ShardedJobController controller; 27 | private final ShardedJobSettings settings; 28 | 29 | public ShardedJob(String shardedJobId, List workers, 30 | ShardedJobController controller, ShardedJobSettings shardedJobSettings) { 31 | this.jobId = shardedJobId; 32 | this.workers = workers; 33 | this.controller = controller; 34 | this.settings = shardedJobSettings; 35 | } 36 | 37 | @Override 38 | public Value run() { 39 | ShardedJobServiceFactory.getShardedJobService().startJob(jobId, workers, controller, settings); 40 | setStatusConsoleUrl(settings.getMapReduceStatusUrl()); 41 | return null; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/DatastoreKeyInput.java: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Google Inc. All Rights Reserved. 2 | package com.google.appengine.tools.mapreduce.inputs; 3 | 4 | import com.google.appengine.api.datastore.Key; 5 | import com.google.appengine.api.datastore.Query; 6 | 7 | /** 8 | * An input to read entity keys of a specified kind from the datastore. 9 | */ 10 | public final class DatastoreKeyInput extends BaseDatastoreInput { 11 | 12 | private static final long serialVersionUID = -106587199386345409L; 13 | 14 | /** 15 | * @param entityKind entity kind to read from the datastore. 16 | * @param shardCount number of parallel shards for the input. 17 | */ 18 | public DatastoreKeyInput(String entityKind, int shardCount) { 19 | this(entityKind, shardCount, null); 20 | } 21 | 22 | /** 23 | * @param entityKind entity kind to read from the datastore. 
24 | * @param shardCount the number of parallel shards to divide the input into. 25 | * @param namespace the namespace of the entities (if null will use current). 26 | */ 27 | public DatastoreKeyInput(String entityKind, int shardCount, String namespace) { 28 | this(createQuery(namespace, entityKind), shardCount); 29 | } 30 | 31 | /** 32 | * @param query The query to map read from the datastore 33 | * @param shardCount the number of parallel shards to divide the input into. 34 | */ 35 | public DatastoreKeyInput(Query query, int shardCount) { 36 | super(query.setKeysOnly(), shardCount); 37 | } 38 | 39 | @Override 40 | protected DatastoreKeyInputReader createReader(Query query) { 41 | return new DatastoreKeyInputReader(query); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/UnmarshallingInput.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.inputs; 2 | 3 | import static com.google.common.base.Preconditions.checkNotNull; 4 | 5 | import com.google.appengine.tools.mapreduce.Input; 6 | import com.google.appengine.tools.mapreduce.InputReader; 7 | import com.google.appengine.tools.mapreduce.Marshaller; 8 | 9 | import java.io.IOException; 10 | import java.nio.ByteBuffer; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | 14 | /** 15 | * An {@link Input} that unmarshalls records. 16 | * 17 | * @param type of values produced by this input 18 | */ 19 | public final class UnmarshallingInput extends Input { 20 | 21 | private static final long serialVersionUID = 6893854789021758519L; 22 | 23 | private final Input input; 24 | private final Marshaller marshaller; 25 | 26 | /** 27 | * @param input The input producing values to unmarshall. 28 | * @param marshaller The marshaller to use for unmarshalling the input values. 
29 | */ 30 | public UnmarshallingInput(Input input, Marshaller marshaller) { 31 | this.input = checkNotNull(input, "Null input"); 32 | this.marshaller = checkNotNull(marshaller, "Null marshaller"); 33 | } 34 | 35 | @Override 36 | public List> createReaders() throws IOException { 37 | List> readers = input.createReaders(); 38 | List> result = new ArrayList<>(readers.size()); 39 | for (InputReader reader : readers) { 40 | result.add(new UnmarshallingInputReader<>(reader, marshaller)); 41 | } 42 | return result; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/NoInput.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.inputs; 4 | 5 | import com.google.appengine.tools.mapreduce.Input; 6 | import com.google.appengine.tools.mapreduce.InputReader; 7 | import com.google.common.collect.ImmutableList; 8 | 9 | import java.util.List; 10 | import java.util.NoSuchElementException; 11 | 12 | /** 13 | * An {@link Input} that does not produce any values. 
14 | * 15 | * @author ohler@google.com (Christian Ohler) 16 | * 17 | * @param the type of input values formally (but not actually) produced by 18 | * this input 19 | */ 20 | public final class NoInput extends Input { 21 | 22 | private static final long serialVersionUID = 214109122708935335L; 23 | 24 | public static NoInput create(int numShards) { 25 | return new NoInput<>(numShards); 26 | } 27 | 28 | private static class Reader extends InputReader { 29 | 30 | private static final long serialVersionUID = 171763263195134256L; 31 | 32 | @Override 33 | public Double getProgress() { 34 | return 1.0; 35 | } 36 | 37 | @Override 38 | public I next() { 39 | throw new NoSuchElementException(); 40 | } 41 | } 42 | 43 | private final int numShards; 44 | 45 | public NoInput(int numShards) { 46 | this.numShards = numShards; 47 | } 48 | 49 | @Override 50 | public List> createReaders() { 51 | ImmutableList.Builder> out = ImmutableList.builder(); 52 | for (int i = 0; i < numShards; i++) { 53 | out.add(new Reader()); 54 | } 55 | return out.build(); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/util/SerializableValue.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.util; 2 | 3 | import com.google.appengine.tools.mapreduce.Marshaller; 4 | 5 | import java.io.IOException; 6 | import java.io.ObjectInputStream; 7 | import java.io.ObjectOutputStream; 8 | import java.io.Serializable; 9 | import java.nio.ByteBuffer; 10 | 11 | /** 12 | * A {@link Serializable} instance of any given value and its {@link Marshaller}. 
13 | * @param 14 | */ 15 | public final class SerializableValue implements Serializable { 16 | 17 | private static final long serialVersionUID = -5188676157133889956L; 18 | 19 | private transient T value; 20 | private final Marshaller marshaller; 21 | 22 | private SerializableValue(Marshaller marshaller, T value) { 23 | this.marshaller = marshaller; 24 | this.value = value; 25 | } 26 | 27 | public static SerializableValue of(Marshaller marshaller, T value) { 28 | return new SerializableValue<>(marshaller, value); 29 | } 30 | 31 | public T getValue() { 32 | return value; 33 | } 34 | 35 | private void readObject(ObjectInputStream aInputStream) throws ClassNotFoundException, 36 | IOException { 37 | aInputStream.defaultReadObject(); 38 | value = marshaller.fromBytes(ByteBuffer.wrap((byte[]) aInputStream.readObject())); 39 | } 40 | 41 | private void writeObject(ObjectOutputStream aOutputStream) throws IOException { 42 | aOutputStream.defaultWriteObject(); 43 | ByteBuffer byteBuffer = marshaller.toBytes(value); 44 | aOutputStream.writeObject(SerializationUtil.getBytes(byteBuffer.slice())); 45 | // In case marshalling modified the item 46 | value = marshaller.fromBytes(byteBuffer); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/SizeSegmentingOutputWriter.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.outputs; 2 | 3 | import com.google.appengine.tools.mapreduce.OutputWriter; 4 | 5 | import java.io.IOException; 6 | import java.nio.ByteBuffer; 7 | 8 | /** 9 | * Segments the output base on the specified size limit. 
Creates a new writer if the sum of number 10 | * of bytes written by the current writer and number of bytes to write exceeds segment size limit 11 | */ 12 | public abstract class SizeSegmentingOutputWriter extends ItemSegmentingOutputWriter { 13 | 14 | private static final long serialVersionUID = 7900756955061379581L; 15 | private final long segmentSizeLimit; 16 | private long bytesWritten; 17 | 18 | public SizeSegmentingOutputWriter(long segmentSizeLimit) { 19 | this.segmentSizeLimit = segmentSizeLimit; 20 | } 21 | 22 | @Override 23 | public void beginShard() throws IOException { 24 | bytesWritten = 0; 25 | super.beginShard(); 26 | } 27 | 28 | @Override 29 | protected boolean shouldSegment(ByteBuffer value) { 30 | if (bytesWritten + value.remaining() > segmentSizeLimit) { 31 | return true; 32 | } 33 | return false; 34 | } 35 | 36 | @Override 37 | public void write(ByteBuffer value) throws IOException { 38 | long numOfBytesToWrite = value.remaining(); 39 | super.write(value); 40 | bytesWritten += numOfBytesToWrite - value.remaining(); 41 | } 42 | 43 | @Override 44 | protected final OutputWriter createNextWriter(int fileNum) { 45 | OutputWriter nextWriter = createWriter(fileNum); 46 | bytesWritten = 0; 47 | return nextWriter; 48 | } 49 | 50 | protected abstract OutputWriter createWriter(int fileNum); 51 | } 52 | -------------------------------------------------------------------------------- /java/src/main/resources/ui/overview.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | MapReduce Overview 5 | 6 | 7 | 8 | 9 | 12 | 13 | 14 | 15 | 16 | 17 |

MapReduce Overview

18 | 19 |
20 |

Running jobs

21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 |
StatusViewIDNameActivityStart timeTime elapsedControl
38 | 39 | 40 |
Loading...
50 |
51 | 52 | 53 |
54 |

Launch job

55 |
56 | Loading... 57 |
58 |
59 |
60 |
61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /python/src/mapreduce/static/overview.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | MapReduce Overview 5 | 6 | 7 | 8 | 9 | 12 | 13 | 14 | 15 | 16 | 17 |

MapReduce Overview

18 | 19 |
20 |

Running jobs

21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 |
StatusViewIDNameActivityStart timeTime elapsedControl
38 | 39 | 40 |
Loading...
50 |
51 | 52 | 53 |
54 |

Launch job

55 |
56 | Loading... 57 |
58 |
59 |
60 |
61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/ForwardingInputReader.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.inputs; 2 | 3 | import com.google.appengine.tools.mapreduce.InputReader; 4 | import com.google.appengine.tools.mapreduce.ShardContext; 5 | 6 | import java.io.IOException; 7 | import java.util.NoSuchElementException; 8 | 9 | 10 | /** 11 | * An {@link InputReader} delegates to another implementation. 12 | * 13 | * @param type of values returned by this reader 14 | */ 15 | public abstract class ForwardingInputReader extends InputReader { 16 | 17 | private static final long serialVersionUID = 443622749959231115L; 18 | 19 | protected abstract InputReader getDelegate(); 20 | 21 | @Override 22 | public T next() throws IOException, NoSuchElementException { 23 | return getDelegate().next(); 24 | } 25 | 26 | @Override 27 | public Double getProgress() { 28 | return getDelegate().getProgress(); 29 | } 30 | 31 | @Override 32 | public void beginSlice() throws IOException { 33 | getDelegate().beginSlice(); 34 | } 35 | 36 | @Override 37 | public void endSlice() throws IOException { 38 | getDelegate().endSlice(); 39 | } 40 | 41 | @Override 42 | public void beginShard() throws IOException { 43 | getDelegate().beginShard(); 44 | } 45 | 46 | @Override 47 | public void endShard() throws IOException { 48 | getDelegate().endShard(); 49 | } 50 | 51 | @Override 52 | public long estimateMemoryRequirement() { 53 | return getDelegate().estimateMemoryRequirement(); 54 | } 55 | 56 | @Override 57 | public void setContext(ShardContext context) { 58 | getDelegate().setContext(context); 59 | } 60 | 61 | @Override 62 | public ShardContext getContext() { 63 | return getDelegate().getContext(); 64 | } 65 | } 66 | 
-------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/InMemoryOutput.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.outputs; 4 | 5 | import com.google.appengine.tools.mapreduce.Output; 6 | import com.google.appengine.tools.mapreduce.OutputWriter; 7 | import com.google.common.collect.ImmutableList; 8 | 9 | import java.util.Collection; 10 | import java.util.List; 11 | 12 | /** 13 | * An {@link Output} that collects all values in memory and returns them on 14 | * {@link #finish}. Don't use this unless the entire output is small enough to 15 | * fit in memory. 16 | * 17 | * @author ohler@google.com (Christian Ohler) 18 | * 19 | * @param type of values accepted by this output 20 | */ 21 | public class InMemoryOutput extends Output>> { 22 | 23 | private static final long serialVersionUID = 184437617254585618L; 24 | 25 | @Override 26 | public List> createWriters(int numShards) { 27 | ImmutableList.Builder> out = ImmutableList.builder(); 28 | for (int i = 0; i < numShards; i++) { 29 | out.add(new InMemoryOutputWriter()); 30 | } 31 | return out.build(); 32 | } 33 | 34 | /** 35 | * Returns a list of lists where the outer list has one element for each 36 | * reduce shard, which is a list of the values emitted by that shard, in 37 | * order. 
38 | */ 39 | @Override 40 | public List> finish(Collection> writers) { 41 | ImmutableList.Builder> out = ImmutableList.builder(); 42 | for (OutputWriter w : writers) { 43 | InMemoryOutputWriter writer = (InMemoryOutputWriter) w; 44 | out.add(ImmutableList.copyOf(writer.getResult())); 45 | } 46 | return out.build(); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/pipeline/AbstractShardedJob.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.shardedjob.pipeline; 2 | 3 | import com.google.appengine.tools.pipeline.FutureValue; 4 | import com.google.appengine.tools.pipeline.Job; 5 | import com.google.appengine.tools.pipeline.Job0; 6 | import com.google.appengine.tools.pipeline.JobSetting; 7 | import com.google.appengine.tools.pipeline.Jobs; 8 | import com.google.appengine.tools.pipeline.Value; 9 | 10 | /** 11 | * A base class for a sharded-job pipeline. 
12 | */ 13 | public abstract class AbstractShardedJob extends Job0 { 14 | 15 | private static final long serialVersionUID = 6498588928999409114L; 16 | private static final int SHARDS_PER_JOB = 20; 17 | private static final JobSetting[] CHILD_JOB_PARAMS = {}; 18 | private final String jobId; 19 | private final int taskCount; 20 | 21 | public AbstractShardedJob(String jobId, int taskCount) { 22 | this.jobId = jobId; 23 | this.taskCount = taskCount; 24 | } 25 | 26 | @Override 27 | public Value run() { 28 | int childJobs = (int) Math.ceil(taskCount / (double) SHARDS_PER_JOB); 29 | FutureValue[] waitFor = new FutureValue[childJobs]; 30 | int startOffset = 0; 31 | for (int i = 0; i < childJobs; i++) { 32 | int endOffset = Math.min(taskCount, startOffset + SHARDS_PER_JOB); 33 | waitFor[i] = futureCallUnchecked( 34 | getChildJobParams(), createShardsJob(startOffset, endOffset)); 35 | startOffset = endOffset; 36 | } 37 | return Jobs.waitForAllAndDelete(this, null, waitFor); 38 | } 39 | 40 | protected abstract Job createShardsJob(int start, int end); 41 | 42 | protected String getJobId() { 43 | return jobId; 44 | } 45 | 46 | protected JobSetting[] getChildJobParams() { 47 | return CHILD_JOB_PARAMS; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/SliceSegmentingOutputWriter.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.outputs; 2 | 3 | import com.google.appengine.tools.mapreduce.OutputWriter; 4 | 5 | import java.io.IOException; 6 | 7 | /** 8 | * Segments data by using a separate writer for each slice. This is used by the Sort output so that 9 | * there can be multiple sorted files per reducer. 
10 | * 11 | * @param type of values accepted by this output 12 | * @param type of the output writer being written to 13 | * 14 | */ 15 | public abstract class SliceSegmentingOutputWriter> extends 16 | OutputWriter { 17 | 18 | private static final long serialVersionUID = -2846649020412508288L; 19 | private int sliceCount; 20 | private transient OutputWriter writer; 21 | 22 | @Override 23 | public void beginShard() { 24 | sliceCount = 0; 25 | } 26 | 27 | /** 28 | * Creates a new writer. 29 | */ 30 | @Override 31 | public void beginSlice() throws IOException { 32 | writer = createWriter(sliceCount++); 33 | writer.setContext(getContext()); 34 | writer.beginShard(); 35 | writer.beginSlice(); 36 | } 37 | 38 | /** 39 | * Creates a new writer. This is called once per slice 40 | */ 41 | protected abstract WriterT createWriter(int sliceNumber); 42 | 43 | /** 44 | * closes the current writer. 45 | */ 46 | @Override 47 | public void endSlice() throws IOException { 48 | writer.endSlice(); 49 | writer.endShard(); 50 | writer = null; 51 | } 52 | 53 | @Override 54 | public void write(O value) throws IOException { 55 | writer.write(value); 56 | } 57 | 58 | @Override 59 | public boolean allowSliceRetry() { 60 | return true; 61 | } 62 | 63 | @Override 64 | public abstract long estimateMemoryRequirement(); 65 | } 66 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/NoOutput.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 
2 | 3 | package com.google.appengine.tools.mapreduce.outputs; 4 | 5 | import com.google.appengine.tools.mapreduce.Output; 6 | import com.google.appengine.tools.mapreduce.OutputWriter; 7 | import com.google.appengine.tools.mapreduce.impl.shardedjob.JobFailureException; 8 | import com.google.common.collect.ImmutableList; 9 | 10 | import java.util.Collection; 11 | import java.util.List; 12 | 13 | /** 14 | * An {@link Output} that throws an exception whenever an attempt is made to 15 | * write a value. 16 | * 17 | * @author ohler@google.com (Christian Ohler) 18 | * 19 | * @param type of output values formally (but not actually) accepted by this output 20 | * @param type of result formally returned accepted by this output (though always return null) 21 | */ 22 | public class NoOutput extends Output { 23 | 24 | private static final long serialVersionUID = 965415182637510898L; 25 | 26 | private static class Writer extends OutputWriter { 27 | 28 | private static final long serialVersionUID = 524459343516880300L; 29 | 30 | @Override 31 | public void write(O object) { 32 | throw new JobFailureException("Attempt to write to NoOutput: " + object); 33 | } 34 | 35 | @Override 36 | public boolean allowSliceRetry() { 37 | return true; 38 | } 39 | } 40 | 41 | @Override 42 | public List> createWriters(int numShards) { 43 | ImmutableList.Builder> out = ImmutableList.builder(); 44 | for (int i = 0; i < numShards; i++) { 45 | out.add(new Writer()); 46 | } 47 | return out.build(); 48 | } 49 | 50 | /** 51 | * Returns {@code null}. 52 | */ 53 | @Override 54 | public R finish(Collection> writers) { 55 | return null; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /java/src/main/resources/ui/detail.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Loading Job Status... 5 | 6 | 7 | 8 | 9 | 10 | 11 | 15 | 16 | 17 | 18 | 19 | 20 |
21 | 22 | « Back to Overview 23 | | 24 | 25 | 26 | 29 |
30 | 31 |

Loading Job Status...

32 |

33 | 34 |
35 | 36 |
37 | 38 |
39 |

Overview

40 |
    41 |
    42 | 43 |
    44 |

    Counters

    45 |
      46 |
      47 | 48 |
      49 | 50 |
      51 |

      Shard status

      52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 |
      ShardStatusDescriptionLast work itemTime elapsed
      65 |
      66 |
      67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /python/src/mapreduce/static/detail.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Loading Job Status... 5 | 6 | 7 | 8 | 9 | 10 | 11 | 15 | 16 | 17 | 18 | 19 | 20 |
      21 | 22 | « Back to Overview 23 | | 24 | 25 | 26 | 29 |
      30 | 31 |

      Loading Job Status...

      32 |

      33 | 34 |
      35 | 36 |
      37 | 38 |
      39 |

      Overview

      40 |
        41 |
        42 | 43 |
        44 |

        Counters

        45 |
          46 |
          47 | 48 |
          49 | 50 |
          51 |

          Shard status

          52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 |
          ShardStatusDescriptionLast work itemTime elapsed
          65 |
          66 |
          67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/inputs/GoogleCloudStorageLineInputTestCase.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.inputs; 2 | 3 | import com.google.appengine.tools.cloudstorage.GcsFileOptions; 4 | import com.google.appengine.tools.cloudstorage.GcsFilename; 5 | import com.google.appengine.tools.cloudstorage.GcsOutputChannel; 6 | import com.google.appengine.tools.cloudstorage.GcsService; 7 | import com.google.appengine.tools.cloudstorage.GcsServiceFactory; 8 | import com.google.appengine.tools.development.testing.LocalBlobstoreServiceTestConfig; 9 | import com.google.appengine.tools.development.testing.LocalDatastoreServiceTestConfig; 10 | import com.google.appengine.tools.development.testing.LocalServiceTestHelper; 11 | 12 | import junit.framework.TestCase; 13 | 14 | import java.io.IOException; 15 | import java.nio.ByteBuffer; 16 | 17 | /** 18 | */ 19 | abstract class GoogleCloudStorageLineInputTestCase extends TestCase { 20 | 21 | private final LocalServiceTestHelper helper = new LocalServiceTestHelper( 22 | new LocalBlobstoreServiceTestConfig(), 23 | new LocalDatastoreServiceTestConfig()); 24 | 25 | @Override 26 | public void setUp() throws Exception { 27 | super.setUp(); 28 | helper.setUp(); 29 | } 30 | 31 | long createFile(GcsFilename filename, String record, int recordsCount) throws IOException { 32 | GcsService gcsService = GcsServiceFactory.createGcsService(); 33 | try (GcsOutputChannel writeChannel = gcsService.createOrReplace( 34 | filename, new GcsFileOptions.Builder().mimeType("application/bin").build())) { 35 | for (int i = 0; i < recordsCount; i++) { 36 | writeChannel.write(ByteBuffer.wrap(record.getBytes())); 37 | } 38 | } 39 | return gcsService.getMetadata(filename).getLength(); 40 | } 41 | 42 | @Override 43 | public void 
#!/usr/bin/env python
#
# Copyright 2010 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""DB-related operations."""



__all__ = ['Put', 'Delete']


from mapreduce.operation import base

# pylint: disable=protected-access


class Put(base.Operation):
  """Datastore put operation, applied through the context's mutation pool.

  See mapreduce.context._MutationPool.
  """

  def __init__(self, entity):
    """Initializes the operation.

    Args:
      entity: an entity to put.
    """
    self.entity = entity

  def __call__(self, context):
    """Queues the put on the context's mutation pool.

    Args:
      context: mapreduce context as context.Context.
    """
    context._mutation_pool.put(self.entity)


class Delete(base.Operation):
  """Datastore delete operation, applied through the context's mutation pool.

  See mapreduce.context._MutationPool.
  """

  def __init__(self, entity):
    """Initializes the operation.

    Args:
      entity: a key or model instance to delete.
    """
    self.entity = entity

  def __call__(self, context):
    """Queues the delete on the context's mutation pool.

    Args:
      context: mapreduce context as context.Context.
    """
    context._mutation_pool.delete(self.entity)
assertTrue("Progress was " + progress + " is now " + cat.getProgress(), 49 | progress <= cat.getProgress()); 50 | progress = cat.getProgress(); 51 | } 52 | assertEquals(1.0, progress); 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/IncrementalTask.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 4 | 5 | import java.io.Serializable; 6 | 7 | /** 8 | * Portion of a sharded job that will be run in a single task queue task. 9 | * 10 | * @author ohler@google.com (Christian Ohler) 11 | */ 12 | public interface IncrementalTask extends Serializable { 13 | 14 | /** 15 | * Called immediately before run. 16 | * 17 | * This method should be very limited in scope and should not block, perform IO or fail for any 18 | * reason other than Rejecting the request. 19 | * 20 | * @throws RejectRequestException when run cannot be called at this time. 21 | */ 22 | void prepare(); 23 | 24 | /** 25 | * Runs this task. This will be invoked over and over until isDone returns true. 26 | * 27 | * If this throws an exception, it may be retried a limited number of times according to a retry 28 | * policy specified in ShardedJobSettings 29 | * 30 | 31 | * @throws ShardFailureException when shard should be retried 32 | * @throws RuntimeException when a slice should be retried 33 | */ 34 | void run(); 35 | 36 | /** 37 | * Clean up and release any resources claimed in prepare. 38 | * Implementations of this method should not throw under any circumstances. 39 | */ 40 | void cleanup(); 41 | 42 | /** 43 | * @return true iff this task is done and run should no longer be invoked. 44 | */ 45 | boolean isDone(); 46 | 47 | /** 48 | * @param abandon true if a retry is due to an abandoned lock. 
49 | * @return true if a slice retry after failure are permitted. 50 | */ 51 | boolean allowSliceRetry(boolean abandon); 52 | 53 | /** 54 | * A job completed callback to allow resource cleanup and compaction of the finalized state. 55 | * 56 | * @param status the status of the job 57 | */ 58 | void jobCompleted(Status status); 59 | } 60 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/sort/LexicographicalComparator.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.sort; 2 | 3 | import com.google.common.primitives.Longs; 4 | import com.google.common.primitives.UnsignedBytes; 5 | import com.google.common.primitives.UnsignedLongs; 6 | 7 | import java.nio.ByteBuffer; 8 | import java.util.Comparator; 9 | 10 | /** 11 | * Acts as a comparator for two byte buffers. This class is used to sort keys lexicographically. 12 | * Meaning that if there are two strings using the same character encoding they should sort 13 | * properly. Also if positive or unsigned integers or longs are encoded to a fixed width these 14 | * should sort correctly also. 
15 | * 16 | */ 17 | public final class LexicographicalComparator implements Comparator { 18 | 19 | public static int compareBuffers(ByteBuffer left, ByteBuffer right) { 20 | if (left == right) { 21 | return 0; 22 | } 23 | return compare(left, left.position(), left.remaining(), right, right.position(), right.limit()); 24 | } 25 | 26 | @Override 27 | public int compare(ByteBuffer left, ByteBuffer right) { 28 | return compareBuffers(left, right); 29 | } 30 | 31 | 32 | static int compare(ByteBuffer a, int aPos, int aLen, ByteBuffer b, int bPos, int bLen) { 33 | int minLength = Math.min(aLen, bLen); 34 | int minWords = minLength / Longs.BYTES; 35 | 36 | for (int i = 0; i < minWords; i++) { 37 | int offset = i * Longs.BYTES; 38 | int result = UnsignedLongs.compare(a.getLong(aPos + offset), b.getLong(bPos + offset)); 39 | if (result != 0) { 40 | return result; 41 | } 42 | } 43 | // The epilogue to cover the last (minLength % 8) bytes. 44 | for (int i = minWords * Longs.BYTES; i < minLength; i++) { 45 | int result = UnsignedBytes.compare(a.get(aPos + i), b.get(bPos + i)); 46 | if (result != 0) { 47 | return result; 48 | } 49 | } 50 | return aLen - bLen; 51 | } 52 | } -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobService.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 4 | 5 | import java.util.Iterator; 6 | import java.util.List; 7 | 8 | /** 9 | * Allows interaction with sharded jobs. 10 | * 11 | * @author ohler@google.com (Christian Ohler) 12 | */ 13 | public interface ShardedJobService { 14 | 15 | /** 16 | * Starts a new sharded job with the given ID and parameters. The ID must 17 | * be unique. 
18 | * 19 | * This method is idempotent -- if another invocation of this method aborted 20 | * (or is in an unknown state, possibly still running or completed), starting 21 | * the job can be retried by calling the method again with the same arguments. 22 | * The job won't start twice unless {@link #cleanupJob} is called in between. 23 | * 24 | * @param type of tasks that the job consists of 25 | */ 26 | void startJob( 27 | String jobId, 28 | List initialTasks, 29 | ShardedJobController controller, 30 | ShardedJobSettings settings); 31 | 32 | /** 33 | * Returns the state of the job with the given ID. Returns null if no such 34 | * job exists. 35 | */ 36 | ShardedJobState getJobState(String jobId); 37 | 38 | /** 39 | * Returns the tasks associated with this ShardedJob. 40 | */ 41 | Iterator> lookupTasks(ShardedJobState state); 42 | 43 | /** 44 | * Aborts execution of the job with the given ID. If the job has already 45 | * finished or does not exist, this is a no-op. 46 | */ 47 | void abortJob(String jobId); 48 | 49 | /** 50 | * Deletes all data of a completed job with the given ID. 51 | * Data is being deleted asynchronously. 52 | * Returns true if job was already deleted or asynchronous task was submitted successfully. 53 | */ 54 | boolean cleanupJob(String jobId); 55 | } 56 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/UnmarshallingInputReader.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.inputs; 2 | 3 | import static com.google.common.base.Preconditions.checkNotNull; 4 | 5 | import com.google.appengine.tools.mapreduce.InputReader; 6 | import com.google.appengine.tools.mapreduce.Marshaller; 7 | 8 | import java.io.IOException; 9 | import java.nio.ByteBuffer; 10 | import java.util.NoSuchElementException; 11 | 12 | /** 13 | * An {@link InputReader} that unmarshals records. 
14 | * 15 | * @param type of values returned by this reader 16 | */ 17 | public class UnmarshallingInputReader extends InputReader { 18 | 19 | private static final long serialVersionUID = -5155146191805613155L; 20 | private final InputReader reader; 21 | private final Marshaller marshaller; 22 | 23 | public UnmarshallingInputReader(InputReader reader, Marshaller marshaller) { 24 | this.reader = checkNotNull(reader); 25 | this.marshaller = checkNotNull(marshaller); 26 | } 27 | 28 | public Marshaller getMarshaller() { 29 | return marshaller; 30 | } 31 | 32 | @Override 33 | public T next() throws IOException, NoSuchElementException { 34 | ByteBuffer byteBuffer = reader.next(); 35 | return marshaller.fromBytes(byteBuffer); 36 | } 37 | 38 | @Override 39 | public Double getProgress() { 40 | return reader.getProgress(); 41 | } 42 | 43 | @Override 44 | public void beginShard() throws IOException { 45 | reader.beginShard(); 46 | } 47 | 48 | @Override 49 | public void beginSlice() throws IOException { 50 | reader.beginSlice(); 51 | } 52 | 53 | @Override 54 | public void endSlice() throws IOException { 55 | reader.endSlice(); 56 | } 57 | 58 | @Override 59 | public void endShard() throws IOException { 60 | reader.endShard(); 61 | } 62 | 63 | @Override 64 | public long estimateMemoryRequirement() { 65 | return reader.estimateMemoryRequirement(); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/impl/shardedjob/TestController.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 2 | 3 | import static junit.framework.Assert.assertEquals; 4 | import static junit.framework.Assert.fail; 5 | import static org.junit.Assert.assertFalse; 6 | 7 | import java.util.Iterator; 8 | 9 | /** 10 | * A mock controller used for unit tests. It simply sums the inputs to combine the results. 
11 | * 12 | */ 13 | public class TestController extends ShardedJobController { 14 | 15 | private static final long serialVersionUID = 1L; 16 | private final int expectedResult; 17 | private boolean completed = false; 18 | 19 | public TestController(int expectedResult) { 20 | this.expectedResult = expectedResult; 21 | } 22 | 23 | @Override 24 | public void completed(Iterator results) { 25 | int sum = 0; 26 | while (results.hasNext()) { 27 | sum += results.next().getResult(); 28 | } 29 | assertEquals(expectedResult, sum); 30 | assertFalse(completed); 31 | completed = true; 32 | } 33 | 34 | @Override 35 | public void failed(Status status) { 36 | fail("Should not have been called"); 37 | } 38 | 39 | public boolean isCompleted() { 40 | return completed; 41 | } 42 | 43 | @Override 44 | public int hashCode() { 45 | final int prime = 31; 46 | int result = 1; 47 | result = prime * result + (completed ? 1231 : 1237); 48 | result = prime * result + expectedResult; 49 | return result; 50 | } 51 | 52 | @Override 53 | public boolean equals(Object obj) { 54 | if (this == obj) { 55 | return true; 56 | } 57 | if (obj == null) { 58 | return false; 59 | } 60 | if (getClass() != obj.getClass()) { 61 | return false; 62 | } 63 | TestController other = (TestController) obj; 64 | if (completed != other.completed) { 65 | return false; 66 | } 67 | return expectedResult == other.expectedResult; 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/impl/shardedjob/TestTask.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.shardedjob; 2 | 3 | import static org.junit.Assert.assertTrue; 4 | 5 | import com.google.appengine.tools.mapreduce.impl.IncrementalTaskContext; 6 | import com.google.appengine.tools.mapreduce.impl.IncrementalTaskWithContext; 7 | 8 | /** 9 | * A simple intermediate tasks object to 
be used in unit tests. 10 | * 11 | */ 12 | public class TestTask implements IncrementalTaskWithContext { 13 | private static final long serialVersionUID = 1L; 14 | private final IncrementalTaskContext context; 15 | private final int valueToYield; 16 | private byte[] initialPayload; 17 | private int total = 0; 18 | private int slicesRemaining; 19 | 20 | public TestTask(int shardId, int shardCount, int valueToYield, int numSlices, byte... payload) { 21 | this.context = 22 | new IncrementalTaskContext("TestMR", shardId, shardCount, "testCalls", "testCallsMillis"); 23 | this.valueToYield = valueToYield; 24 | slicesRemaining = numSlices; 25 | this.initialPayload = payload; 26 | } 27 | 28 | byte[] getPayload() { 29 | return initialPayload; 30 | } 31 | 32 | @Override 33 | public void prepare() { 34 | } 35 | 36 | @Override 37 | public void run() { 38 | assertTrue(slicesRemaining-- > 0); 39 | total += valueToYield; 40 | context.getCounters().getCounter("TestTaskSum").increment(valueToYield); 41 | } 42 | 43 | @Override 44 | public void cleanup() { 45 | } 46 | 47 | @Override 48 | public boolean isDone() { 49 | return slicesRemaining <= 0; 50 | } 51 | 52 | public Integer getResult() { 53 | return total; 54 | } 55 | 56 | @Override 57 | public IncrementalTaskContext getContext() { 58 | return context; 59 | } 60 | 61 | @Override 62 | public boolean allowSliceRetry(boolean abandon) { 63 | return false; 64 | } 65 | 66 | @Override 67 | public void jobCompleted(Status status) { 68 | initialPayload = null; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /python/src/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2015 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Setup specs for packaging, distributing, and installing MR lib.""" 16 | 17 | import distribute_setup 18 | # User may not have setuptools installed on their machines. 19 | # This script will automatically install the right version from PyPI. 20 | distribute_setup.use_setuptools() 21 | 22 | 23 | # pylint: disable=g-import-not-at-top 24 | import setuptools 25 | 26 | 27 | # To debug, set DISTUTILS_DEBUG env var to anything. 28 | setuptools.setup( 29 | name="GoogleAppEngineMapReduce", 30 | version="1.9.21.0", 31 | packages=setuptools.find_packages(), 32 | author="Google App Engine", 33 | author_email="app-engine-pipeline-api@googlegroups.com", 34 | keywords="google app engine mapreduce data processing", 35 | url="https://code.google.com/p/appengine-mapreduce/", 36 | license="Apache License 2.0", 37 | description=("Enable MapReduce style data processing on " 38 | "App Engine"), 39 | zip_safe=True, 40 | # Include package data except README. 
41 | include_package_data=True, 42 | exclude_package_data={"": ["README"]}, 43 | install_requires=[ 44 | "GoogleAppEngineCloudStorageClient >= 1.9.21", 45 | "GoogleAppEnginePipeline >= 1.9.21", 46 | "Graphy >= 1.0.0", 47 | "simplejson >= 3.6.5", 48 | "mock >= 1.0.1", 49 | "mox >= 0.5.3", 50 | ] 51 | ) 52 | -------------------------------------------------------------------------------- /java/example/shuffler/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 2 | 8 | 9 | 10 | 11 | PipelineServlet 12 | 13 | com.google.appengine.tools.pipeline.impl.servlets.PipelineServlet 14 | 15 | 16 | 17 | PipelineServlet 18 | /_ah/pipeline/* 19 | 20 | 21 | 22 | mapreduce 23 | 24 | com.google.appengine.tools.mapreduce.MapReduceServlet 25 | 26 | 27 | 28 | mapreduce 29 | /mapreduce/* 30 | 31 | 32 | 33 | 34 | ShufflerServlet 35 | 36 | com.google.appengine.tools.mapreduce.servlets.ShufflerServlet 37 | 38 | 39 | 40 | ShufflerServlet 41 | /shufflerServlet/* 42 | 43 | 44 | 46 | 47 | 48 | /* 49 | 50 | 51 | admin 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/LevelDbOutput.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 2 | 3 | package com.google.appengine.tools.mapreduce.outputs; 4 | 5 | import static com.google.common.base.Preconditions.checkNotNull; 6 | 7 | import com.google.appengine.tools.mapreduce.Output; 8 | import com.google.appengine.tools.mapreduce.OutputWriter; 9 | 10 | import java.io.IOException; 11 | import java.nio.ByteBuffer; 12 | import java.util.ArrayList; 13 | import java.util.Collection; 14 | import java.util.List; 15 | 16 | /** 17 | * An {@link Output} that writes LevelDb records. 
18 | * Data written with this class can be read with 19 | * {@link com.google.appengine.tools.mapreduce.inputs.GoogleCloudStorageLevelDbInput}. 20 | * 21 | * @param type returned by {@link #finish} 22 | */ 23 | public class LevelDbOutput extends Output { 24 | private static final long serialVersionUID = 184437617254585618L; 25 | 26 | private final Output sink; 27 | 28 | /** 29 | * @param sink The output where data should be written. 30 | */ 31 | public LevelDbOutput(Output sink) { 32 | this.sink = checkNotNull(sink, "Null sink"); 33 | } 34 | 35 | @Override 36 | public List createWriters(int numShards) { 37 | List> writers = sink.createWriters(numShards); 38 | List result = new ArrayList<>(writers.size()); 39 | for (OutputWriter writer : writers) { 40 | result.add(new LevelDbOutputWriter(writer)); 41 | } 42 | return result; 43 | } 44 | 45 | @Override 46 | public R finish(Collection> writers) throws IOException { 47 | ArrayList> wrapped = new ArrayList<>(writers.size()); 48 | for (OutputWriter w : writers) { 49 | LevelDbOutputWriter writer = (LevelDbOutputWriter) w; 50 | wrapped.add(writer.getDelegate()); 51 | } 52 | return sink.finish(wrapped); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/inputs/BlobstoreInputTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2010 Google Inc. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.appengine.tools.mapreduce.inputs; 18 | 19 | import com.google.appengine.tools.mapreduce.InputReader; 20 | 21 | import java.util.List; 22 | 23 | /** 24 | * Unit test for {@code BlobstoreInput}. 25 | */ 26 | public class BlobstoreInputTest extends BlobstoreInputTestCase { 27 | 28 | public static final String RECORD = "01234567890\n"; 29 | public static final int RECORDS_COUNT = 1000; 30 | 31 | @Override 32 | public void setUp() throws Exception { 33 | super.setUp(); 34 | createFile(RECORD, RECORDS_COUNT); 35 | } 36 | 37 | public void testSplit() throws Exception { 38 | BlobstoreInput input = new BlobstoreInput(blobKey.getKeyString(), (byte) '\n', 4); 39 | List> readers = input.createReaders(); 40 | assertEquals(4, readers.size()); 41 | assertSplitRange(0, 3000, readers.get(0)); 42 | assertSplitRange(3000, 6000, readers.get(1)); 43 | assertSplitRange(6000, 9000, readers.get(2)); 44 | assertSplitRange(9000, 12000, readers.get(3)); 45 | } 46 | 47 | private static void assertSplitRange(int start, int end, InputReader reader) { 48 | BlobstoreInputReader r = (BlobstoreInputReader) reader; 49 | assertEquals("Start offset mismatch", start, r.startOffset); 50 | assertEquals("End offset mismatch", end, r.endOffset); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/ItemSegmentingOutputWriter.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.outputs; 2 | 3 | import com.google.appengine.tools.mapreduce.OutputWriter; 4 | import com.google.appengine.tools.mapreduce.ShardContext; 5 | 6 | import java.io.IOException; 7 | 8 | /** 9 | * Segments data by using a separate writer each time {@link #shouldSegment} returns true. 
This is 10 | * used by the Merge output in the event that there are too many files to merge in one pass. 11 | * 12 | * 13 | * @param the type of the written values. 14 | */ 15 | public abstract class ItemSegmentingOutputWriter extends ForwardingOutputWriter { 16 | 17 | private static final long serialVersionUID = 5180178926565317540L; 18 | private int fileCount = 0; 19 | private OutputWriter writer; 20 | private transient ShardContext context; 21 | 22 | @Override 23 | public void beginShard() throws IOException { 24 | fileCount = 0; 25 | writer = createNextWriter(fileCount++); 26 | writer.setContext(context); 27 | super.beginShard(); 28 | } 29 | 30 | @Override 31 | public void write(O value) throws IOException { 32 | if (shouldSegment(value)) { 33 | writer.endSlice(); 34 | writer.endShard(); 35 | writer = createNextWriter(fileCount++); 36 | writer.setContext(getContext()); 37 | writer.beginShard(); 38 | writer.beginSlice(); 39 | } 40 | writer.write(value); 41 | } 42 | 43 | protected abstract boolean shouldSegment(O value); 44 | 45 | protected abstract OutputWriter createNextWriter(int fileNum); 46 | 47 | @Override 48 | protected OutputWriter getDelegate() { 49 | return writer; 50 | } 51 | 52 | @Override 53 | public abstract long estimateMemoryRequirement(); 54 | 55 | @Override 56 | public boolean allowSliceRetry() { 57 | return false; 58 | } 59 | 60 | @Override 61 | public void setContext(ShardContext context) { 62 | this.context = context; 63 | } 64 | 65 | @Override 66 | public ShardContext getContext() { 67 | return context; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /python/src/mapreduce/api/map_job/datastore_input_reader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2015 Google Inc. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Datastore Input Reader implementation for the map_job API.""" 16 | import logging 17 | 18 | from mapreduce import datastore_range_iterators as db_iters 19 | from mapreduce import errors 20 | from mapreduce.api.map_job import abstract_datastore_input_reader 21 | 22 | # pylint: disable=invalid-name 23 | 24 | 25 | class DatastoreInputReader(abstract_datastore_input_reader 26 | .AbstractDatastoreInputReader): 27 | """Iterates over an entity kind and yields datastore.Entity.""" 28 | 29 | _KEY_RANGE_ITER_CLS = db_iters.KeyRangeEntityIterator 30 | 31 | @classmethod 32 | def validate(cls, job_config): 33 | """Inherit docs.""" 34 | super(DatastoreInputReader, cls).validate(job_config) 35 | params = job_config.input_reader_params 36 | entity_kind = params[cls.ENTITY_KIND_PARAM] 37 | # Check for a "." in the entity kind. 38 | if "." in entity_kind: 39 | logging.warning( 40 | ". detected in entity kind %s specified for reader %s." 41 | "Assuming entity kind contains the dot.", 42 | entity_kind, cls.__name__) 43 | # Validate the filters parameters. 
44 | if cls.FILTERS_PARAM in params: 45 | filters = params[cls.FILTERS_PARAM] 46 | for f in filters: 47 | if f[1] != "=": 48 | raise errors.BadReaderParamsError( 49 | "Only equality filters are supported: %s", f) 50 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/pipeline/CleanupPipelineJob.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.pipeline; 2 | 3 | import com.google.appengine.tools.cloudstorage.GcsFilename; 4 | import com.google.appengine.tools.pipeline.FutureValue; 5 | import com.google.appengine.tools.pipeline.Job1; 6 | import com.google.appengine.tools.pipeline.JobSetting; 7 | import com.google.appengine.tools.pipeline.Jobs; 8 | import com.google.appengine.tools.pipeline.PipelineService; 9 | import com.google.appengine.tools.pipeline.PipelineServiceFactory; 10 | import com.google.appengine.tools.pipeline.Value; 11 | import com.google.common.collect.Lists; 12 | 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | 16 | /** 17 | * A pipeline to delete MR result with a FilesByShard and removing its traces when completed 18 | * (therefore should be called as a new pipeline via the {@link #cleanup} method). 
19 | */ 20 | public class CleanupPipelineJob extends Job1> { 21 | 22 | private static final long serialVersionUID = -5473046989460252781L; 23 | private static final int DELETE_BATCH_SIZE = 100; 24 | 25 | private CleanupPipelineJob() { 26 | // should only be called by the static cleanup method 27 | } 28 | 29 | @Override 30 | public Value run(List files) { 31 | List> batches = Lists.partition(files, DELETE_BATCH_SIZE); 32 | int index = 0; 33 | @SuppressWarnings("unchecked") 34 | FutureValue[] futures = new FutureValue[batches.size()]; 35 | for (List batch : batches) { 36 | FutureValue futureCall = 37 | futureCall(new DeleteFilesJob(), immediate(new ArrayList<>(batch))); 38 | futures[index++] = futureCall; 39 | } 40 | return Jobs.waitForAllAndDelete(this, null, futures); 41 | } 42 | 43 | public static void cleanup(List toDelete, JobSetting... settings) { 44 | PipelineService service = PipelineServiceFactory.newPipelineService(); 45 | service.startNewPipeline(new CleanupPipelineJob(), toDelete, settings); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /java/example/src/com/google/appengine/demos/mapreduce/entitycount/CountMapper.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.demos.mapreduce.entitycount; 2 | 3 | import com.google.appengine.api.datastore.Entity; 4 | import com.google.appengine.api.datastore.Text; 5 | import com.google.appengine.tools.mapreduce.Mapper; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map.Entry; 9 | 10 | /** 11 | * Counts occurrences of characters in the key and the "payload" property of datastore entities. 12 | * The output key is a human-readable description of the property, the value is the number of 13 | * occurrences. 
14 | * 15 | * @author ohler@google.com (Christian Ohler) 16 | */ 17 | class CountMapper extends Mapper { 18 | 19 | private static final long serialVersionUID = 4973057382538885270L; 20 | 21 | private void incrementCounter(String name, long delta) { 22 | getContext().getCounter(name).increment(delta); 23 | } 24 | 25 | private void emitCharacterCounts(String s) { 26 | HashMap counts = new HashMap<>(); 27 | for (int i = 0; i < s.length(); i++) { 28 | char c = s.charAt(i); 29 | Integer count = counts.get(c); 30 | if (count == null) { 31 | counts.put(c, 1); 32 | } else { 33 | counts.put(c, count + 1); 34 | } 35 | } 36 | for (Entry kv : counts.entrySet()) { 37 | emit(String.valueOf(kv.getKey()), Long.valueOf(kv.getValue())); 38 | } 39 | } 40 | 41 | @Override 42 | public void map(Entity entity) { 43 | incrementCounter("total entities", 1); 44 | incrementCounter("map calls in shard " + getContext().getShardNumber(), 1); 45 | 46 | String name = entity.getKey().getName(); 47 | if (name != null) { 48 | incrementCounter("total entity key size", name.length()); 49 | emitCharacterCounts(name); 50 | } 51 | 52 | Text property = (Text) entity.getProperty("payload"); 53 | if (property != null) { 54 | incrementCounter("total entity payload size", property.getValue().length()); 55 | emitCharacterCounts(property.getValue()); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/GoogleCloudStorageLevelDbInput.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.inputs; 2 | 3 | import static com.google.common.base.Preconditions.checkArgument; 4 | import static com.google.common.base.Preconditions.checkNotNull; 5 | 6 | import com.google.appengine.tools.cloudstorage.GcsFilename; 7 | import com.google.appengine.tools.mapreduce.GoogleCloudStorageFileSet; 8 | import 
com.google.appengine.tools.mapreduce.Input; 9 | import com.google.appengine.tools.mapreduce.InputReader; 10 | import com.google.appengine.tools.mapreduce.impl.MapReduceConstants; 11 | 12 | import java.nio.ByteBuffer; 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | 16 | /** 17 | * GoogleCloudStorageLevelDbInput creates LevelDbInputReaders to read input written out by 18 | * {@link com.google.appengine.tools.mapreduce.outputs.LevelDbOutput} to files in 19 | * Google Cloud Storage. 20 | * 21 | */ 22 | public final class GoogleCloudStorageLevelDbInput extends Input { 23 | 24 | private static final long serialVersionUID = -5135725511174133847L; 25 | private final GoogleCloudStorageFileSet files; 26 | private final int bufferSize; 27 | 28 | public GoogleCloudStorageLevelDbInput(GoogleCloudStorageFileSet files) { 29 | this(files, MapReduceConstants.DEFAULT_IO_BUFFER_SIZE); 30 | } 31 | 32 | /** 33 | * @param files The set of files to create readers for. One reader per file. 34 | * @param bufferSize The size of the buffer used for each file. 35 | */ 36 | public GoogleCloudStorageLevelDbInput(GoogleCloudStorageFileSet files, int bufferSize) { 37 | this.files = checkNotNull(files, "Null files"); 38 | this.bufferSize = bufferSize; 39 | checkArgument(bufferSize > 0, "Buffersize must be > 0"); 40 | } 41 | 42 | 43 | @Override 44 | public List> createReaders() { 45 | List> result = new ArrayList<>(); 46 | for (GcsFilename file : files.getFiles()) { 47 | result.add(new GoogleCloudStorageLevelDbInputReader(file, bufferSize)); 48 | } 49 | return result; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/outputs/MarshallingOutput.java: -------------------------------------------------------------------------------- 1 | // Copyright 2012 Google Inc. All Rights Reserved. 
2 | 3 | package com.google.appengine.tools.mapreduce.outputs; 4 | 5 | import static com.google.common.base.Preconditions.checkNotNull; 6 | 7 | import com.google.appengine.tools.mapreduce.Marshaller; 8 | import com.google.appengine.tools.mapreduce.Output; 9 | import com.google.appengine.tools.mapreduce.OutputWriter; 10 | 11 | import java.io.IOException; 12 | import java.nio.ByteBuffer; 13 | import java.util.ArrayList; 14 | import java.util.Collection; 15 | import java.util.List; 16 | 17 | /** 18 | * An {@link Output} that marshalls records. 19 | * 20 | * @param type of values produced by this output 21 | * @param type returned by {@link #finish} on the supplied sink 22 | */ 23 | public class MarshallingOutput extends Output { 24 | private static final long serialVersionUID = 184437617254585618L; 25 | 26 | private final Output sink; 27 | private final Marshaller marshaller; 28 | 29 | public MarshallingOutput(Output sink, Marshaller marshaller) { 30 | this.marshaller = checkNotNull(marshaller, "Null marshaller"); 31 | this.sink = checkNotNull(sink, "Null sink"); 32 | } 33 | 34 | @Override 35 | public List> createWriters(int numShards) { 36 | List> writers = sink.createWriters(numShards); 37 | List> result = new ArrayList<>(writers.size()); 38 | for (OutputWriter writer : writers) { 39 | result.add(new MarshallingOutputWriter<>(writer, marshaller)); 40 | } 41 | return result; 42 | } 43 | 44 | @Override 45 | public R finish(Collection> writers) throws IOException { 46 | ArrayList> wrapped = new ArrayList<>(writers.size()); 47 | for (OutputWriter w : writers) { 48 | MarshallingOutputWriter writer = (MarshallingOutputWriter) w; 49 | wrapped.add(writer.getDelegate()); 50 | } 51 | return sink.finish(wrapped); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /java/src/main/resources/ui/base.css: -------------------------------------------------------------------------------- 1 | html { 2 | margin: 0; 3 | padding: 0; 4 | 
font-family: Arial, sans-serif; 5 | font-size: 13px; 6 | } 7 | 8 | body { 9 | margin: 0; 10 | padding: 0 3px 3px 3px; 11 | } 12 | 13 | #butter { 14 | position: absolute; 15 | left: 40%; /* todo: actually center this */ 16 | width: 200px; 17 | background-color: #C5D7EF; 18 | text-align: center; 19 | padding: 5px; 20 | border-left: 1px solid #3366CC; 21 | border-right: 1px solid #3366CC; 22 | border-bottom: 1px solid #3366CC; 23 | } 24 | 25 | h1 { 26 | margin-top: 0; 27 | margin-bottom: 0.4em; 28 | font-size: 2em; 29 | } 30 | h2 { 31 | margin-top: 1em; 32 | margin-bottom: 0.4em; 33 | font-size: 1.2em; 34 | } 35 | h3 { 36 | margin-top: 0; 37 | margin-bottom: 0.7em; 38 | font-size: 1.0em; 39 | } 40 | 41 | .status-text { 42 | text-transform: capitalize; 43 | } 44 | 45 | /* Overview page */ 46 | .editable-input, 47 | .job-static-param { 48 | margin: 0.3em; 49 | } 50 | 51 | #launch-control { 52 | margin-bottom: 0.5em; 53 | } 54 | #launch-container { 55 | margin-left: 0.5em; 56 | } 57 | 58 | /* Detail page */ 59 | #control { 60 | float: right; 61 | } 62 | 63 | #detail-graph, 64 | #aggregated-counters-container, 65 | #detail-params-container { 66 | margin-left: 1em; 67 | float: left; 68 | } 69 | 70 | /* Shared */ 71 | .status-table { 72 | margin: 5px; 73 | border-collapse: collapse; 74 | border-width: 0; 75 | empty-cells: show; 76 | border-top: 1px solid #C5D7EF; 77 | border-left: 1px solid #C5D7EF; 78 | border-right: 1px solid #C5D7EF; 79 | } 80 | 81 | .status-table > thead { 82 | height: 2em; 83 | } 84 | 85 | .status-table > tfoot { 86 | height: 1em; 87 | } 88 | 89 | .status-table > thead, 90 | .status-table > tfoot { 91 | background-color: #E5ECF9; 92 | } 93 | 94 | .status-table td { 95 | padding: 4px; 96 | border-left: 1px solid #C5D7EF; 97 | border-bottom: 1px solid #C5D7EF; 98 | border-top: 1px solid #C5D7EF; 99 | } 100 | 101 | input[name$="entity_kind"] { 102 | width:800px; 103 | } 104 | -------------------------------------------------------------------------------- 
/python/src/mapreduce/static/base.css: -------------------------------------------------------------------------------- 1 | html { 2 | margin: 0; 3 | padding: 0; 4 | font-family: Arial, sans-serif; 5 | font-size: 13px; 6 | } 7 | 8 | body { 9 | margin: 0; 10 | padding: 0 3px 3px 3px; 11 | } 12 | 13 | #butter { 14 | position: absolute; 15 | left: 40%; /* todo: actually center this */ 16 | width: 200px; 17 | background-color: #C5D7EF; 18 | text-align: center; 19 | padding: 5px; 20 | border-left: 1px solid #3366CC; 21 | border-right: 1px solid #3366CC; 22 | border-bottom: 1px solid #3366CC; 23 | } 24 | 25 | h1 { 26 | margin-top: 0; 27 | margin-bottom: 0.4em; 28 | font-size: 2em; 29 | } 30 | h2 { 31 | margin-top: 1em; 32 | margin-bottom: 0.4em; 33 | font-size: 1.2em; 34 | } 35 | h3 { 36 | margin-top: 0; 37 | margin-bottom: 0.7em; 38 | font-size: 1.0em; 39 | } 40 | 41 | .status-text { 42 | text-transform: capitalize; 43 | } 44 | 45 | /* Overview page */ 46 | .editable-input, 47 | .job-static-param { 48 | margin: 0.3em; 49 | } 50 | 51 | #launch-control { 52 | margin-bottom: 0.5em; 53 | } 54 | #launch-container { 55 | margin-left: 0.5em; 56 | } 57 | 58 | /* Detail page */ 59 | #control { 60 | float: right; 61 | } 62 | 63 | #detail-graph, 64 | #aggregated-counters-container, 65 | #detail-params-container { 66 | margin-left: 1em; 67 | float: left; 68 | } 69 | 70 | /* Shared */ 71 | .status-table { 72 | margin: 5px; 73 | border-collapse: collapse; 74 | border-width: 0; 75 | empty-cells: show; 76 | border-top: 1px solid #C5D7EF; 77 | border-left: 1px solid #C5D7EF; 78 | border-right: 1px solid #C5D7EF; 79 | } 80 | 81 | .status-table > thead { 82 | height: 2em; 83 | } 84 | 85 | .status-table > tfoot { 86 | height: 1em; 87 | } 88 | 89 | .status-table > thead, 90 | .status-table > tfoot { 91 | background-color: #E5ECF9; 92 | } 93 | 94 | .status-table td { 95 | padding: 4px; 96 | border-left: 1px solid #C5D7EF; 97 | border-bottom: 1px solid #C5D7EF; 98 | border-top: 1px solid 
#C5D7EF; 99 | } 100 | 101 | input[name$="entity_kind"] { 102 | width:800px; 103 | } 104 | -------------------------------------------------------------------------------- /python/test/mapreduce/operation/db_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright 2010 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | 19 | 20 | import mox 21 | import unittest 22 | 23 | from mapreduce import context 24 | from mapreduce import operation as op 25 | 26 | class TestEntity(object): 27 | """Test entity class.""" 28 | 29 | 30 | class PutTest(unittest.TestCase): 31 | """Test Put operation.""" 32 | 33 | def testPut(self): 34 | """Test applying Put operation.""" 35 | m = mox.Mox() 36 | 37 | ctx = context.Context(None, None) 38 | ctx._mutation_pool = m.CreateMock(context._MutationPool) 39 | 40 | entity = TestEntity() 41 | operation = op.db.Put(entity) 42 | 43 | # Record calls 44 | ctx._mutation_pool.put(entity) 45 | 46 | m.ReplayAll() 47 | try: # test, verify 48 | operation(ctx) 49 | m.VerifyAll() 50 | finally: 51 | m.UnsetStubs() 52 | 53 | 54 | class DeleteTest(unittest.TestCase): 55 | """Test Delete operation.""" 56 | 57 | def testDelete(self): 58 | """Test applying Delete operation.""" 59 | m = mox.Mox() 60 | 61 | ctx = context.Context(None, None) 62 | ctx._mutation_pool = m.CreateMock(context._MutationPool) 63 | 64 | entity = 
TestEntity() 65 | operation = op.db.Delete(entity) 66 | 67 | # Record calls 68 | ctx._mutation_pool.delete(entity) 69 | 70 | m.ReplayAll() 71 | try: # test, verify 72 | operation(ctx) 73 | m.VerifyAll() 74 | finally: 75 | m.UnsetStubs() 76 | 77 | 78 | if __name__ == '__main__': 79 | unittest.main() 80 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/InMemoryInput.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.inputs; 2 | 3 | import static com.google.common.base.Preconditions.checkNotNull; 4 | 5 | import com.google.appengine.tools.mapreduce.Input; 6 | import com.google.appengine.tools.mapreduce.InputReader; 7 | import com.google.common.collect.ImmutableList; 8 | import com.google.common.collect.ImmutableList.Builder; 9 | 10 | import java.util.List; 11 | import java.util.NoSuchElementException; 12 | 13 | /** 14 | * An input that returns values already in memory. 
15 | * 16 | * @param type of values returned by this input 17 | */ 18 | public final class InMemoryInput extends Input { 19 | 20 | private static final long serialVersionUID = -7058791377469359722L; 21 | private final List> readers; 22 | 23 | private static final class InMemoryInputReader extends InputReader { 24 | 25 | private static final long serialVersionUID = -7442905939930896134L; 26 | int pos = 0; 27 | private final List results; 28 | 29 | InMemoryInputReader(List results) { 30 | this.results = ImmutableList.copyOf(results); 31 | } 32 | 33 | @Override 34 | public I next() throws NoSuchElementException { 35 | if (pos >= results.size()) { 36 | throw new NoSuchElementException(); 37 | } 38 | return results.get(pos++); 39 | } 40 | 41 | @Override 42 | public Double getProgress() { 43 | if (results.isEmpty()) { 44 | return 1.0; 45 | } 46 | return ((double) pos) / results.size(); 47 | } 48 | 49 | @Override 50 | public void beginShard() { 51 | pos = 0; 52 | } 53 | 54 | } 55 | 56 | public InMemoryInput(List> input) { 57 | checkNotNull(input, "Null input"); 58 | Builder> builder = ImmutableList.builder(); 59 | for (List shard : input) { 60 | builder.add(new InMemoryInputReader<>(shard)); 61 | } 62 | readers = builder.build(); 63 | } 64 | 65 | @Override 66 | public List> createReaders() { 67 | return readers; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/impl/handlers/MemoryLimiterTest.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl.handlers; 2 | 3 | import com.google.appengine.tools.mapreduce.impl.shardedjob.RejectRequestException; 4 | 5 | import junit.framework.TestCase; 6 | 7 | /** 8 | * Tests MemoryLimiter 9 | * 10 | */ 11 | public class MemoryLimiterTest extends TestCase { 12 | 13 | public void testZero() { 14 | MemoryLimiter limiter = new MemoryLimiter(); 15 | 
long claimed = limiter.claim(0); 16 | assertEquals(0, claimed); 17 | limiter.release(claimed); 18 | } 19 | 20 | public void testRequestsGoThrough() { 21 | MemoryLimiter limiter = new MemoryLimiter(); 22 | long claimed = limiter.claim(10); 23 | assertEquals(10, claimed); 24 | limiter.release(claimed); 25 | } 26 | 27 | public void testAboveMaxAllocation() { 28 | MemoryLimiter limiter = new MemoryLimiter(); 29 | long claimed = limiter.claim(Integer.MAX_VALUE); 30 | assertTrue(claimed > 0); 31 | assertTrue(claimed < Integer.MAX_VALUE); 32 | try { 33 | limiter.claim(Integer.MAX_VALUE); 34 | fail(); 35 | } catch (RejectRequestException e) { 36 | //expected 37 | } 38 | limiter.release(claimed); 39 | claimed = limiter.claim(Integer.MAX_VALUE); 40 | limiter.release(claimed); 41 | } 42 | 43 | public void testSmallBehindLargeOne() { 44 | MemoryLimiter limiter = new MemoryLimiter(); 45 | long mediumClaimed = limiter.claim(10); 46 | try { 47 | limiter.claim(Integer.MAX_VALUE); 48 | fail(); 49 | } catch (RejectRequestException e) { 50 | // Expected 51 | } 52 | long smallClaimed = limiter.claim(1); 53 | limiter.release(mediumClaimed); 54 | try { 55 | limiter.claim(Integer.MAX_VALUE); 56 | fail(); 57 | } catch (RejectRequestException e) { 58 | // Expected 59 | } 60 | limiter.release(smallClaimed); 61 | long largeClaimed = limiter.claim(Integer.MAX_VALUE); 62 | try { 63 | limiter.claim(1); 64 | fail(); 65 | } catch (RejectRequestException e) { 66 | // Expected 67 | } 68 | limiter.release(largeClaimed); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /java/example/default/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Hello MapReduce 7 | 8 | 9 | 10 |

          MapReduce Sample Programs

          11 | 12 | 13 | 14 | 23 | 24 | 25 | 35 | 36 | 37 | 45 | 46 |
          Random collisions example
          15 | This example demonstrates parallel computation. It looks for collisions 16 | in Java's random number generator,
          17 | where a collision is defined as multiple seed values 18 | that produce the same output value when next() is called.
          19 | The input source is a range of numbers to test. Collisions are logged and 20 | written out to a file in Google Cloud Storage.
          21 | (No collisions occur so the file will be empty.)
          22 | Run the example.

          Entity counting example
          26 | The example shows how to “chain" MapReduce jobs together, 27 | running them sequentially, one after the other.
          28 | It runs three MapReduce jobs:
          29 | The first job creates some datastore entities, 30 | the second job analyzes them, and the third job deletes them.
          31 | The example also shows how to access the datastore from a 32 | MapReduce job, and how to validate a request using a token.
          33 | Run the example. 34 |

          Bigquery load example
          38 | Note: This example does not work locally as it depends on bigQuery which is not available in the dev-appserver.
          39 | The example shows how to load data into a bigquery table.
          40 | It runs two MapReduce jobs:
          41 | The first job creates files in Google Cloud Storage, 42 | the second job loads these files into the bigquery table
          43 | Run the example. 44 |
          47 | 48 | 49 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/CountersImpl.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | package com.google.appengine.tools.mapreduce.impl; 3 | 4 | import com.google.appengine.tools.mapreduce.Counter; 5 | import com.google.appengine.tools.mapreduce.Counters; 6 | import com.google.common.base.Joiner; 7 | import com.google.common.collect.Iterables; 8 | 9 | import java.io.Serializable; 10 | import java.util.Map; 11 | import java.util.TreeMap; 12 | 13 | /** 14 | */ 15 | public class CountersImpl implements Counters { 16 | 17 | private static final long serialVersionUID = -8499952345096458550L; 18 | 19 | private final Map values = new TreeMap<>(); 20 | 21 | @Override 22 | public String toString() { 23 | StringBuilder out = new StringBuilder(getClass().getSimpleName() + "("); 24 | Joiner.on(',').appendTo(out, values.values()); 25 | out.append(')'); 26 | return out.toString(); 27 | } 28 | 29 | @Override 30 | public Counter getCounter(String name) { 31 | Counter counter = values.get(name); 32 | if (counter == null) { 33 | counter = new CounterImpl(name); 34 | values.put(name, counter); 35 | } 36 | return counter; 37 | } 38 | 39 | @Override 40 | public Iterable getCounters() { 41 | return Iterables.unmodifiableIterable(values.values()); 42 | } 43 | 44 | @Override 45 | public void addAll(Counters other) { 46 | for (Counter c : other.getCounters()) { 47 | getCounter(c.getName()).increment(c.getValue()); 48 | } 49 | } 50 | 51 | private static class CounterImpl implements Counter, Serializable { 52 | private static final long serialVersionUID = 5872696485441192885L; 53 | 54 | private final String name; 55 | private long value; 56 | 57 | CounterImpl(String name) { 58 | this.name = name; 59 | } 60 | 61 | @Override 62 | public String toString() { 
63 | return name + "=" + value; 64 | } 65 | 66 | @Override 67 | public String getName() { 68 | return name; 69 | } 70 | 71 | @Override 72 | public long getValue() { 73 | return value; 74 | } 75 | 76 | @Override 77 | public void increment(long delta) { 78 | value += delta; 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/inputs/BlobstoreInput.java: -------------------------------------------------------------------------------- 1 | // Copyright 2011 Google Inc. All Rights Reserved. 2 | package com.google.appengine.tools.mapreduce.inputs; 3 | 4 | import com.google.appengine.api.blobstore.BlobInfoFactory; 5 | import com.google.appengine.api.blobstore.BlobKey; 6 | import com.google.appengine.tools.mapreduce.Input; 7 | import com.google.appengine.tools.mapreduce.InputReader; 8 | import com.google.common.base.Preconditions; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | /** 14 | * BlobstoreLineInput shards files in Blobstore on separator boundaries. 
15 | * 16 | */ 17 | public final class BlobstoreInput extends Input { 18 | 19 | private static final long MIN_SHARD_SIZE = 1024L; 20 | private static final long serialVersionUID = 2235444204028285444L; 21 | 22 | private final String blobKey; 23 | private final byte separator; 24 | private final int shardCount; 25 | 26 | public BlobstoreInput(String blobKey, byte separator, int shardCount) { 27 | this.blobKey = blobKey; 28 | this.separator = separator; 29 | this.shardCount = shardCount; 30 | } 31 | 32 | @Override 33 | public List> createReaders() { 34 | long blobSize = new BlobInfoFactory().loadBlobInfo(new BlobKey(blobKey)).getSize(); 35 | return split(blobKey, blobSize, shardCount); 36 | } 37 | 38 | private List> split(String blobKey, long blobSize, int shardCount) { 39 | Preconditions.checkNotNull(blobKey); 40 | Preconditions.checkArgument(shardCount > 0); 41 | Preconditions.checkArgument(blobSize >= 0); 42 | 43 | // Sanity check 44 | if (shardCount * MIN_SHARD_SIZE > blobSize) { 45 | shardCount = (int) (blobSize / MIN_SHARD_SIZE) + 1; 46 | } 47 | 48 | long splitLength = blobSize / shardCount; 49 | 50 | List result = new ArrayList<>(); 51 | 52 | long startOffset = 0L; 53 | for (int i = 1; i < shardCount; i++) { 54 | long endOffset = i * splitLength; 55 | result.add(new BlobstoreInputReader(blobKey, startOffset, endOffset, separator)); 56 | startOffset = endOffset; 57 | } 58 | result.add(new BlobstoreInputReader(blobKey, startOffset, blobSize, separator)); 59 | return result; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /java/src/main/java/com/google/appengine/tools/mapreduce/impl/BaseShardContext.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.impl; 2 | 3 | import static com.google.common.base.Preconditions.checkNotNull; 4 | 5 | import com.google.appengine.tools.mapreduce.Counter; 6 | import 
com.google.appengine.tools.mapreduce.Counters; 7 | import com.google.appengine.tools.mapreduce.OutputWriter; 8 | import com.google.appengine.tools.mapreduce.WorkerContext; 9 | 10 | import java.io.IOException; 11 | 12 | 13 | /** 14 | * Base class for all ShardContext implementations. 15 | * 16 | * @param type of emitted values 17 | */ 18 | public abstract class BaseShardContext extends BaseContext implements WorkerContext { 19 | 20 | private final int shardCount; 21 | private final int shardNumber; 22 | private final Counters counters; 23 | private final OutputWriter outputWriter; 24 | private boolean emitCalled; 25 | 26 | public BaseShardContext(IncrementalTaskContext taskContext, OutputWriter outputWriter) { 27 | super(taskContext.getJobId()); 28 | this.counters = taskContext.getCounters(); 29 | this.shardNumber = taskContext.getShardNumber(); 30 | this.shardCount = taskContext.getShardCount(); 31 | this.outputWriter = checkNotNull(outputWriter, "Null output"); 32 | } 33 | 34 | @Override 35 | public int getShardCount() { 36 | return shardCount; 37 | } 38 | 39 | @Override 40 | public int getShardNumber() { 41 | return shardNumber; 42 | } 43 | 44 | @Override 45 | public Counters getCounters() { 46 | return counters; 47 | } 48 | 49 | @Override 50 | public Counter getCounter(String name) { 51 | return counters.getCounter(name); 52 | } 53 | 54 | @Override 55 | public final void incrementCounter(String name, long delta) { 56 | getCounter(name).increment(delta); 57 | } 58 | 59 | @Override 60 | public final void incrementCounter(String name) { 61 | incrementCounter(name, 1); 62 | } 63 | 64 | @Override 65 | public void emit(O value) { 66 | emitCalled = true; 67 | try { 68 | outputWriter.write(value); 69 | } catch (IOException e) { 70 | throw new RuntimeException(outputWriter + ".write(" + value + ") threw IOException", e); 71 | } 72 | } 73 | 74 | boolean emitCalled() { 75 | return emitCalled; 76 | } 77 | } 
-------------------------------------------------------------------------------- /java/src/main/resources/ui/jquery.json-2.2.min.js: -------------------------------------------------------------------------------- 1 | 2 | (function($){$.toJSON=function(o) 3 | {if(typeof(JSON)=='object'&&JSON.stringify) 4 | return JSON.stringify(o);var type=typeof(o);if(o===null) 5 | return"null";if(type=="undefined") 6 | return undefined;if(type=="number"||type=="boolean") 7 | return o+"";if(type=="string") 8 | return $.quoteString(o);if(type=='object') 9 | {if(typeof o.toJSON=="function") 10 | return $.toJSON(o.toJSON());if(o.constructor===Date) 11 | {var month=o.getUTCMonth()+1;if(month<10)month='0'+month;var day=o.getUTCDate();if(day<10)day='0'+day;var year=o.getUTCFullYear();var hours=o.getUTCHours();if(hours<10)hours='0'+hours;var minutes=o.getUTCMinutes();if(minutes<10)minutes='0'+minutes;var seconds=o.getUTCSeconds();if(seconds<10)seconds='0'+seconds;var milli=o.getUTCMilliseconds();if(milli<100)milli='0'+milli;if(milli<10)milli='0'+milli;return'"'+year+'-'+month+'-'+day+'T'+ 12 | hours+':'+minutes+':'+seconds+'.'+milli+'Z"';} 13 | if(o.constructor===Array) 14 | {var ret=[];for(var i=0;i key type 21 | * @param value type 22 | */ 23 | public class KeyValueMarshaller extends Marshaller> { 24 | 25 | private static final long serialVersionUID = 4804959968008959514L; 26 | private final Marshaller keyMarshaller; 27 | private final Marshaller valueMarshaller; 28 | 29 | public KeyValueMarshaller(Marshaller keyMarshaller, Marshaller valueMarshaller) { 30 | this.keyMarshaller = checkNotNull(keyMarshaller, "Null keyMarshaller"); 31 | this.valueMarshaller = checkNotNull(valueMarshaller, "Null valueMarshaller"); 32 | } 33 | 34 | @Override 35 | public ByteBuffer toBytes(KeyValue keyValues) { 36 | KeyValuePb.KeyValues.Builder b = KeyValuePb.KeyValues.newBuilder(); 37 | b.setKey(ByteString.copyFrom(keyMarshaller.toBytes(keyValues.getKey()))); 38 | 
b.addValue(ByteString.copyFrom(valueMarshaller.toBytes(keyValues.getValue()))); 39 | return ByteBuffer.wrap(b.build().toByteArray()); 40 | } 41 | 42 | @Override 43 | public KeyValue fromBytes(ByteBuffer input) { 44 | KeyValuePb.KeyValues proto; 45 | try { 46 | proto = KeyValuePb.KeyValues.parseFrom(ByteString.copyFrom(input)); 47 | } catch (InvalidProtocolBufferException e) { 48 | throw new CorruptDataException(e); 49 | } 50 | K key = keyMarshaller.fromBytes(proto.getKey().asReadOnlyByteBuffer()); 51 | V value = valueMarshaller.fromBytes(proto.getValue(0).asReadOnlyByteBuffer()); 52 | return new KeyValue<>(key, value); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /java/src/test/java/com/google/appengine/tools/mapreduce/outputs/BigQueryStoreResultTest.java: -------------------------------------------------------------------------------- 1 | package com.google.appengine.tools.mapreduce.outputs; 2 | 3 | import com.google.appengine.tools.development.testing.LocalServiceTestHelper; 4 | import com.google.appengine.tools.mapreduce.GoogleCloudStorageFileSet; 5 | import com.google.appengine.tools.mapreduce.impl.BigQueryMarshallerByType; 6 | import com.google.appengine.tools.mapreduce.testmodels.Child; 7 | import com.google.appengine.tools.mapreduce.testmodels.Father; 8 | import com.google.appengine.tools.pipeline.impl.util.SerializationUtils; 9 | import com.google.common.collect.Lists; 10 | 11 | import junit.framework.TestCase; 12 | 13 | import org.junit.Test; 14 | 15 | import java.io.IOException; 16 | import java.util.List; 17 | 18 | public class BigQueryStoreResultTest extends TestCase { 19 | private static final String BUCKET = "test-bigquery-loader"; 20 | 21 | private final LocalServiceTestHelper helper = new LocalServiceTestHelper(); 22 | 23 | @Override 24 | protected void setUp() throws Exception { 25 | super.setUp(); 26 | helper.setUp(); 27 | } 28 | 29 | @Override 30 | protected void tearDown() throws Exception { 
31 | super.tearDown(); 32 | helper.tearDown(); 33 | } 34 | 35 | @Test 36 | public void testSerialization() throws IOException { 37 | BigQueryGoogleCloudStorageStoreOutput creator = 38 | new BigQueryGoogleCloudStorageStoreOutput( 39 | new BigQueryMarshallerByType(Father.class), BUCKET, "testJob"); 40 | 41 | List> writers = creator.createWriters(5); 42 | for (MarshallingOutputWriter writer : writers) { 43 | writer.beginShard(); 44 | writer.beginSlice(); 45 | writer.write(new Father(true, "Father", 46 | Lists.newArrayList(new Child("Childone", 1), new Child("childtwo", 2)))); 47 | writer.endSlice(); 48 | writer.endShard(); 49 | } 50 | BigQueryStoreResult actual = creator.finish(writers); 51 | 52 | byte[] bytes = SerializationUtils.serialize(actual); 53 | @SuppressWarnings("unchecked") 54 | BigQueryStoreResult copy = 55 | (BigQueryStoreResult) SerializationUtils.deserialize(bytes); 56 | assertEquals(actual.getResult(), copy.getResult()); 57 | assertEquals(actual.getSchema(), copy.getSchema()); 58 | } 59 | } 60 | --------------------------------------------------------------------------------