├── .gitignore
├── java
├── example
│ ├── default
│ │ ├── WEB-INF
│ │ │ ├── logging.properties
│ │ │ ├── dispatch.xml
│ │ │ ├── queue.xml
│ │ │ └── appengine-web.xml
│ │ └── index.html
│ ├── mapreduce
│ │ └── WEB-INF
│ │ │ ├── logging.properties
│ │ │ ├── appengine-web.xml
│ │ │ └── web.xml
│ ├── shuffler
│ │ └── WEB-INF
│ │ │ ├── logging.properties
│ │ │ ├── appengine-web.xml
│ │ │ └── web.xml
│ ├── META-INF
│ │ ├── appengine-application.xml
│ │ └── application.xml
│ └── src
│ │ └── com
│ │ └── google
│ │ └── appengine
│ │ └── demos
│ │ └── mapreduce
│ │ ├── bigqueryload
│ │ ├── SampleNestedRecord.java
│ │ ├── SampleTable.java
│ │ └── RandomBigQueryDataCreator.java
│ │ ├── randomcollisions
│ │ ├── SeedToRandomMapper.java
│ │ └── CollisionFindingReducer.java
│ │ └── entitycount
│ │ ├── CountReducer.java
│ │ ├── DeleteEntityMapper.java
│ │ ├── EntityCreator.java
│ │ └── CountMapper.java
├── README
├── src
│ ├── test
│ │ └── java
│ │ │ └── com
│ │ │ └── google
│ │ │ └── appengine
│ │ │ └── tools
│ │ │ └── mapreduce
│ │ │ ├── testmodels
│ │ │ ├── ParameterizedClass.java
│ │ │ ├── AbstractClassSample.java
│ │ │ ├── Child.java
│ │ │ ├── PhoneNumber.java
│ │ │ ├── SampleClassWithNestedCollection.java
│ │ │ ├── SampleClassWithNonParametricList.java
│ │ │ ├── SimpleJsonWithWrapperTypes.java
│ │ │ ├── ClassExtendingAbstract.java
│ │ │ ├── ClassWithArray.java
│ │ │ ├── Father.java
│ │ │ ├── Man.java
│ │ │ ├── SimpleJson.java
│ │ │ ├── SimplAnnotatedJson.java
│ │ │ └── Person.java
│ │ │ ├── inputs
│ │ │ ├── GoogleCloudStorageLineInputTestCase.java
│ │ │ ├── ConcatenatingInputReaderTest.java
│ │ │ └── BlobstoreInputTest.java
│ │ │ ├── impl
│ │ │ ├── shardedjob
│ │ │ │ ├── TestController.java
│ │ │ │ └── TestTask.java
│ │ │ └── handlers
│ │ │ │ └── MemoryLimiterTest.java
│ │ │ └── outputs
│ │ │ └── BigQueryStoreResultTest.java
│ └── main
│ │ ├── java
│ │ └── com
│ │ │ └── google
│ │ │ └── appengine
│ │ │ └── tools
│ │ │ └── mapreduce
│ │ │ ├── Context.java
│ │ │ ├── MapOnlyMapperContext.java
│ │ │ ├── ReducerContext.java
│ │ │ ├── impl
│ │ │ ├── IncrementalTaskWithContext.java
│ │ │ ├── shardedjob
│ │ │ │ ├── RecoverableException.java
│ │ │ │ ├── ShardedJobServiceFactory.java
│ │ │ │ ├── RejectRequestException.java
│ │ │ │ ├── pipeline
│ │ │ │ │ ├── DeleteShardedJob.java
│ │ │ │ │ ├── FinalizeShardedJob.java
│ │ │ │ │ └── AbstractShardedJob.java
│ │ │ │ ├── ShardFailureException.java
│ │ │ │ ├── JobFailureException.java
│ │ │ │ ├── ShardedJobController.java
│ │ │ │ ├── ShardedJobHandler.java
│ │ │ │ ├── InProcessShardedJobRunner.java
│ │ │ │ ├── ShardedJobState.java
│ │ │ │ ├── ShardedJobServiceImpl.java
│ │ │ │ ├── IncrementalTask.java
│ │ │ │ └── ShardedJobService.java
│ │ │ ├── BaseContext.java
│ │ │ ├── BigqueryFieldMarshaller.java
│ │ │ ├── MapOnlyMapperContextImpl.java
│ │ │ ├── ReducerContextImpl.java
│ │ │ ├── sort
│ │ │ │ ├── MergeContext.java
│ │ │ │ ├── SortContext.java
│ │ │ │ └── LexicographicalComparator.java
│ │ │ ├── MapperContextImpl.java
│ │ │ ├── pipeline
│ │ │ │ ├── ResultAndStatus.java
│ │ │ │ ├── ExamineStatusAndReturnResult.java
│ │ │ │ ├── DeleteFilesJob.java
│ │ │ │ ├── ShardedJob.java
│ │ │ │ └── CleanupPipelineJob.java
│ │ │ ├── util
│ │ │ │ ├── BigQueryDataTypeUtil.java
│ │ │ │ ├── SplitUtil.java
│ │ │ │ └── SerializableValue.java
│ │ │ ├── HashingSharder.java
│ │ │ ├── ReducerInputs.java
│ │ │ ├── BigQueryConstants.java
│ │ │ ├── CountersImpl.java
│ │ │ ├── BaseShardContext.java
│ │ │ └── KeyValueMarshaller.java
│ │ │ ├── WorkerContext.java
│ │ │ ├── BigQueryFieldMode.java
│ │ │ ├── BigQueryIgnore.java
│ │ │ ├── MapperContext.java
│ │ │ ├── BigQueryMarshaller.java
│ │ │ ├── MapReduceResult.java
│ │ │ ├── ReducerInput.java
│ │ │ ├── CorruptDataException.java
│ │ │ ├── Counters.java
│ │ │ ├── Counter.java
│ │ │ ├── Sharder.java
│ │ │ ├── inputs
│ │ │ ├── DatastoreInputReader.java
│ │ │ ├── DatastoreKeyInputReader.java
│ │ │ ├── DatastoreInput.java
│ │ │ ├── DatastoreKeyInput.java
│ │ │ ├── UnmarshallingInput.java
│ │ │ ├── NoInput.java
│ │ │ ├── ForwardingInputReader.java
│ │ │ ├── UnmarshallingInputReader.java
│ │ │ ├── GoogleCloudStorageLevelDbInput.java
│ │ │ ├── InMemoryInput.java
│ │ │ └── BlobstoreInput.java
│ │ │ ├── BaseMapper.java
│ │ │ ├── MapReduceJobException.java
│ │ │ ├── reducers
│ │ │ ├── ValueProjectionReducer.java
│ │ │ ├── KeyProjectionReducer.java
│ │ │ └── NoReducer.java
│ │ │ ├── ShardContext.java
│ │ │ ├── bigqueryjobs
│ │ │ └── BigQueryLoadJobReference.java
│ │ │ ├── BigQueryDataField.java
│ │ │ ├── Mapper.java
│ │ │ ├── mappers
│ │ │ ├── KeyProjectionMapper.java
│ │ │ └── IdentityMapper.java
│ │ │ ├── outputs
│ │ │ ├── GoogleCloudStorageLevelDbOutputWriter.java
│ │ │ ├── MarshallingOutputWriter.java
│ │ │ ├── InMemoryOutputWriter.java
│ │ │ ├── ForwardingOutputWriter.java
│ │ │ ├── BigQueryStoreResult.java
│ │ │ ├── SizeSegmentingOutputWriter.java
│ │ │ ├── InMemoryOutput.java
│ │ │ ├── SliceSegmentingOutputWriter.java
│ │ │ ├── NoOutput.java
│ │ │ ├── LevelDbOutput.java
│ │ │ ├── ItemSegmentingOutputWriter.java
│ │ │ └── MarshallingOutput.java
│ │ │ ├── KeyValue.java
│ │ │ ├── Marshaller.java
│ │ │ ├── Reducer.java
│ │ │ └── Input.java
│ │ └── resources
│ │ └── ui
│ │ ├── overview.html
│ │ ├── detail.html
│ │ ├── base.css
│ │ └── jquery.json-2.2.min.js
└── NOTICE
├── img
├── detail.png
└── overview.png
├── python
├── demo
│ ├── queue.yaml
│ ├── static
│ │ ├── images
│ │ │ └── favicon.ico
│ │ └── js
│ │ │ └── custom.js
│ ├── app.yaml
│ └── mapreduce.yaml
├── src
│ ├── MANIFEST.in
│ ├── requirements.txt
│ ├── todelete.txt
│ ├── mapreduce
│ │ ├── include.yaml
│ │ ├── third_party
│ │ │ └── __init__.py
│ │ ├── api
│ │ │ ├── __init__.py
│ │ │ └── map_job
│ │ │ │ ├── __init__.py
│ │ │ │ └── datastore_input_reader.py
│ │ ├── lib
│ │ │ ├── __init__.py
│ │ │ └── input_reader
│ │ │ │ └── __init__.py
│ │ ├── tools
│ │ │ └── __init__.py
│ │ ├── operation
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── counters.py
│ │ │ └── db.py
│ │ ├── __init__.py
│ │ ├── pipeline_base.py
│ │ └── static
│ │ │ ├── overview.html
│ │ │ ├── detail.html
│ │ │ ├── base.css
│ │ │ └── jquery.json-2.2.min.js
│ ├── README
│ └── setup.py
└── test
│ ├── mapreduce
│ ├── test_data
│ │ └── appengine_config.py
│ ├── api
│ │ └── map_job
│ │ │ ├── input_reader_test.py
│ │ │ ├── output_writer_test.py
│ │ │ └── map_job_config_test.py
│ └── operation
│ │ ├── counters_test.py
│ │ └── db_test.py
│ └── testlib
│ └── __init__.py
├── .travis.yml
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | MANIFEST
2 | *.pyc
3 | .idea
4 | *.iml
5 |
--------------------------------------------------------------------------------
/java/example/default/WEB-INF/logging.properties:
--------------------------------------------------------------------------------
1 | .level = INFO
2 |
--------------------------------------------------------------------------------
/java/example/mapreduce/WEB-INF/logging.properties:
--------------------------------------------------------------------------------
1 | .level = INFO
2 |
--------------------------------------------------------------------------------
/java/example/shuffler/WEB-INF/logging.properties:
--------------------------------------------------------------------------------
1 | .level = INFO
2 |
--------------------------------------------------------------------------------
/img/detail.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/appengine-mapreduce/HEAD/img/detail.png
--------------------------------------------------------------------------------
/img/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/appengine-mapreduce/HEAD/img/overview.png
--------------------------------------------------------------------------------
/java/example/default/WEB-INF/dispatch.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/python/demo/queue.yaml:
--------------------------------------------------------------------------------
1 | queue:
2 | - name: default
3 |   rate: 50/s
4 |   bucket_size: 100
5 |   max_concurrent_requests: 100
6 |
7 |
--------------------------------------------------------------------------------
/python/src/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include distribute_setup.py
2 | include mapreduce/include.yaml
3 | recursive-include mapreduce/static *.html *.css *.js
4 |
--------------------------------------------------------------------------------
/python/src/requirements.txt:
--------------------------------------------------------------------------------
1 | GoogleAppEngineCloudStorageClient
2 | GoogleAppEnginePipeline
3 | Graphy
4 | simplejson
5 | mock
6 | mox
7 | pg8000
8 |
--------------------------------------------------------------------------------
/python/demo/static/images/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudPlatform/appengine-mapreduce/HEAD/python/demo/static/images/favicon.ico
--------------------------------------------------------------------------------
/java/example/default/WEB-INF/queue.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <queue-entries>
3 |   <queue>
4 |     <name>mapreduce-workers</name>
5 |     <rate>30/s</rate>
6 |   </queue>
7 | </queue-entries>
--------------------------------------------------------------------------------
/python/src/todelete.txt:
--------------------------------------------------------------------------------
1 | GoogleAppEngineCloudStorageClient
2 | cloudstorage
3 | GoogleAppEnginePipeline
4 | pipeline
5 | Graphy
6 | graphy
7 | simplejson
8 | mock
9 | mox
10 | pg8000
11 | six
12 |
--------------------------------------------------------------------------------
/python/src/mapreduce/include.yaml:
--------------------------------------------------------------------------------
1 | handlers:
2 | - url: /mapreduce/pipeline/images
3 |   static_dir: pipeline/ui/images
4 |
5 | - url: /mapreduce(/.*)?
6 |   script: mapreduce.main.APP
7 |   login: admin
8 |
9 |
--------------------------------------------------------------------------------
/java/example/META-INF/appengine-application.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <appengine-application xmlns="http://appengine.google.com/ns/1.0">
3 |   <application>mapreduce-example</application>
4 | </appengine-application>
--------------------------------------------------------------------------------
/python/test/mapreduce/test_data/appengine_config.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Sample user config for test."""
3 |
4 | # pylint: disable=g-bad-name
5 | mapreduce_SHARD_MAX_ATTEMPTS = 5
6 | mapreduce_QUEUE_NAME = "foo"
7 | mapreduce_BASE_PATH = "/my-mapreduce"
8 |
--------------------------------------------------------------------------------
/java/README:
--------------------------------------------------------------------------------
1 | To build the library:
2 | $ ant
3 | Output files will be in dist/lib.
4 |
5 | To run the test suite:
6 | $ ant test
7 |
8 | To build the example mapreduce:
9 | $ ant compile_example
10 | Output files will be in example/war, which you can then upload with appcfg.sh
11 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/ParameterizedClass.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | public class ParameterizedClass<T> {
4 | T id;
5 | /**
6 | * @param id
7 | */
8 | public ParameterizedClass(T id) {
9 | this.id = id;
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/Context.java:
--------------------------------------------------------------------------------
1 | // Copyright 2014 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 |
6 | /**
7 | * MapReduce context.
8 | */
9 | public interface Context {
10 |
11 | /**
12 | * Returns the Id for the job.
13 | */
14 | String getJobId();
15 | }
16 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/MapOnlyMapperContext.java:
--------------------------------------------------------------------------------
1 | // Copyright 2014 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 |
6 | /**
7 | * Context for {@link MapOnlyMapper} execution.
8 | *
9 | * @param <O> type of output values produced by the mapper
10 | */
11 | public interface MapOnlyMapperContext<O> extends WorkerContext<O> {
12 | }
13 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/AbstractClassSample.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | public abstract class AbstractClassSample {
4 | int id;
5 | String name;
6 |
7 | /**
8 | * @param id
9 | * @param name
10 | */
11 | public AbstractClassSample(int id, String name) {
12 | this.id = id;
13 | this.name = name;
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/Child.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 |
4 | public class Child {
5 | public String fullName;
6 | public int age;
7 | /**
8 | * @param fullName
9 | * @param age
10 | */
11 | public Child(String fullName, int age) {
12 | this.fullName = fullName;
13 | this.age = age;
14 | }
15 |
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/java/example/src/com/google/appengine/demos/mapreduce/bigqueryload/SampleNestedRecord.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.demos.mapreduce.bigqueryload;
2 |
3 | public class SampleNestedRecord {
4 | int col11;
5 | String col12;
6 | /**
7 | * @param col11
8 | * @param col12
9 | */
10 | public SampleNestedRecord(int col11, String col12) {
11 | this.col11 = col11;
12 | this.col12 = col12;
13 | }
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/PhoneNumber.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | public class PhoneNumber {
4 | public int areaCode;
5 | public int number;
6 | /**
7 | * @param areaCode
8 | * @param number
9 | */
10 | public PhoneNumber(int areaCode, int number) {
11 | this.areaCode = areaCode;
12 | this.number = number;
13 | }
14 |
15 | }
16 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/SampleClassWithNestedCollection.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | import java.util.List;
4 |
5 | public class SampleClassWithNestedCollection {
6 | List<List<String>> ll;
7 |
8 | /**
9 | * @param ll
10 | */
11 | public SampleClassWithNestedCollection(List<List<String>> ll) {
12 | this.ll = ll;
13 | }
14 |
15 |
16 | }
17 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/ReducerContext.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 |
6 | /**
7 | * Context for {@link Reducer} execution.
8 | *
9 | * @author ohler@google.com (Christian Ohler)
10 | *
11 | * @param <O> type of output values produced by the reducer
12 | */
13 | public interface ReducerContext<O> extends WorkerContext<O> {
14 | }
15 |
--------------------------------------------------------------------------------
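
Reducer.java itself is not reproduced in this archive, so the following is only a hedged sketch of how a ReducerContext ends up being used: it assumes Reducer<K, V, O> declares reduce(K key, ReducerInput<V> values) plus an emit(O) convenience that forwards to the context, mirroring the mapper side shown elsewhere in this archive. The class and its type arguments are illustrative, not library code.

import com.google.appengine.tools.mapreduce.KeyValue;
import com.google.appengine.tools.mapreduce.Reducer;
import com.google.appengine.tools.mapreduce.ReducerInput;

// Hypothetical reducer: sums the Long values seen for each String key.
public class SumReducer extends Reducer<String, Long, KeyValue<String, Long>> {

  private static final long serialVersionUID = 1L;

  @Override
  public void reduce(String key, ReducerInput<Long> values) {
    long total = 0;
    while (values.hasNext()) {
      total += values.next();
    }
    emit(KeyValue.of(key, total));  // assumed emit(O) helper, delegating to the ReducerContext
  }
}
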
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/IncrementalTaskWithContext.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl;
2 |
3 | import com.google.appengine.tools.mapreduce.impl.shardedjob.IncrementalTask;
4 |
5 | /**
6 | * A simple extension of {@link IncrementalTask} to add information for display in a UI.
7 | */
8 | public interface IncrementalTaskWithContext extends IncrementalTask {
9 |
10 | IncrementalTaskContext getContext();
11 | }
12 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/SampleClassWithNonParametricList.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | import java.util.List;
4 |
5 | public class SampleClassWithNonParametricList {
6 | @SuppressWarnings("rawtypes")
7 | List l;
8 |
9 | /**
10 | * @param l
11 | */
12 | @SuppressWarnings("rawtypes")
13 | public SampleClassWithNonParametricList(List l) {
14 | this.l = l;
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/WorkerContext.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 |
6 | /**
7 | * Context for each worker (mapper or reducer) shard.
8 | *
9 | * @param <O> type of output values produced by the worker
10 | */
11 | public interface WorkerContext<O> extends ShardContext {
12 |
13 | /**
14 | * Emits a value to the output.
15 | */
16 | void emit(O value);
17 | }
18 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/BigQueryFieldMode.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce;
2 |
3 | /**
4 | * The supported bigquery field modes.
5 | */
6 | public enum BigQueryFieldMode {
7 |
8 | REPEATED("repeated"), NULLABLE("nullable"), REQUIRED("required");
9 |
10 | private final String value;
11 |
12 | private BigQueryFieldMode(String value) {
13 | this.value = value;
14 | }
15 |
16 | public String getValue() {
17 | return value;
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/BigQueryIgnore.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce;
2 |
3 | import java.lang.annotation.ElementType;
4 | import java.lang.annotation.Retention;
5 | import java.lang.annotation.RetentionPolicy;
6 | import java.lang.annotation.Target;
7 |
8 | /**
9 | * An annotation for fields that should not be part of the BigQuery output.
10 | */
11 | @Retention(RetentionPolicy.RUNTIME)
12 | @Target(ElementType.FIELD)
13 | public @interface BigQueryIgnore {
14 | }
15 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/RecoverableException.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.shardedjob;
2 |
3 | /**
4 | * An exception that indicates it is safe to restart a slice.
5 | */
6 | public class RecoverableException extends RuntimeException {
7 |
8 | private static final long serialVersionUID = -1527377663569164133L;
9 |
10 | public RecoverableException(String message, Throwable rootCause) {
11 | super(message, rootCause);
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/SimpleJsonWithWrapperTypes.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 |
4 | public class SimpleJsonWithWrapperTypes {
5 | Integer id;
6 | String name;
7 | Float value;
8 | /**
9 | * @param id
10 | * @param name
11 | * @param value
12 | */
13 | public SimpleJsonWithWrapperTypes(Integer id, String name, Float value) {
14 | this.id = id;
15 | this.name = name;
16 | this.value = value;
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/BaseContext.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl;
2 |
3 | import com.google.appengine.tools.mapreduce.Context;
4 |
5 |
6 | /**
7 | * Base class for all Context implementations.
8 | */
9 | public class BaseContext implements Context {
10 |
11 | private final String jobId;
12 |
13 | public BaseContext(String jobId) {
14 | this.jobId = jobId;
15 | }
16 |
17 | @Override
18 | public String getJobId() {
19 | return jobId;
20 | }
21 | }
--------------------------------------------------------------------------------
/java/example/default/WEB-INF/appengine-web.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <appengine-web-app xmlns="http://appengine.google.com/ns/1.0">
3 |   <application>mapreduce-example</application>
4 |   <module>default</module>
5 |   <version>one</version>
6 |
7 |   <threadsafe>true</threadsafe>
8 |
9 |   <instance-class>F2</instance-class>
10 |
11 | </appengine-web-app>
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/ClassExtendingAbstract.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | /**
4 | * Test model that extends {@link AbstractClassSample}.
5 | */
6 | public class ClassExtendingAbstract extends AbstractClassSample {
7 | int value;
8 |
9 | /**
10 | * @param id
11 | * @param name
12 | * @param value
13 | */
14 | public ClassExtendingAbstract(int id, String name, int value) {
15 | super(id, name);
16 | this.value = value;
17 | }
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobServiceFactory.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.impl.shardedjob;
4 |
5 | /**
6 | * Provides {@link ShardedJobService} implementations.
7 | *
8 | * @author ohler@google.com (Christian Ohler)
9 | */
10 | public class ShardedJobServiceFactory {
11 |
12 | private ShardedJobServiceFactory() {}
13 |
14 | public static ShardedJobService getShardedJobService() {
15 | return new ShardedJobServiceImpl();
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/BigqueryFieldMarshaller.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl;
2 |
3 | import com.google.api.services.bigquery.model.TableFieldSchema;
4 |
5 | import java.lang.reflect.Field;
6 |
7 |
8 | /**
9 | * Defines how a {@link Field} should be interpreted and marshalled while generating its
10 | * {@link TableFieldSchema} for loading data into BigQuery.
11 | */
12 | public interface BigqueryFieldMarshaller {
13 | Object getFieldValue(Field field, Object object);
14 |
15 | Class<?> getSchemaType();
16 | }
17 |
--------------------------------------------------------------------------------
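
As a rough illustration of the contract above, a marshaller that exposes a java.util.Date field to BigQuery as a string could look like the sketch below. The class name and the choice of Date/String are assumptions for the example, not part of the library.

import com.google.appengine.tools.mapreduce.impl.BigqueryFieldMarshaller;

import java.lang.reflect.Field;
import java.util.Date;

// Illustrative only: reads a Date field reflectively and reports String as its BigQuery schema type.
public class DateAsStringMarshaller implements BigqueryFieldMarshaller {

  @Override
  public Object getFieldValue(Field field, Object object) {
    try {
      field.setAccessible(true);
      Date date = (Date) field.get(object);
      return date == null ? null : date.toString();
    } catch (IllegalAccessException e) {
      throw new RuntimeException("Could not read field " + field.getName(), e);
    }
  }

  @Override
  public Class<?> getSchemaType() {
    return String.class;
  }
}
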
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/MapOnlyMapperContextImpl.java:
--------------------------------------------------------------------------------
1 | // Copyright 2014 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.impl;
4 |
5 | import com.google.appengine.tools.mapreduce.MapOnlyMapperContext;
6 | import com.google.appengine.tools.mapreduce.OutputWriter;
7 |
8 | /**
9 | */
10 | class MapOnlyMapperContextImpl<O> extends BaseShardContext<O> implements MapOnlyMapperContext<O> {
11 |
12 | MapOnlyMapperContextImpl(IncrementalTaskContext c, OutputWriter<O> output) {
13 | super(c, output);
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/ClassWithArray.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | /**
4 | * Test model with an array-valued field.
5 | */
6 | public class ClassWithArray {
7 | public int id;
8 | public String name;
9 | public String[] values;
10 | /**
11 | * @param id
12 | * @param name
13 | * @param values
14 | */
15 | public ClassWithArray(int id, String name, String[] values) {
16 | this.id = id;
17 | this.name = name;
18 | this.values = values;
19 | }
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/python/demo/app.yaml:
--------------------------------------------------------------------------------
1 | application: mapreduce-demo
2 | version: 1
3 | runtime: python27
4 | api_version: 1
5 | threadsafe: no
6 |
7 | handlers:
8 | - url: /favicon.ico
9 |   static_files: static/images/favicon.ico
10 |   upload: static/images/favicon.ico
11 |
12 | - url: /static/js/custom.js
13 |   static_files: static/js/custom.js
14 |   upload: static/js/custom.js
15 |
16 | - url: .*
17 |   script: main.app
18 |   login: required
19 |
20 | includes:
21 | - mapreduce/include.yaml
22 |
23 | libraries:
24 | - name: webapp2
25 |   version: "2.5.1"
26 | - name: jinja2
27 |   version: "2.6"
28 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/MapperContext.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 | package com.google.appengine.tools.mapreduce;
3 |
4 |
5 | /**
6 | * A context for mapper execution. Provides everything that might be needed by a mapper function.
7 | *
8 | *
9 | * @param <K> type of keys produced by the mapper
10 | * @param <V> type of values produced by the mapper
11 | */
12 | public interface MapperContext<K, V> extends WorkerContext<KeyValue<K, V>> {
13 |
14 | /**
15 | * Emits a key and a value to the output.
16 | */
17 | void emit(K key, V value);
18 | }
19 |
--------------------------------------------------------------------------------
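
For orientation, here is a minimal word-count style mapper that exercises this context through Mapper's emit helper, in the same way SeedToRandomMapper does further down in this archive. The Mapper type-parameter order <input, key, value> is inferred from that example; the class itself is hypothetical.

import com.google.appengine.tools.mapreduce.Mapper;

// Hypothetical mapper: splits a line of text and emits (word, 1) pairs.
public class WordCountMapper extends Mapper<String, String, Long> {

  private static final long serialVersionUID = 1L;

  @Override
  public void map(String line) {
    for (String word : line.split("\\s+")) {
      if (!word.isEmpty()) {
        emit(word, 1L);  // forwarded to MapperContext.emit(key, value)
      }
    }
  }
}
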
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/Father.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | import java.util.List;
4 |
5 | /**
6 | * Test class for testing BigQueryDataMarshaller
7 | */
8 | public class Father {
9 | public boolean married;
10 | public String name;
11 | public List<Child> sons;
12 | /**
13 | * @param married
14 | * @param name
15 | * @param sons
16 | */
17 | public Father(boolean married, String name, List<Child> sons) {
18 | this.married = married;
19 | this.name = name;
20 | this.sons = sons;
21 | }
22 |
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/Man.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | import com.google.appengine.tools.mapreduce.BigQueryIgnore;
4 |
5 | /**
6 | * Test class for BigQueryMarshaller testing
7 | */
8 |
9 | public class Man {
10 | @BigQueryIgnore
11 | public int id;
12 | public String name;
13 | public String gender;
14 |
15 | /**
16 | * @param id
17 | * @param name
18 | * @param gender
19 | */
20 | public Man(int id, String name, String gender) {
21 | this.id = id;
22 | this.name = name;
23 | this.gender = gender;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/SimpleJson.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | import com.google.appengine.tools.mapreduce.BigQueryDataField;
4 | import com.google.appengine.tools.mapreduce.BigQueryFieldMode;
5 |
6 | /**
7 | * Simple class for testing
8 | */
9 | public class SimpleJson {
10 | @BigQueryDataField(mode = BigQueryFieldMode.REQUIRED)
11 | public String name;
12 | public int id;
13 | /**
14 | * @param name
15 | * @param id
16 | */
17 | public SimpleJson(String name, int id) {
18 | this.name = name;
19 | this.id = id;
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/SimplAnnotatedJson.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 | import com.google.appengine.tools.mapreduce.BigQueryDataField;
4 |
5 | public class SimplAnnotatedJson {
6 | @BigQueryDataField(name = "niceName")
7 | public String nameRandom;
8 | public String id;
9 | public int intField;
10 | /**
11 | * @param nameRandom
12 | * @param id
13 | * @param intField
14 | */
15 | public SimplAnnotatedJson(String nameRandom, String id, int intField) {
16 | this.nameRandom = nameRandom;
17 | this.id = id;
18 | this.intField = intField;
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/ReducerContextImpl.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.impl;
4 |
5 | import com.google.appengine.tools.mapreduce.OutputWriter;
6 | import com.google.appengine.tools.mapreduce.ReducerContext;
7 |
8 | /**
9 | * @author ohler@google.com (Christian Ohler)
10 | *
11 | * @param <O> type of output values produced by the reducer
12 | */
13 | class ReducerContextImpl<O> extends BaseShardContext<O> implements ReducerContext<O> {
14 |
15 | ReducerContextImpl(IncrementalTaskContext c, OutputWriter<O> output) {
16 | super(c, output);
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/python/test/testlib/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2010 Google Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/python/src/mapreduce/third_party/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2010 Google Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
--------------------------------------------------------------------------------
/python/src/mapreduce/api/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2015 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
--------------------------------------------------------------------------------
/python/src/mapreduce/lib/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2015 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
--------------------------------------------------------------------------------
/python/src/mapreduce/tools/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2015 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
--------------------------------------------------------------------------------
/python/demo/mapreduce.yaml:
--------------------------------------------------------------------------------
1 | mapreduce:
2 | - name: Make messages lowercase
3 |   params:
4 |   - name: done_callback
5 |     value: /done
6 |   mapper:
7 |     handler: main.lower_case_posts
8 |     input_reader: mapreduce.input_readers.DatastoreInputReader
9 |     params:
10 |     - name: entity_kind
11 |       default: main.Post
12 |     - name: shard_count
13 |       default: 4
14 | - name: Make messages upper case
15 |   params:
16 |   - name: done_callback
17 |     value: /done
18 |   mapper:
19 |     handler: main.upper_case_posts
20 |     input_reader: mapreduce.input_readers.DatastoreInputReader
21 |     params:
22 |     - name: entity_kind
23 |       default: main.Post
24 |     - name: shard_count
25 |       default: 4
26 |
--------------------------------------------------------------------------------
/python/src/README:
--------------------------------------------------------------------------------
1 | AppEngine Mapreduce library
2 | ===========================
3 |
4 | Official site: https://github.com/GoogleCloudPlatform/appengine-mapreduce
5 |
6 | Check the site for up to date status, latest version, getting started & user
7 | guides and other documentation.
8 |
9 | Archive contents:
10 | - python : python version of the library resides here
11 | - build.sh : use this to run tests for python library, build and run demo app
12 | - src : python source code for mapreduce library
13 | - tests : tests for mapreduce library
14 | - demo : a demo application that uses the MapReduce library.
15 | - java : java version of the library
16 | - build.xml : ant build file
17 |
--------------------------------------------------------------------------------
/python/test/mapreduce/api/map_job/input_reader_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
4 | # testutil must be imported before mock.
5 | # pylint: disable=unused-import
6 | # pylint: disable=g-bad-import-order
7 | from testlib import testutil
8 |
9 | import mock
10 | import unittest
11 |
12 | from mapreduce.api import map_job
13 |
14 |
15 | class InputReaderTest(unittest.TestCase):
16 |
17 |   def testBeginEndSlice(self):
18 |     reader = map_job.InputReader()
19 |     slice_ctx = mock.Mock()
20 |     reader.begin_slice(slice_ctx)
21 |     self.assertEqual(slice_ctx, reader._slice_ctx)
22 |     reader.end_slice(slice_ctx)
23 |     self.assertEqual(None, reader._slice_ctx)
24 |
25 |
26 | if __name__ == '__main__':
27 |   unittest.main()
28 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/BigQueryMarshaller.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce;
2 |
3 | import com.google.api.services.bigquery.model.TableSchema;
4 |
5 |
6 | /**
7 | * An implementation of this class should serialize objects of type T into newline-separated
8 | * JSON as expected by BigQuery load jobs. It should also provide an implementation for
9 | * generating the schema ({@link TableSchema}) of the BigQuery table.
10 | *
11 | * @param <T> type of the object to be marshalled
12 | */
13 | public abstract class BigQueryMarshaller<T> extends Marshaller<T> {
14 | private static final long serialVersionUID = 5170161329883029808L;
15 |
16 | public abstract TableSchema getSchema();
17 | }
18 |
--------------------------------------------------------------------------------
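
For a sense of what getSchema() is expected to return, the snippet below builds a TableSchema with the google-api-services-bigquery model classes; the field names and types are made up for the example and are not prescribed by the library.

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableSchema;

import java.util.Arrays;

// Illustrative only: the kind of schema a BigQueryMarshaller implementation might build.
public class SchemaSketch {

  public static TableSchema personSchema() {
    TableFieldSchema name = new TableFieldSchema().setName("name").setType("STRING").setMode("REQUIRED");
    TableFieldSchema id = new TableFieldSchema().setName("id").setType("INTEGER");
    return new TableSchema().setFields(Arrays.asList(name, id));
  }
}
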
/java/src/main/java/com/google/appengine/tools/mapreduce/MapReduceResult.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 | import java.io.Serializable;
6 |
7 | /**
8 | * Result of a {@link MapReduceJob} or {@link MapJob}.
9 | *
10 | * @author ohler@google.com (Christian Ohler)
11 | *
12 | * @param <R> type of result produced by the {@link Output}
13 | */
14 | public interface MapReduceResult<R> extends Serializable {
15 |
16 | /**
17 | * Returns the result from {@link Output#finish} or {@code null} if completed unsuccessfully.
18 | */
19 | R getOutputResult();
20 |
21 | /**
22 | * Returns the counter values at the end of the job.
23 | */
24 | Counters getCounters();
25 | }
26 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/ReducerInput.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 | import java.util.Iterator;
6 |
7 | /**
8 | * Enumerates the reducer's input values for a given key.
9 | *
10 | * @author ohler@google.com (Christian Ohler)
11 | *
12 | * @param <I> type of values provided by this input
13 | */
14 | // Not serializable; since reduce() receives an iterator, we will never end a
15 | // slice while an iterator is active.
16 | public abstract class ReducerInput<I> implements Iterator<I> {
17 |
18 | @Override
19 | public void remove() {
20 | throw new UnsupportedOperationException("Can't remove() on ReducerInput: " + this);
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
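
A concrete ReducerInput usually just adapts some underlying source of values; as a minimal sketch, an in-memory variant backed by a List could look like this (the class is hypothetical, not part of the library).

import com.google.appengine.tools.mapreduce.ReducerInput;

import java.util.Iterator;
import java.util.List;

// Hypothetical ReducerInput that simply delegates to an in-memory list's iterator.
public class InMemoryReducerInput<I> extends ReducerInput<I> {

  private final Iterator<I> iterator;

  public InMemoryReducerInput(List<I> values) {
    this.iterator = values.iterator();
  }

  @Override
  public boolean hasNext() {
    return iterator.hasNext();
  }

  @Override
  public I next() {
    return iterator.next();
  }
}
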
/python/test/mapreduce/api/map_job/output_writer_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 |
4 | # testutil must be imported before mock.
5 | # pylint: disable=unused-import
6 | # pylint: disable=g-bad-import-order
7 | from testlib import testutil
8 |
9 | import mock
10 | import unittest
11 |
12 | from mapreduce.api.map_job import output_writer
13 |
14 |
15 | class OutputWriterTest(unittest.TestCase):
16 |
17 |   def testBeginEndSlice(self):
18 |     writer = output_writer.OutputWriter()
19 |     slice_ctx = mock.Mock()
20 |     writer.begin_slice(slice_ctx)
21 |     self.assertEqual(slice_ctx, writer._slice_ctx)
22 |     writer.end_slice(slice_ctx)
23 |     self.assertEqual(None, writer._slice_ctx)
24 |
25 |
26 | if __name__ == '__main__':
27 |   unittest.main()
28 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/CorruptDataException.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce;
2 |
3 | /**
4 | * The exception generated if any of the data appears to be corrupt. This should cause the MapReduce
5 | * to fail.
6 | */
7 | public class CorruptDataException extends RuntimeException {
8 |
9 | private static final long serialVersionUID = 5053922369001406602L;
10 |
11 | public CorruptDataException() {
12 | super();
13 | }
14 |
15 | public CorruptDataException(String message) {
16 | super(message);
17 | }
18 |
19 | public CorruptDataException(String message, Throwable cause) {
20 | super(message, cause);
21 | }
22 |
23 | public CorruptDataException(Throwable cause) {
24 | super(cause);
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/RejectRequestException.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.shardedjob;
2 |
3 |
4 | /**
5 | * An exception thrown to reject the current request with an error code (50X). This will usually
6 | * cause the task queue to retry the request on another instance.
7 | *
8 | * For internal use only. User code cannot safely depend on this class.
9 | */
10 | public class RejectRequestException extends RuntimeException {
11 |
12 | private static final long serialVersionUID = 5938529235133524752L;
13 |
14 | public RejectRequestException(String reason) {
15 | super(reason);
16 | }
17 |
18 | public RejectRequestException(String reason, Exception e) {
19 | super(reason, e);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/java/example/src/com/google/appengine/demos/mapreduce/randomcollisions/SeedToRandomMapper.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.demos.mapreduce.randomcollisions;
2 |
3 | import com.google.appengine.tools.mapreduce.Mapper;
4 | import com.google.common.primitives.Ints;
5 |
6 | import java.util.Random;
7 |
8 | /**
9 | * Maps each incoming seed to the first number generated by Java's {@link Random} for that seed.
10 | */
11 | public final class SeedToRandomMapper extends Mapper<Long, Integer, Integer> {
12 |
13 | private static final long serialVersionUID = -3070710020513042698L;
14 | @Override
15 | // [START map_example]
16 | public void map(Long sequence) {
17 | Random r = new Random(sequence);
18 | emit(r.nextInt(), Ints.checkedCast(sequence));
19 | }
20 | // [END map_example]
21 | }
22 |
--------------------------------------------------------------------------------
/java/example/src/com/google/appengine/demos/mapreduce/bigqueryload/SampleTable.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.demos.mapreduce.bigqueryload;
2 |
3 | import java.util.Date;
4 | import java.util.List;
5 |
6 | public class SampleTable {
7 | Long colNum;
8 | int col1;
9 | String col2;
10 | List col3;
11 | String[] col4;
12 | SampleNestedRecord col5;
13 | Date col6;
14 |
15 | public SampleTable(Long colNum,
16 | int col1,
17 | String col2,
18 | List col3,
19 | String[] col4,
20 | SampleNestedRecord col5,
21 | Date col6) {
22 | this.colNum = colNum;
23 | this.col1 = col1;
24 | this.col2 = col2;
25 | this.col3 = col3;
26 | this.col4 = col4;
27 | this.col5 = col5;
28 | this.col6 = col6;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/sort/MergeContext.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.sort;
2 |
3 | import com.google.appengine.tools.mapreduce.KeyValue;
4 | import com.google.appengine.tools.mapreduce.OutputWriter;
5 | import com.google.appengine.tools.mapreduce.impl.BaseShardContext;
6 | import com.google.appengine.tools.mapreduce.impl.IncrementalTaskContext;
7 |
8 | import java.nio.ByteBuffer;
9 | import java.util.List;
10 |
11 | /**
12 | * Provides a context for merging.
13 | *
14 | */
15 | public class MergeContext extends BaseShardContext<KeyValue<ByteBuffer, List<ByteBuffer>>> {
16 |
17 | MergeContext(IncrementalTaskContext c,
18 | OutputWriter<KeyValue<ByteBuffer, List<ByteBuffer>>> output) {
19 | super(c, output);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/java/example/META-INF/application.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <application xmlns="http://java.sun.com/xml/ns/javaee"
3 |              xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |              version="5">
5 |
6 |   <display-name>Test Application</display-name>
7 |   <description>My Test Map Reduce Java App</description>
8 |
9 |   <module>
10 |     <web>
11 |       <web-uri>default</web-uri>
12 |       <context-root>default</context-root>
13 |     </web>
14 |   </module>
15 |
16 |   <module>
17 |     <web>
18 |       <web-uri>mapreduce</web-uri>
19 |       <context-root>mapreduce</context-root>
20 |     </web>
21 |   </module>
22 |
23 | </application>
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/Counters.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 | package com.google.appengine.tools.mapreduce;
3 |
4 | import java.io.Serializable;
5 |
6 | /**
7 | * Collection of all counters.
8 | *
9 | */
10 | public interface Counters extends Serializable {
11 |
12 | /**
13 | * @param name counter name
14 | * @return counter with a given name. Creates new counter with 0 value if it doesn't exist.
15 | */
16 | Counter getCounter(String name);
17 |
18 | /**
19 | * @return iterable over all created counters.
20 | */
21 | Iterable<? extends Counter> getCounters();
22 |
23 | /**
24 | * @param other Another Counters object whose counters should all be added to this one.
25 | */
26 | void addAll(Counters other);
27 | }
28 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/MapperContextImpl.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.impl;
4 |
5 | import com.google.appengine.tools.mapreduce.KeyValue;
6 | import com.google.appengine.tools.mapreduce.MapperContext;
7 | import com.google.appengine.tools.mapreduce.OutputWriter;
8 |
9 | /**
10 | * @author ohler@google.com (Christian Ohler)
11 | */
12 | class MapperContextImpl extends BaseShardContext>
13 | implements MapperContext {
14 |
15 | MapperContextImpl(IncrementalTaskContext c, OutputWriter> output) {
16 | super(c, output);
17 | }
18 |
19 | @Override
20 | public void emit(K key, V value) {
21 | emit(KeyValue.of(key, value));
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/sort/SortContext.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.sort;
2 |
3 | import com.google.appengine.tools.mapreduce.KeyValue;
4 | import com.google.appengine.tools.mapreduce.OutputWriter;
5 | import com.google.appengine.tools.mapreduce.impl.BaseShardContext;
6 | import com.google.appengine.tools.mapreduce.impl.IncrementalTaskContext;
7 |
8 | import java.nio.ByteBuffer;
9 | import java.util.List;
10 |
11 | /**
12 | * Provides a context for the in memory sort.
13 | *
14 | */
15 | public class SortContext extends
16 | BaseShardContext<KeyValue<ByteBuffer, List<ByteBuffer>>> {
17 |
18 | SortContext(IncrementalTaskContext c,
19 | OutputWriter<KeyValue<ByteBuffer, List<ByteBuffer>>> output) {
20 | super(c, output);
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 | language: java
3 | jdk:
4 |   - openjdk7
5 | before_install:
6 |   - cd java
7 |   - mvn clean
8 |   - git clone -b travis `git config --get remote.origin.url` target/travis
9 |   - cp target/travis/settings.xml ~/.m2/settings.xml
10 | install: mvn install -DskipTests=true
11 | script: mvn compile
12 | branches:
13 |   only:
14 |     - master
15 | after_success:
16 |   - mvn site --settings target/travis/settings.xml
17 | env:
18 |   global:
19 |     - secure: DCU0tg/VgvF4Vln9wRslycxxaNZ+oPh2L3s0bMvheVZ8HLb3VoczY6CX9e75uHlLQqkXwYk1QtDTr2RkszYKJwnJgz7Zu7uAjHlM3KLXoGlDS9rIkX+S3EwMLcQcxBmU1jTmx8l9TzaHHGOSI245TBrwJ736l6UK2FKWmNb5f9A=
20 |     - secure: d1XtFIq0YtqzbNMT/HKL0k2HUYAbU5GmT+KGNEC0axmXApNTQBxgkz2t5KNJ348JI+uq1O1OLb17i6eE7VmyXJNiWET8/h6RpBgMdbKxgqZaK8YcZdKRJjxHDRtVkPlOv1U1jq/r0MLm91L6srIom+RpJ3XXk92bKiPXa5EugLE=
21 |
--------------------------------------------------------------------------------
/java/NOTICE:
--------------------------------------------------------------------------------
1 | The following libraries are included under the terms of their respective
2 | licenses and can be found at their respective websites:
3 |
4 | commons logging (Apache) - http://commons.apache.org/logging/
5 | guava (Apache) - https://github.com/google/guava/
6 | hadoop (Apache) - http://hadoop.apache.org/
7 | charts4j (MIT) - https://github.com/julienchastang/charts4j/
8 | json (MIT + good/evil clause) - http://www.json.org/license.html
9 |
10 | The following libraries are included under the terms of their respective
11 | licenses, but are only used for testing, and do not need to be uploaded
12 | with your code:
13 |
14 | cglib (Apache) - http://cglib.sourceforge.net/
15 | easymock and classextension (Apache) - http://easymock.org/
16 | junit (CPL) - http://junit.org/
17 | objenesis (Apache) - http://objenesis.org/
18 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/Counter.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 | package com.google.appengine.tools.mapreduce;
3 |
4 | /**
5 | * Counter is an integer variable that is aggregated across multiple shards. Can be used to do
6 | * statistical calculations.
7 | *
8 | */
9 | public interface Counter {
10 |
11 | /**
12 | * @return counter name.
13 | */
14 | String getName();
15 |
16 | /**
17 | * @return counter value. This is the value only in the current shard. It doesn't include
18 | * contributions from other shards, if accessed from within mapper/reducer.
19 | */
20 | long getValue();
21 |
22 | /**
23 | * Increment counter.
24 | *
25 | * @param delta increment delta. Can be negative.
26 | */
27 | void increment(long delta);
28 | }
29 |
--------------------------------------------------------------------------------
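
Putting Counter and the Counters interface above together, worker code would typically use them roughly as follows; the counter names and the helper class are made up for the example, and the Counters instance is assumed to come from the running shard's context.

import com.google.appengine.tools.mapreduce.Counter;
import com.google.appengine.tools.mapreduce.Counters;

// Illustrative only: count processed and skipped items during a shard.
public class CounterUsageSketch {

  static void recordItem(Counters counters, boolean skipped) {
    Counter processed = counters.getCounter("items-processed");  // created with value 0 on first use
    processed.increment(1);
    if (skipped) {
      counters.getCounter("items-skipped").increment(1);
    }
  }
}
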
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/pipeline/DeleteShardedJob.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.shardedjob.pipeline;
2 |
3 | import com.google.appengine.tools.pipeline.Job;
4 |
5 | /**
6 | * A pipeline job to delete persistent data for a sharded job.
7 | */
8 | public class DeleteShardedJob extends AbstractShardedJob {
9 |
10 | private static final long serialVersionUID = -6850669259843382958L;
11 |
12 | public DeleteShardedJob(String jobId, int taskCount) {
13 | super(jobId, taskCount);
14 | }
15 |
16 | @Override
17 | protected Job<?> createShardsJob(int start, int end) {
18 | return new DeleteShardsInfos(getJobId(), start, end);
19 | }
20 |
21 | @Override
22 | public String getJobDisplayName() {
23 | return "DeleteShardedJob: " + getJobId();
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardFailureException.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.shardedjob;
2 |
3 | /**
4 | * An exception thrown when there should be no more attempts to continue processing the shard.
5 | */
6 | public class ShardFailureException extends RuntimeException {
7 |
8 | private static final long serialVersionUID = -1082842736486563617L;
9 |
10 | public ShardFailureException(String errorMessage) {
11 | super(errorMessage);
12 | }
13 |
14 | public ShardFailureException(int shardNumber, Throwable rootCause) {
15 | super("Shard " + shardNumber + " failed.", rootCause);
16 | }
17 |
18 | public ShardFailureException(int shardNumber, String message, Throwable rootCause) {
19 | super("Shard " + shardNumber + " failed: " + message, rootCause);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/JobFailureException.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.shardedjob;
2 |
3 | /**
4 | * An exception thrown when there should be no more attempts to continue processing the job.
5 | */
6 | public class JobFailureException extends RuntimeException {
7 |
8 | private static final long serialVersionUID = -4481817785472768342L;
9 |
10 | public JobFailureException(String errorMessage) {
11 | super(errorMessage);
12 | }
13 |
14 | public JobFailureException(int shardNumber, Throwable rootCause) {
15 | super("Shard " + shardNumber + " failed the job", rootCause);
16 | }
17 |
18 | public JobFailureException(int shardNumber, String message, Throwable rootCause) {
19 | super("Shard " + shardNumber + " failed the job: " + message, rootCause);
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/java/src/test/java/com/google/appengine/tools/mapreduce/testmodels/Person.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.testmodels;
2 |
3 |
4 | public class Person {
5 | public String fullName;
6 | public int age;
7 | public double height;
8 | public float weight;
9 | public String gender;
10 | public PhoneNumber phoneNumber;
11 | /**
12 | * @param fullName
13 | * @param age
14 | * @param height
15 | * @param weight
16 | * @param gender
17 | * @param phoneNumber
18 | */
19 | public Person(String fullName,
20 | int age,
21 | double height,
22 | float weight,
23 | String gender,
24 | PhoneNumber phoneNumber) {
25 | this.fullName = fullName;
26 | this.age = age;
27 | this.height = height;
28 | this.weight = weight;
29 | this.gender = gender;
30 | this.phoneNumber = phoneNumber;
31 | }
32 | }
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/Sharder.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce;
2 |
3 | import java.io.Serializable;
4 | import java.nio.ByteBuffer;
5 |
6 | /**
7 | * Used to determine which shard an item belongs to.
8 | * This is used when emitting data from Map to specify which reduce shard it should go to.
9 | * The only requirement is that the same key always maps to the same shard.
10 | *
11 | */
12 | public interface Sharder extends Serializable {
13 |
14 | /**
15 | * @return the number of shards that items may be assigned to.
16 | */
17 | public int getNumShards();
18 |
19 | /**
20 | * @param key The serialized key. (The ByteBuffer should be unmodified by the implementation)
21 | * @return a number between 0 and numShards-1 inclusive
22 | */
23 | public int getShardForKey(ByteBuffer key);
24 | }
25 |
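For illustration, a custom Sharder only has to be deterministic in the key; a minimal sketch (not part of the library) that shards on the first byte of the serialized key could look like this:

    import java.nio.ByteBuffer;

    public class FirstByteSharder implements Sharder {
      private static final long serialVersionUID = 1L;
      private final int numShards;

      public FirstByteSharder(int numShards) {
        this.numShards = numShards;
      }

      @Override
      public int getNumShards() {
        return numShards;
      }

      @Override
      public int getShardForKey(ByteBuffer key) {
        // Read relative to position() without advancing it, so the buffer stays unmodified.
        int firstByte = key.hasRemaining() ? (key.get(key.position()) & 0xFF) : 0;
        return firstByte % numShards;
      }
    }

In practice the library's HashingSharder (shown later in this listing) distributes keys far more evenly.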
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/inputs/DatastoreInputReader.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.inputs;
4 |
5 | import com.google.appengine.api.datastore.Entity;
6 | import com.google.appengine.api.datastore.Query;
7 | import com.google.common.base.Functions;
8 |
9 | /**
10 | * An InputReader for Datastore entities.
11 | */
12 | public class DatastoreInputReader extends BaseDatastoreInputReader<Entity> {
13 |
14 | private static final long serialVersionUID = -2164845668646485549L;
15 | private static final long AVERAGE_ENTITY_SIZE = 100 * 1024;
16 |
17 | public DatastoreInputReader(Query query) {
18 | super(query, Functions.identity());
19 | }
20 |
21 | @Override
22 | protected long getAvgElementSize() {
23 | return AVERAGE_ENTITY_SIZE;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/BaseMapper.java:
--------------------------------------------------------------------------------
1 | // Copyright 2014 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 | /**
6 | * Abstract class for Map function.
7 | *
8 | * @param <I> type of input received
9 | * @param <O> type of intermediate values produced
10 | */
11 | abstract class BaseMapper<I, O, C extends WorkerContext<O>> extends Worker<C> {
12 |
13 | private static final long serialVersionUID = -6551234158528563026L;
14 |
15 | /**
16 | * Processes a single input value, emitting output through the context returned by
17 | * {@link Worker#getContext} or {@link #emit}.
18 | */
19 | public abstract void map(I value);
20 |
21 | /**
22 | * Syntactic sugar for {@code getContext().emit(value)}
23 | */
24 | protected void emit(O value) {
25 | getContext().emit(value);
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | AppEngine Mapreduce library
2 | ===========================
3 |
4 | [](https://travis-ci.org/GoogleCloudPlatform/appengine-mapreduce)
5 |
6 | Official site: https://github.com/GoogleCloudPlatform/appengine-mapreduce
7 |
8 | Check the site for up to date status, latest version, getting started & user
9 | guides and other documentation.
10 |
11 | Archive contents:
12 | - python : python version of the library resides here
13 |   - build.sh : use this to run tests for python library, build and run demo app
14 |   - src : python source code for mapreduce library
15 |   - tests : tests for mapreduce library
16 |   - demo : a demo application that uses the map reduce.
17 | - java : java version of the library
18 |   - build.xml : ant build file
19 |
--------------------------------------------------------------------------------
/java/example/src/com/google/appengine/demos/mapreduce/entitycount/CountReducer.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.demos.mapreduce.entitycount;
2 |
3 | import com.google.appengine.tools.mapreduce.KeyValue;
4 | import com.google.appengine.tools.mapreduce.Reducer;
5 | import com.google.appengine.tools.mapreduce.ReducerInput;
6 |
7 | /**
8 | * Sums a list of numbers. The key identifies the counter, the output value is the sum of all input
9 | * values for the given key.
10 | *
11 | * @author ohler@google.com (Christian Ohler)
12 | */
13 | class CountReducer extends Reducer<String, Long, KeyValue<String, Long>> {
14 |
15 | private static final long serialVersionUID = 1316637485625852869L;
16 |
17 | @Override
18 | public void reduce(String key, ReducerInput<Long> values) {
19 | long total = 0;
20 | while (values.hasNext()) {
21 | total += values.next();
22 | }
23 | emit(KeyValue.of(key, total));
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/python/src/mapreduce/lib/input_reader/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2015 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Various libraries regarding input readers and input in general."""
16 | from mapreduce.lib.input_reader._gcs import GCSInputReader
17 | from mapreduce.lib.input_reader._gcs import GCSRecordInputReader
18 | from mapreduce.lib.input_reader._gcs import PathFilter
19 |
20 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/pipeline/ResultAndStatus.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.pipeline;
2 |
3 | import com.google.appengine.tools.mapreduce.MapReduceResult;
4 | import com.google.appengine.tools.mapreduce.impl.shardedjob.Status;
5 |
6 | import java.io.Serializable;
7 |
8 | /**
9 | * A holder for MR result and its status.
10 | *
11 | * @param <R> the type of {@code MapReduceResult} content.
12 | */
13 | public final class ResultAndStatus<R> implements Serializable {
14 |
15 | private static final long serialVersionUID = 7867829838406777714L;
16 |
17 | private final MapReduceResult<R> result;
18 | private final Status status;
19 |
20 | public ResultAndStatus(MapReduceResult<R> result, Status status) {
21 | this.result = result;
22 | this.status = status;
23 | }
24 |
25 | public MapReduceResult<R> getResult() {
26 | return result;
27 | }
28 |
29 | public Status getStatus() {
30 | return status;
31 | }
32 | }
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/pipeline/FinalizeShardedJob.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.shardedjob.pipeline;
2 |
3 | import com.google.appengine.tools.mapreduce.impl.shardedjob.Status;
4 | import com.google.appengine.tools.pipeline.Job;
5 |
6 | /**
7 | * A pipeline job for finalizing the job and cleaning up unnecessary state.
8 | */
9 | public class FinalizeShardedJob extends AbstractShardedJob {
10 |
11 | private static final long serialVersionUID = -6850669259843382958L;
12 | private final Status status;
13 |
14 | public FinalizeShardedJob(String jobId, int taskCount, Status status) {
15 | super(jobId, taskCount);
16 | this.status = status;
17 | }
18 |
19 | @Override
20 | protected Job<?> createShardsJob(int start, int end) {
21 | return new FinalizeShardsInfos(getJobId(), status, start, end);
22 | }
23 |
24 | @Override
25 | public String getJobDisplayName() {
26 | return "FinalizeShardedJob: " + getJobId();
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/java/example/src/com/google/appengine/demos/mapreduce/entitycount/DeleteEntityMapper.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.demos.mapreduce.entitycount;
2 |
3 | import com.google.appengine.api.datastore.Key;
4 | import com.google.appengine.tools.mapreduce.DatastoreMutationPool;
5 | import com.google.appengine.tools.mapreduce.MapOnlyMapper;
6 |
7 | /**
8 | * Deletes datastore entities.
9 | */
10 | public class DeleteEntityMapper extends MapOnlyMapper<Key, Void> {
11 |
12 | private static final long serialVersionUID = -6485226450501339416L;
13 |
14 | // [START datastoreMutationPool]
15 | private transient DatastoreMutationPool batcher;
16 | // [END datastoreMutationPool]
17 |
18 | // [START begin_and_endSlice]
19 | @Override
20 | public void beginSlice() {
21 | batcher = DatastoreMutationPool.create();
22 | }
23 |
24 | @Override
25 | public void endSlice() {
26 | batcher.flush();
27 | }
28 | // [END begin_and_endSlice]
29 |
30 | @Override
31 | public void map(Key key) {
32 | batcher.delete(key);
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobController.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.impl.shardedjob;
4 |
5 | import java.io.Serializable;
6 | import java.util.Iterator;
7 |
8 | /**
9 | * Aggregates results from {@link IncrementalTask}s and receives notification
10 | * when the job completes.
11 | *
12 | * @author ohler@google.com (Christian Ohler)
13 | *
14 | * @param <T> the type of the incremental task
15 | */
16 | public abstract class ShardedJobController<T extends IncrementalTask> implements Serializable {
17 |
18 | private static final long serialVersionUID = 6209078163062384156L;
19 |
20 | /**
21 | * Called when the sharded job has completed successfully.
22 | */
23 | public abstract void completed(Iterator<T> completedTasks);
24 |
25 | /**
26 | * Called when the sharded job has failed to complete successfully.
27 | * @param status
28 | */
29 | public abstract void failed(Status status);
30 | }
31 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/MapReduceJobException.java:
--------------------------------------------------------------------------------
1 | // Copyright 2014 Google Inc. All Rights Reserved.
2 | package com.google.appengine.tools.mapreduce;
3 |
4 | import com.google.appengine.tools.mapreduce.impl.shardedjob.Status;
5 |
6 | /**
7 | * An exception that is thrown upon MapReduceJob failure.
8 | */
9 | public final class MapReduceJobException extends RuntimeException {
10 |
11 | private static final long serialVersionUID = 2875093254119004898L;
12 | private final String stage;
13 |
14 | public MapReduceJobException(String stage, Status status) {
15 | super("Stage " + stage + " was not completed successfuly (status=" + status.getStatusCode()
16 | + ", message=" + status.getException() + ")", status.getException());
17 | this.stage = stage;
18 | }
19 |
20 | /**
21 | * Returns a string representing the MapReduce stage that failed.
22 | * The exception propagated from the stage can be fetched by {@link #getCause()}.
23 | */
24 | public String getFailedStage() {
25 | return stage;
26 | }
27 | }
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/reducers/ValueProjectionReducer.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.reducers;
4 |
5 | import com.google.appengine.tools.mapreduce.Reducer;
6 | import com.google.appengine.tools.mapreduce.ReducerInput;
7 |
8 | /**
9 | * Reducer that emits the values that occur in its input, discarding the keys.
10 | *
11 | * @author ohler@google.com (Christian Ohler)
12 | *
13 | * @param <K> type of keys (discarded)
14 | * @param <V> type of values
15 | */
16 | public class ValueProjectionReducer<K, V> extends Reducer<K, V, V> {
17 |
18 | private static final long serialVersionUID = 990027274731447358L;
19 |
20 | public static <K, V> ValueProjectionReducer<K, V> create() {
21 | return new ValueProjectionReducer<>();
22 | }
23 |
24 | public ValueProjectionReducer() {
25 | }
26 |
27 | @Override
28 | public void reduce(K key, ReducerInput<V> values) {
29 | while (values.hasNext()) {
30 | emit(values.next());
31 | }
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/ShardContext.java:
--------------------------------------------------------------------------------
1 | // Copyright 2014 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 |
6 | /**
7 | * Context for each shard.
8 | */
9 | public interface ShardContext extends Context {
10 |
11 | /**
12 | * Returns the total number of shards.
13 | */
14 | int getShardCount();
15 |
16 | /**
17 | * Returns the number of this mapper or reducer shard (zero-based).
18 | */
19 | int getShardNumber();
20 |
21 | /**
22 | * Returns a {@link Counters} object for doing simple aggregate calculations.
23 | */
24 | Counters getCounters();
25 |
26 | /**
27 | * Returns the {@link Counter} with the given name.
28 | */
29 | Counter getCounter(String name);
30 |
31 | /**
32 | * Increments the {@link Counter} with the given name by {@code delta}.
33 | */
34 | void incrementCounter(String name, long delta);
35 |
36 | /**
37 | * Increments the {@link Counter} with the given name by 1.
38 | */
39 | void incrementCounter(String name);
40 | }
41 |
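As a usage sketch (hypothetical mapper and counter names; this assumes the worker's context exposes these ShardContext methods, as the concrete context interfaces do):

    public class CountingMapper extends MapOnlyMapper<String, String> {
      private static final long serialVersionUID = 1L;

      @Override
      public void map(String value) {
        getContext().incrementCounter("records-seen");                // increment by 1
        getContext().incrementCounter("bytes-seen", value.length());  // increment by a delta
        emit(value);
      }
    }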
--------------------------------------------------------------------------------
/python/src/mapreduce/operation/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2010 Google Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """Operations which can be yielded from mappers.
18 |
19 | An Operation is a callable that takes context.Context as a parameter.
20 | Operations are called during mapper execution immediately
21 | upon being received from the handler function.
22 | """
23 |
24 |
25 |
26 | # These are all relative imports.
27 | import db
28 | import counters
29 | from base import Operation
30 |
31 | __all__ = ['db', 'counters', 'Operation']
32 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/reducers/KeyProjectionReducer.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.reducers;
4 |
5 | import com.google.appengine.tools.mapreduce.Reducer;
6 | import com.google.appengine.tools.mapreduce.ReducerInput;
7 | import com.google.common.base.Preconditions;
8 |
9 | /**
10 | * Reducer that emits the keys that occur in its input, discarding the values.
11 | *
12 | * @author ohler@google.com (Christian Ohler)
13 | *
14 | * @param <K> type of keys
15 | * @param <V> type of values (discarded)
16 | */
17 | public class KeyProjectionReducer<K, V> extends Reducer<K, V, K> {
18 |
19 | private static final long serialVersionUID = 466599637876532403L;
20 |
21 | public static <K, V> KeyProjectionReducer<K, V> create() {
22 | return new KeyProjectionReducer<>();
23 | }
24 |
25 | public KeyProjectionReducer() {
26 | }
27 |
28 | @Override
29 | public void reduce(K key, ReducerInput<V> values) {
30 | Preconditions.checkState(values.hasNext(), "%s: No values: %s", this, key);
31 | emit(key);
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/python/src/mapreduce/operation/base.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2010 Google Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """Base operation class."""
18 |
19 |
20 |
21 | __all__ = ['Operation']
22 |
23 |
24 | class Operation(object):
25 | """Base class for all mapper operations.
26 |
27 | All operations should implement __call__(self, ctx) function, which will be
28 | called upon operation yield.
29 | """
30 |
31 | def __call__(self, ctx):
32 | raise NotImplementedError("__call__() not implemented in %s" %
33 | self.__class__)
34 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/bigqueryjobs/BigQueryLoadJobReference.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.bigqueryjobs;
2 |
3 | import com.google.api.services.bigquery.model.JobReference;
4 | import com.google.appengine.tools.mapreduce.Marshallers;
5 | import com.google.appengine.tools.mapreduce.impl.util.SerializableValue;
6 |
7 | import java.io.Serializable;
8 |
9 | /**
10 | * Result of the bigquery load files pipeline job.
11 | */
12 | public class BigQueryLoadJobReference implements Serializable {
13 |
14 | private static final long serialVersionUID = -5045977572520245900L;
15 | private final String status;
16 | private final SerializableValue<JobReference> jobReference;
17 |
18 | public BigQueryLoadJobReference(String status, JobReference jobReference) {
19 | this.status = status;
20 | this.jobReference = SerializableValue.of(
21 | Marshallers.getGenericJsonMarshaller(JobReference.class), jobReference);
22 | }
23 |
24 | public String getStatus() {
25 | return status;
26 | }
27 |
28 | public JobReference getJobReference() {
29 | return jobReference.getValue();
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobHandler.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.shardedjob;
2 |
3 | /**
4 | * As part of its operation, the {@code ShardedJobService} will enqueue task
5 | * queue tasks that send requests to the URLs specified in
6 | * {@link ShardedJobSettings}. It is the user's responsibility to arrange
7 | * for these requests to be passed back into {@link #completeShard}
8 | * and {@link #runTask}.
9 | */
10 | public interface ShardedJobHandler {
11 |
12 | public static final String JOB_ID_PARAM = "job";
13 | public static final String TASK_ID_PARAM = "task";
14 | public static final String SEQUENCE_NUMBER_PARAM = "seq";
15 |
16 | /**
17 | * Is invoked by the servlet that handles
18 | * {@link ShardedJobSettings#getControllerPath} when a shard has completed.
19 | */
20 | void completeShard(final String jobId, final String taskId);
21 |
22 | /**
23 | * Is invoked by the servlet that handles
24 | * {@link ShardedJobSettings#getWorkerPath} to run a task.
25 | */
26 | void runTask(final String jobId, final String taskId, final int sequenceNumber);
27 | }
28 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/BigQueryDataField.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce;
2 |
3 | import java.lang.annotation.ElementType;
4 | import java.lang.annotation.Retention;
5 | import java.lang.annotation.RetentionPolicy;
6 | import java.lang.annotation.Target;
7 |
8 | /**
9 | * Annotation to provide additional information related to BigQuery data fields. @see
10 | * TableFieldSchema.
11 | */
12 | @Retention(RetentionPolicy.RUNTIME)
13 | @Target(ElementType.FIELD)
14 | public @interface BigQueryDataField {
15 | /**
16 | * Description of the BigQuery field
17 | */
18 | String description() default "";
19 |
20 | /**
21 | * The name of the bigquery column. By default it is the same as the name of the field in the class.
22 | * Use this annotation to provide a different name.
23 | */
24 | String name() default "";
25 |
26 | /**
27 | * Mode of a bigquery table column determines whether it is repeated, required or nullable. A
28 | * required column must not be left null or empty when loading data. By default it is nullable.
29 | */
30 | BigQueryFieldMode mode() default BigQueryFieldMode.NULLABLE;
31 | }
32 |
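A hypothetical annotated POJO (field names are illustrative; REQUIRED is assumed to be one of the BigQueryFieldMode values alongside NULLABLE and REPEATED):

    public class Visit {
      @BigQueryDataField(name = "visitor_id", mode = BigQueryFieldMode.REQUIRED,
          description = "Opaque id of the visitor")
      private String visitorId;

      @BigQueryDataField(description = "Time of the visit in epoch millis")
      private long timestamp;
    }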
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/Mapper.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 | /**
6 | * Map function for MapReduce computations. A map function processes input
7 | * values one at a time and generates zero or more output key-value pairs for
8 | * each. It emits the generated pairs to the {@link Reducer} through the
9 | * {@link MapperContext}.
10 | *
11 | * This class is really an interface that might be evolving. In order to avoid breaking
12 | * users when we change the interface, we made it an abstract class.
13 | *
14 | *
15 | * @param <I> type of input received
16 | * @param <K> type of intermediate keys produced
17 | * @param <V> type of intermediate values produced
18 | */
19 | public abstract class Mapper<I, K, V> extends BaseMapper<I, KeyValue<K, V>, MapperContext<K, V>> {
20 |
21 | private static final long serialVersionUID = 1966174340710715049L;
22 |
23 | /**
24 | * Syntactic sugar for {@code emit(KeyValue.of(key, value))}
25 | */
26 | protected void emit(K key, V value) {
27 | getContext().emit(key, value);
28 | }
29 | }
30 |
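A minimal concrete Mapper, sketched here for illustration only (not part of the library): it tokenizes a line of text and emits (word, 1) pairs for a downstream Reducer to sum.

    public class WordCountMapper extends Mapper<String, String, Long> {
      private static final long serialVersionUID = 1L;

      @Override
      public void map(String line) {
        for (String word : line.split("\\s+")) {
          if (!word.isEmpty()) {
            emit(word, 1L);  // uses the emit(key, value) sugar defined above
          }
        }
      }
    }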
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/inputs/DatastoreKeyInputReader.java:
--------------------------------------------------------------------------------
1 | // Copyright 2014 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.inputs;
4 |
5 | import com.google.appengine.api.datastore.Entity;
6 | import com.google.appengine.api.datastore.Key;
7 | import com.google.appengine.api.datastore.Query;
8 | import com.google.common.base.Function;
9 | import com.google.common.base.Preconditions;
10 |
11 | /**
12 | * An InputReader for Datastore entity keys.
13 | */
14 | public class DatastoreKeyInputReader extends BaseDatastoreInputReader<Key> {
15 |
16 | private static final long serialVersionUID = 846982034548442467L;
17 | private static final long AVERAGE_KEY_SIZE = 256;
18 |
19 | private enum EntityToKeyFunction implements Function<Entity, Key> {
20 | INSTANCE;
21 |
22 | @Override
23 | public Key apply(Entity entity) {
24 | return entity.getKey();
25 | }
26 | }
27 |
28 | public DatastoreKeyInputReader(Query query) {
29 | super(query, EntityToKeyFunction.INSTANCE);
30 | Preconditions.checkArgument(query.isKeysOnly());
31 | }
32 |
33 | @Override
34 | protected long getAvgElementSize() {
35 | return AVERAGE_KEY_SIZE;
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/InProcessShardedJobRunner.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.impl.shardedjob;
4 |
5 | import com.google.common.base.Preconditions;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | /**
11 | * Runs a sharded job in the current process. Only for very small jobs. Easier
12 | * to debug than a parallel execution.
13 | *
14 | * @author ohler@google.com (Christian Ohler)
15 | */
16 | public class InProcessShardedJobRunner {
17 |
18 | private InProcessShardedJobRunner() {
19 | }
20 |
21 | /**
22 | * Runs the given job and returns its result.
23 | */
24 | public static <T extends IncrementalTask> void runJob(
25 | List<T> initialTasks, ShardedJobController<T> controller) {
26 | List<T> results = new ArrayList<>();
27 | for (T task : initialTasks) {
28 | Preconditions.checkNotNull(task, "Null initial task: %s", initialTasks);
29 | task.prepare();
30 | do {
31 | task.run();
32 | } while (!task.isDone());
33 | task.cleanup();
34 | results.add(task);
35 | }
36 | controller.completed(results.iterator());
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/mappers/KeyProjectionMapper.java:
--------------------------------------------------------------------------------
1 | // Copyright 2014 Google Inc. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.appengine.tools.mapreduce.mappers;
16 |
17 | import com.google.appengine.tools.mapreduce.Mapper;
18 |
19 | /**
20 | * A pass through mapper that passes the input to the output key.
21 | *
22 | * @param <T> type of the input, which is passed on as the key
23 | */
24 | public class KeyProjectionMapper<T> extends Mapper<T, T, Void> {
25 |
26 | private static final long serialVersionUID = -3998292521173820259L;
27 |
28 | @Override
29 | public void map(T value) {
30 | emit(value, null);
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/python/src/mapreduce/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2010 Google Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import logging
18 | import os
19 |
20 | version = os.environ.get('CURRENT_VERSION_ID', '').split('.')[0]
21 |
22 | if (__name__ == 'google.appengine.ext.mapreduce'
23 | and version != 'ah-builtin-python-bundle'):
24 | msg = ('You should not use the mapreduce library that is bundled with the'
25 | ' SDK. You can use the PyPi package at'
26 | ' https://pypi.python.org/pypi/GoogleAppEngineMapReduce or use the '
27 | 'source at https://github.com/GoogleCloudPlatform/appengine-mapreduce '
28 | 'instead.')
29 | logging.warn(msg)
30 |
31 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/util/BigQueryDataTypeUtil.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.util;
2 |
3 | import com.google.common.collect.ImmutableMap;
4 |
5 | import java.util.Calendar;
6 | import java.util.Date;
7 | import java.util.Map;
8 |
9 | /**
10 | * Utility class for converting java types to BigQuery data types
11 | */
12 | public final class BigQueryDataTypeUtil {
13 |
14 | private static final Map<Class<?>, String> javaTypeToBigQueryType =
15 | new ImmutableMap.Builder<Class<?>, String>()
16 | .put(Integer.class, "integer")
17 | .put(Float.class, "float")
18 | .put(Boolean.class, "boolean")
19 | .put(String.class, "string")
20 | .put(Date.class, "timestamp")
21 | .put(Calendar.class, "timestamp")
22 | .build();
23 |
24 | /**
25 | * @param parameterType java primitive, wrapper or String types
26 | * @return BigQuery data type
27 | * */
28 | public static String getBigQueryType(Class<?> parameterType) {
29 | return javaTypeToBigQueryType.get(parameterType);
30 | }
31 |
32 | public static boolean isSimpleBigQueryType(Class<?> parameterType) {
33 | return javaTypeToBigQueryType.containsKey(parameterType);
34 | }
35 | }
36 |
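Example lookups (illustrative), based on the mapping defined above:

    String intType = BigQueryDataTypeUtil.getBigQueryType(Integer.class);     // "integer"
    String dateType = BigQueryDataTypeUtil.getBigQueryType(Date.class);       // "timestamp"
    boolean simple = BigQueryDataTypeUtil.isSimpleBigQueryType(String.class); // true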
--------------------------------------------------------------------------------
/java/example/shuffler/WEB-INF/appengine-web.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | mapreduce-example
4 | shuffler
5 | one
6 | true
7 |
8 |
9 | F4
10 |
11 | 1000ms
12 |
13 |
19 | 4
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/java/example/mapreduce/WEB-INF/appengine-web.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | mapreduce-example
4 | mapreduce
5 | one
6 | true
7 |
8 |
9 | F4
10 |
11 | 1000ms
12 |
13 |
19 | 4
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/python/src/mapreduce/api/map_job/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2015 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Map job package."""
16 |
17 | # All the public API should be imported here.
18 | # 1. Seasoned Python user should simply import this package.
19 | # 2. Other users may import individual files so filenames should still have
20 | # "map_job" prefix. But adding the prefix won't mandate the first type
21 | # of user to type more.
22 | # 3. Class names should not have "map_job" prefix.
23 | from .input_reader import InputReader
24 | from .map_job_config import JobConfig
25 | from .map_job_control import Job
26 | from .mapper import Mapper
27 | from .output_writer import OutputWriter
28 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/util/SplitUtil.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.util;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.List;
6 | import java.util.Random;
7 |
8 | /**
9 | * Convenience methods related to splitting.
10 | */
11 | public class SplitUtil {
12 |
13 | public static <T> List<List<T>> split(List<T> input, int numSplits, boolean randomize) {
14 | ArrayList<T> toSplit = new ArrayList<>(input);
15 | if (randomize) {
16 | Collections.shuffle(toSplit, new Random(0L)); // Fixing seed for determinism
17 | }
18 | int minItemsPerShard = input.size() / numSplits;
19 | int remainder = input.size() % numSplits;
20 | ArrayList<List<T>> result = new ArrayList<>();
21 | int posInList = 0;
22 | for (int shard = 0; shard < numSplits; shard++) {
23 | int numItems = shard < remainder ? minItemsPerShard + 1 : minItemsPerShard;
24 | if (numItems > 0) {
25 | result.add(new ArrayList<>(toSplit.subList(posInList, posInList + numItems)));
26 | posInList += numItems;
27 | }
28 | }
29 | if (posInList != toSplit.size()) {
30 | throw new IllegalStateException(); // Impossible
31 | }
32 | return result;
33 | }
34 | }
35 |
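For example (illustrative values), splitting 10 items across 4 shards yields sublists of sizes 3, 3, 2 and 2; because the shuffle uses a fixed seed, the result is deterministic for a given input:

    List<Integer> items = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
      items.add(i);
    }
    List<List<Integer>> shards = SplitUtil.split(items, 4, true);
    // shards.get(0).size() == 3, ..., shards.get(3).size() == 2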
--------------------------------------------------------------------------------
/java/example/src/com/google/appengine/demos/mapreduce/randomcollisions/CollisionFindingReducer.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.demos.mapreduce.randomcollisions;
2 |
3 | import com.google.appengine.tools.mapreduce.Reducer;
4 | import com.google.appengine.tools.mapreduce.ReducerInput;
5 | import com.google.common.collect.Lists;
6 |
7 | import java.util.ArrayList;
8 | import java.util.logging.Logger;
9 |
10 | /**
11 | * Counts the number of seeds that generated the same value. If there are multiple they will be
12 | * logged and emitted to the output as a list.
13 | */
14 | public final class CollisionFindingReducer extends Reducer<Integer, Integer, ArrayList<Integer>> {
15 |
16 | private static final long serialVersionUID = 188147370819557065L;
17 | private static final Logger LOG = Logger.getLogger(CollisionFindingReducer.class.getName());
18 | @Override
19 | // [START reduce_example]
20 | public void reduce(Integer valueGenerated, ReducerInput<Integer> seeds) {
21 | ArrayList<Integer> collidingSeeds = Lists.newArrayList(seeds);
22 | if (collidingSeeds.size() > 1) {
23 | LOG.info("Found a collision! The seeds: " + collidingSeeds
24 | + " all generaged the value " + valueGenerated);
25 | emit(collidingSeeds);
26 | }
27 | }
28 | // [END reduce_example]
29 | }
30 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/mappers/IdentityMapper.java:
--------------------------------------------------------------------------------
1 | // Copyright 2014 Google Inc. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | package com.google.appengine.tools.mapreduce.mappers;
16 |
17 | import com.google.appengine.tools.mapreduce.KeyValue;
18 | import com.google.appengine.tools.mapreduce.Mapper;
19 |
20 | /**
21 | * A mapper that passes an incoming KeyValue to its output.
22 | *
23 | * @param <K> the type of the key
24 | * @param <V> the type of the value
25 | */
26 | public class IdentityMapper<K, V> extends Mapper<KeyValue<K, V>, K, V> {
27 |
28 | private static final long serialVersionUID = -8243493999040989299L;
29 |
30 | @Override
31 | public void map(KeyValue<K, V> input) {
32 | emit(input.getKey(), input.getValue());
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobState.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.impl.shardedjob;
4 |
5 |
6 |
7 | /**
8 | * Information about execution and progress of a sharded job.
9 | *
10 | * Undefined behavior results if any of the values (such as the return value of
11 | * getSettings()) are mutated.
12 | *
13 | * @author ohler@google.com (Christian Ohler)
14 | */
15 | public interface ShardedJobState {
16 |
17 | /**
18 | * Returns the ID of this job.
19 | */
20 | String getJobId();
21 |
22 | /**
23 | * Returns the execution settings of this job.
24 | */
25 | ShardedJobSettings getSettings();
26 |
27 | /**
28 | * Returns the total number of tasks (not including follow-up tasks) that this
29 | * job consists of.
30 | */
31 | int getTotalTaskCount();
32 |
33 | /**
34 | * Returns the number of tasks or follow-up tasks that are currently active.
35 | */
36 | int getActiveTaskCount();
37 |
38 | /**
39 | * Returns the time this job was started.
40 | */
41 | long getStartTimeMillis();
42 |
43 | /**
44 | * Returns the time this job's state was last updated.
45 | */
46 | long getMostRecentUpdateTimeMillis();
47 |
48 | /**
49 | * Returns whether this job is running, finished, etc.
50 | */
51 | Status getStatus();
52 | }
53 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/HashingSharder.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl;
2 |
3 | import static com.google.common.base.Preconditions.checkArgument;
4 |
5 | import com.google.appengine.tools.mapreduce.Sharder;
6 | import com.google.appengine.tools.mapreduce.impl.util.SerializationUtil;
7 | import com.google.common.hash.HashFunction;
8 | import com.google.common.hash.Hashing;
9 |
10 | import java.nio.ByteBuffer;
11 |
12 | /**
13 | * Splits input by hashing the key.
14 | *
15 | */
16 | public class HashingSharder implements Sharder {
17 |
18 | private static final long serialVersionUID = 7967187256546710108L;
19 | private static final HashFunction HASH = Hashing.murmur3_32();
20 | private final int numShards;
21 |
22 | public HashingSharder(int numShards) {
23 | this.numShards = numShards;
24 | checkArgument(numShards > 0);
25 | }
26 |
27 | @Override
28 | public int getNumShards() {
29 | return numShards;
30 | }
31 |
32 | @Override
33 | public int getShardForKey(ByteBuffer key) {
34 | byte[] bytes = SerializationUtil.getBytes(key);
35 | int hash = (HASH.hashBytes(bytes).asInt()) & Integer.MAX_VALUE; // Keeping positive
36 | // Dividing integer range rather than using modulo so as to avoid rewriting entries if they are
37 | // re-hashed.
38 | return hash / (Integer.MAX_VALUE / numShards + 1);
39 | }
40 | }
41 |
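Usage is the same as for any Sharder; an illustrative snippet (key and shard count are arbitrary):

    Sharder sharder = new HashingSharder(4);
    ByteBuffer key = ByteBuffer.wrap("user-42".getBytes(StandardCharsets.UTF_8));
    int shard = sharder.getShardForKey(key);  // deterministic value in [0, 3]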
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/outputs/GoogleCloudStorageLevelDbOutputWriter.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.outputs;
2 |
3 | import static com.google.appengine.tools.mapreduce.impl.MapReduceConstants.GCS_IO_BLOCK_SIZE;
4 | import static com.google.appengine.tools.mapreduce.impl.util.LevelDbConstants.BLOCK_SIZE;
5 |
6 | import com.google.appengine.tools.mapreduce.OutputWriter;
7 |
8 | import java.io.IOException;
9 | import java.nio.ByteBuffer;
10 |
11 | /**
12 | * A decorator for LevelDbOutputWriter that delegates to {@link GoogleCloudStorageFileOutputWriter}
13 | * and pads blocks to GCS write boundaries on end of slice.
14 | * This is needed because GCS requires data to be written in 256kb increments while LevelDb
15 | * uses 32kb blocks, so this class pads the output with empty blocks up to the GCS boundary.
16 | *
17 | */
18 | public class GoogleCloudStorageLevelDbOutputWriter extends LevelDbOutputWriter {
19 | private static final long serialVersionUID = 6507809614070157553L;
20 |
21 | public GoogleCloudStorageLevelDbOutputWriter(OutputWriter<ByteBuffer> delegate) {
22 | super(delegate);
23 | }
24 |
25 | @Override
26 | public void endSlice() throws IOException {
27 | padAndWriteBlock(false);
28 | while ((getNumBlocksWritten() * BLOCK_SIZE) % GCS_IO_BLOCK_SIZE != 0) {
29 | padAndWriteBlock(true);
30 | }
31 | getDelegate().endSlice();
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/java/example/src/com/google/appengine/demos/mapreduce/bigqueryload/RandomBigQueryDataCreator.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.demos.mapreduce.bigqueryload;
2 |
3 | import com.google.appengine.tools.mapreduce.MapOnlyMapper;
4 | import com.google.common.collect.Lists;
5 |
6 | import java.util.Date;
7 | import java.util.Random;
8 |
9 | public class RandomBigQueryDataCreator extends MapOnlyMapper<Long, SampleTable> {
10 |
11 | private static final long serialVersionUID = -4247519870584497230L;
12 | private static Random r;
13 |
14 | @Override
15 | public void map(Long value) {
16 | SampleTable toWrite = getSampleTableData(value);
17 |
18 | emit(toWrite);
19 | }
20 |
21 | public static SampleTable getSampleTableData(Long value) {
22 | r = new Random(value);
23 | SampleTable toWrite = new SampleTable(value,
24 | randomInt(),
25 | String.format("colvalue %d", randomInt()),
26 | Lists.newArrayList(String.format("column value %d", randomInt()),
27 | String.format("colvalue %d", randomInt())),
28 | new String[] {String.format("column value %d", randomInt()),
29 | String.format("column value %d", randomInt())},
30 | new SampleNestedRecord(randomInt(), String.format("column value %d", randomInt())),
31 | new Date(randomInt()));
32 | return toWrite;
33 | }
34 |
35 | private static int randomInt() {
36 | return r.nextInt();
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/shardedjob/ShardedJobServiceImpl.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.impl.shardedjob;
4 |
5 | import java.util.Iterator;
6 | import java.util.List;
7 |
8 | /**
9 | * Implementation of {@link ShardedJobService}.
10 | *
11 | * @author ohler@google.com (Christian Ohler)
12 | */
13 | class ShardedJobServiceImpl implements ShardedJobService {
14 |
15 | @Override
16 | public <T extends IncrementalTask> void startJob(
17 | String jobId,
18 | List<? extends T> initialTasks,
19 | ShardedJobController<T> controller,
20 | ShardedJobSettings settings) {
21 | new ShardedJobRunner<T>().startJob(jobId, initialTasks, controller, settings);
22 | }
23 |
24 | @Override
25 | public ShardedJobState getJobState(String jobId) {
26 | return new ShardedJobRunner<>().getJobState(jobId);
27 | }
28 |
29 | @Override
30 | public Iterator<IncrementalTaskState<IncrementalTask>> lookupTasks(ShardedJobState state) {
31 | return new ShardedJobRunner<>().lookupTasks(state.getJobId(), state.getTotalTaskCount(), true);
32 | }
33 |
34 | @Override
35 | public void abortJob(String jobId) {
36 | new ShardedJobRunner<>().abortJob(jobId);
37 | }
38 |
39 | @Override
40 | public boolean cleanupJob(String jobId) {
41 | return new ShardedJobRunner<>().cleanupJob(jobId);
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/outputs/MarshallingOutputWriter.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.outputs;
2 |
3 | import static com.google.common.base.Preconditions.checkNotNull;
4 |
5 | import com.google.appengine.tools.mapreduce.Marshaller;
6 | import com.google.appengine.tools.mapreduce.OutputWriter;
7 |
8 | import java.io.IOException;
9 | import java.nio.ByteBuffer;
10 |
11 | /**
12 | * An {@link OutputWriter} that marshalls records.
13 | *
14 | * @param <O> the type of OutputWriter that this will become. (The type of the values that will be
15 | * written to this class)
16 | */
17 | public class MarshallingOutputWriter<O> extends ForwardingOutputWriter<O> {
18 |
19 | private static final long serialVersionUID = -1441650908652534613L;
20 |
21 | private final Marshaller<? super O> marshaller;
22 | private final OutputWriter<ByteBuffer> writer;
23 |
24 | public MarshallingOutputWriter(OutputWriter<ByteBuffer> writer,
25 | Marshaller<? super O> marshaller) {
26 | this.writer = checkNotNull(writer, "No writer");
27 | this.marshaller = checkNotNull(marshaller, "No marshaller");
28 | }
29 |
30 | @Override
31 | protected OutputWriter<ByteBuffer> getDelegate() {
32 | return writer;
33 | }
34 |
35 | @Override
36 | public void write(O value) throws IOException {
37 | ByteBuffer bytes = marshaller.toBytes(value);
38 | writer.write(bytes);
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/ReducerInputs.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.impl;
4 |
5 | import com.google.appengine.tools.mapreduce.ReducerInput;
6 |
7 | import java.util.Iterator;
8 |
9 | /**
10 | * Utilities related to {@link ReducerInput}.
11 | *
12 | * @author ohler@google.com (Christian Ohler)
13 | */
14 | public class ReducerInputs {
15 |
16 | private ReducerInputs() {}
17 |
18 | private static class IteratorReducerInput<V> extends ReducerInput<V> {
19 |
20 | private final Iterator<V> i;
21 |
22 | public IteratorReducerInput(Iterator<V> i) {
23 | this.i = i;
24 | }
25 |
26 | @Override
27 | public boolean hasNext() {
28 | return i.hasNext();
29 | }
30 |
31 | @Override
32 | public V next() {
33 | return i.next();
34 | }
35 |
36 | @Override
37 | public String toString() {
38 | return "ReducerInputs.fromIterator(" + i + ")";
39 | }
40 | }
41 |
42 | public static <V> ReducerInput<V> fromIterator(Iterator<V> i) {
43 | return new IteratorReducerInput<>(i);
44 | }
45 |
46 | public static <V> ReducerInput<V> fromIterable(final Iterable<V> x) {
47 | return new IteratorReducerInput<V>(x.iterator()) {
48 | @Override
49 | public String toString() {
50 | return "ReducerInputs.fromIterable(" + x + ")";
51 | }
52 | };
53 | }
54 | }
55 |
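An illustrative use, e.g. when unit-testing a Reducer's reduce() method directly:

    ReducerInput<Long> values = ReducerInputs.fromIterable(Arrays.asList(1L, 2L, 3L));
    long total = 0;
    while (values.hasNext()) {
      total += values.next();
    }
    // total is now 6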
--------------------------------------------------------------------------------
/python/test/mapreduce/api/map_job/map_job_config_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import unittest
3 |
4 | from mapreduce import parameters
5 | from mapreduce.api import map_job
6 | from mapreduce.api.map_job import sample_input_reader
7 |
8 |
9 | class MapJobConfigTest(unittest.TestCase):
10 | """Test for MapJobConfig.
11 |
12 | MapJobConfig is declarative. Thus most functional tests are already
13 | done by its parent class.
14 | """
15 |
16 | def testSmoke(self):
17 | conf = map_job.JobConfig(
18 | job_name="foo",
19 | mapper=map_job.Mapper,
20 | input_reader_cls=sample_input_reader.SampleInputReader,
21 | input_reader_params={"foo": 1})
22 | self.assertEqual("foo", conf.job_name)
23 | self.assertTrue(conf.job_id)
24 | self.assertEqual(map_job.Mapper, conf.mapper)
25 | self.assertEqual(sample_input_reader.SampleInputReader,
26 | conf.input_reader_cls)
27 | self.assertEqual({"foo": 1}, conf.input_reader_params)
28 | self.assertEqual(parameters.config.SHARD_COUNT, conf.shard_count)
29 |
30 | def testUserProvidesJobID(self):
31 | conf = map_job.JobConfig(
32 | job_name="foo",
33 | job_id="id",
34 | mapper=map_job.Mapper,
35 | input_reader_cls=sample_input_reader.SampleInputReader,
36 | input_reader_params={"foo": 1})
37 | self.assertEqual("id", conf.job_id)
38 |
39 |
40 | if __name__ == "__main__":
41 | unittest.main()
42 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/BigQueryConstants.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl;
2 |
3 |
4 | public final class BigQueryConstants {
5 | private BigQueryConstants() {}
6 |
7 | // Big query does not allow GCS files larger than 1 TB. Limiting the max size to 500GB
8 | public static final Long MAX_BIG_QUERY_GCS_FILE_SIZE = 500 * 1024 * 1024 * 1024L;
9 |
10 | // Big query limit for total size across files per load. Always keep it more than
11 | // MAX_BIG_QUERY_GCS_FILE_SIZE
12 | public static final Long BIGQUERY_LOAD_DATA_SIZE_LIMIT = 500 * 1024 * 1024 * 1024L;
13 |
14 | public static final String BQ_SCOPE = "https://www.googleapis.com/auth/bigquery";
15 |
16 | public static final String GCS_FILE_NAME_FORMAT =
17 | "BigQueryFilesToLoad/Job-%s/Shard-%%04d/file-%%04d";
18 |
19 | public static final String RECORD_TYPE = "record";
20 |
21 | public static final double MAX_TIME_BEFORE_NEXT_POLL = 30; // in seconds
22 |
23 | public static final double MIN_TIME_BEFORE_NEXT_POLL = 10; // in seconds
24 |
25 | public static final String MIME_TYPE = "application/json";
26 |
27 | public static final String NEWLINE_CHARACTER = "\n";
28 |
29 | public static final Integer MAX_RETRIES = 5;
30 |
31 | public static final int DEFAULT_MILLIS_PER_SLICE = 30_000;
32 |
33 | public static final int DEFAULT_SHARD_RETRIES = 4;
34 |
35 | public static final int DEFAULT_SLICE_RETRIES = 20;
36 | }
37 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/reducers/NoReducer.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.reducers;
4 |
5 | import com.google.appengine.tools.mapreduce.Reducer;
6 | import com.google.appengine.tools.mapreduce.ReducerInput;
7 | import com.google.appengine.tools.mapreduce.impl.shardedjob.JobFailureException;
8 |
9 | /**
10 | * A reducer that throws an exception if it receives any keys or values, and
11 | * never emits any values.
12 | *
13 | * @author ohler@google.com (Christian Ohler)
14 | *
15 | * @param <K> type of keys formally (but not actually) accepted by this reducer
16 | * @param <V> type of values formally (but not actually) accepted by this reducer
17 | * @param <O> type of output formally (but not actually) emitted by this reducer
18 | *
19 | * @deprecated Consider using {@link com.google.appengine.tools.mapreduce.MapJob} instead.
20 | */
21 | @Deprecated
22 | public class NoReducer<K, V, O> extends Reducer<K, V, O> {
23 |
24 | private static final long serialVersionUID = 904068928342205092L;
25 |
26 | public static <K, V, O> NoReducer<K, V, O> create() {
27 | return new NoReducer<>();
28 | }
29 |
30 | public NoReducer() {
31 | }
32 |
33 | @Override
34 | public void reduce(K key, ReducerInput<V> values) {
35 | throw new JobFailureException(
36 | getClass().getSimpleName() + ": reduce function was called for " + key);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/KeyValue.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 |
6 | import java.io.Serializable;
7 | import java.util.Objects;
8 |
9 | /**
10 | * Key-value pair.
11 | *
12 | *
13 | * @param <K> key type
14 | * @param <V> value type
15 | */
16 | public class KeyValue<K, V> implements Serializable {
17 |
18 | private static final long serialVersionUID = -2687854533615172943L;
19 |
20 | private final K key;
21 | private final V value;
22 |
23 | public KeyValue(K key, V value) {
24 | this.key = key;
25 | this.value = value;
26 | }
27 |
28 | public K getKey() {
29 | return key;
30 | }
31 |
32 | public V getValue() {
33 | return value;
34 | }
35 |
36 | @Override
37 | public String toString() {
38 | return "KeyValue(" + key + ", " + value + ")";
39 | }
40 |
41 | @Override
42 | public final boolean equals(Object o) {
43 | if (o == this) {
44 | return true;
45 | }
46 | if (!(o instanceof KeyValue)) {
47 | return false;
48 | }
49 | KeyValue<?, ?> other = (KeyValue<?, ?>) o;
50 | return Objects.equals(key, other.key) && Objects.equals(value, other.value);
51 | }
52 |
53 | @Override
54 | public final int hashCode() {
55 | return Objects.hash(key, value);
56 | }
57 |
58 | public static <K, V> KeyValue<K, V> of(K k, V v) {
59 | return new KeyValue<>(k, v);
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/python/test/mapreduce/operation/counters_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2010 Google Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 |
19 |
20 | import mox
21 | import unittest
22 |
23 | from mapreduce import context
24 | from mapreduce import operation as op
25 |
26 |
27 | class IncrementTest(unittest.TestCase):
28 | """Test Increment operation."""
29 |
30 | def testIncrement(self):
31 | """Test applying Increment operation."""
32 | m = mox.Mox()
33 |
34 | ctx = context.Context(None, None)
35 | ctx._counters = m.CreateMock(context._Counters)
36 |
37 | operation = op.counters.Increment("test", 12)
38 |
39 | # Record calls
40 | ctx._counters.increment("test", 12)
41 |
42 | m.ReplayAll()
43 | try: # test, verify
44 | operation(ctx)
45 | m.VerifyAll()
46 | finally:
47 | m.UnsetStubs()
48 |
49 |
50 | if __name__ == "__main__":
51 | unittest.main()
52 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/Marshaller.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 | import java.io.Serializable;
6 | import java.nio.ByteBuffer;
7 |
8 | /**
9 | * Turns objects of type {@code T} into bytes and back.
10 | *
11 | * @author ohler@google.com (Christian Ohler)
12 | *
13 | * @param <T> type to be marshalled or unmarshalled
14 | */
15 | public abstract class Marshaller<T> implements Serializable {
16 | private static final long serialVersionUID = 183874105234660517L;
17 |
18 | /**
19 | * Returns a new {@code ByteBuffer} {@code b} with a serialized representation
20 | * of {@code object} between {@code b.position()} and {@code b.limit()}.
21 | * {@code b.order()} is undefined.
22 | */
23 | public abstract ByteBuffer toBytes(T object);
24 |
25 | /**
26 | * Returns the object whose serialized representation is in {@code b} between
27 | * {@code b.position()} and {@code b.limit()}. The value of {@code b.order()}
28 | * when the method is called is undefined, and this method may modify it as
29 | * well as {@code b.position()} and {@code b.limit()}.
30 | *
31 | * The method may throw a {@link RuntimeException} if it determines that the
32 | * sequence of bytes in {@code b} was not generated by {@link #toBytes}. This
33 | * includes corrupted data as well as trailing bytes.
34 | */
35 | public abstract T fromBytes(ByteBuffer b);
36 | }
37 |
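A minimal sketch of a custom Marshaller for UTF-8 strings (illustrative only; the library's Marshallers factory class, used elsewhere in this listing, already provides ready-made marshallers):

    import java.nio.ByteBuffer;
    import java.nio.charset.StandardCharsets;

    public class Utf8StringMarshaller extends Marshaller<String> {
      private static final long serialVersionUID = 1L;

      @Override
      public ByteBuffer toBytes(String object) {
        return ByteBuffer.wrap(object.getBytes(StandardCharsets.UTF_8));
      }

      @Override
      public String fromBytes(ByteBuffer b) {
        byte[] bytes = new byte[b.remaining()];
        b.get(bytes);
        return new String(bytes, StandardCharsets.UTF_8);
      }
    }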
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/pipeline/ExamineStatusAndReturnResult.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.pipeline;
2 |
3 | import com.google.appengine.tools.mapreduce.MapReduceJobException;
4 | import com.google.appengine.tools.mapreduce.MapReduceResult;
5 | import com.google.appengine.tools.mapreduce.impl.shardedjob.Status;
6 | import com.google.appengine.tools.pipeline.Job1;
7 | import com.google.appengine.tools.pipeline.Value;
8 |
9 | /**
10 | * A pipeline job that examines {@code ResultAndStatus} and returns {@code MapReduceResult}
11 | * when status is DONE or throws a {@code MapReduceJobException} otherwise.
12 | *
13 | * @param <R> the type of MapReduceResult content
14 | */
15 | // TODO: This class will not be needed once
16 | // https://github.com/GoogleCloudPlatform/appengine-pipelines/issues/3 is fixed.
17 | public class ExamineStatusAndReturnResult<R> extends Job1<MapReduceResult<R>, ResultAndStatus<R>> {
18 |
19 | private static final long serialVersionUID = -4916783324594785878L;
20 |
21 | private final String stage;
22 |
23 | public ExamineStatusAndReturnResult(String stage) {
24 | this.stage = stage;
25 | }
26 |
27 | @Override
28 | public Value<MapReduceResult<R>> run(ResultAndStatus<R> resultAndStatus) {
29 | Status status = resultAndStatus.getStatus();
30 | if (status.getStatusCode() == Status.StatusCode.DONE) {
31 | return immediate(resultAndStatus.getResult());
32 | }
33 | throw new MapReduceJobException(stage, status);
34 | }
35 | }
--------------------------------------------------------------------------------
/python/src/mapreduce/operation/counters.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright 2010 Google Inc.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """Counters-related operations."""
18 |
19 |
20 |
21 | __all__ = ['Increment']
22 |
23 | # Deprecated. Use map_job_context.SliceContext.count instead.
24 |
25 | from mapreduce.operation import base
26 |
27 | # pylint: disable=protected-access
28 |
29 |
30 | class Increment(base.Operation):
31 | """Increment counter operation."""
32 |
33 | def __init__(self, counter_name, delta=1):
34 | """Constructor.
35 |
36 | Args:
37 | counter_name: name of the counter as string
38 | delta: increment delta as int.
39 | """
40 | self.counter_name = counter_name
41 | self.delta = delta
42 |
43 | def __call__(self, context):
44 | """Execute operation.
45 |
46 | Args:
47 | context: mapreduce context as context.Context.
48 | """
49 | context._counters.increment(self.counter_name, self.delta)
50 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/outputs/InMemoryOutputWriter.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.outputs;
2 |
3 | import com.google.appengine.tools.mapreduce.OutputWriter;
4 | import com.google.common.base.Preconditions;
5 | import com.google.common.collect.Lists;
6 |
7 | import java.util.List;
8 |
9 | public class InMemoryOutputWriter<O> extends OutputWriter<O> {
10 |
11 | private static final long serialVersionUID = 528522943983621278L;
12 |
13 | private boolean closed = false;
14 | private final List<O> accu = Lists.newArrayList();
15 | private transient List<O> slice;
16 |
17 | @Override
18 | public String toString() {
19 | return getClass().getName() + "(" + accu.size() + " items" + (closed ? ", closed" : " so far")
20 | + ")";
21 | }
22 |
23 | @Override
24 | public void beginShard() {
25 | closed = false;
26 | accu.clear();
27 | }
28 |
29 | @Override
30 | public void beginSlice() {
31 | slice = Lists.newArrayList();
32 | }
33 |
34 | @Override
35 | public void write(O value) {
36 | Preconditions.checkState(!closed, "%s: Already closed", this);
37 | slice.add(value);
38 | }
39 |
40 | @Override
41 | public void endSlice() {
42 | accu.addAll(slice);
43 | slice = null;
44 | }
45 |
46 | @Override
47 | public void endShard() {
48 | closed = true;
49 | }
50 |
51 | @Override
52 | public boolean allowSliceRetry() {
53 | return true;
54 | }
55 |
56 | public List<O> getResult() {
57 | return accu;
58 | }
59 | }
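Because values are buffered in the transient slice list and only appended to accu in endSlice(), a slice that fails part-way can be retried without duplicating output, which is why allowSliceRetry() returns true. A usage sketch of the lifecycle follows (in a real job the framework, not user code, drives these calls):

InMemoryOutputWriter<String> writer = new InMemoryOutputWriter<>();
writer.beginShard();
writer.beginSlice();
writer.write("a");
writer.write("b");
writer.endSlice();                         // the slice is committed to the shard result here
writer.endShard();
List<String> result = writer.getResult();  // ["a", "b"]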
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/Reducer.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce;
4 |
5 | /**
6 | * Reduce function for use in MapReduce. Called once for each key, together
7 | * with the sequence of all values for that key that the map phase produced.
8 | * Can emit output values through the context.
9 | *
10 | * <p>This class is really an interface that might be evolving. In order to
11 | * avoid breaking users when we change the interface, we made it an abstract
12 | * class.
13 | *
14 | * @author ohler@google.com (Christian Ohler)
15 | *
16 | * @param <K> type of intermediate keys received
17 | * @param <V> type of intermediate values received
18 | * @param <O> type of output values produced
19 | */
20 | public abstract class Reducer<K, V, O> extends Worker<ReducerContext<O>> {
21 | private static final long serialVersionUID = 1622389951004432376L;
22 |
23 | /**
24 | * Processes the values for a given key, using the context returned by
25 | * {@link Worker#getContext} to emit output to the {@link Output} of the MapReduce.
26 | *
27 | * {@code values} enumerates all values that the map phase produced for the
28 | * key {@code key}. It will always contain at least one value.
29 | */
30 | public abstract void reduce(K key, ReducerInput<V> values);
31 |
32 | /**
33 | * Syntactic sugar for {@code getContext().emit(value)}
34 | */
35 | protected void emit(O value) {
36 | getContext().emit(value);
37 | }
38 | }
39 |
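As a concrete illustration (not part of the library), a reducer that sums the Integer values produced for each key could look like the sketch below. It assumes ReducerInput iterates like a standard Iterator and that a KeyValue.of factory is available, as used elsewhere in this codebase:

// Illustrative sketch: sums all Integer values emitted for a key.
public class SumReducer extends Reducer<String, Integer, KeyValue<String, Integer>> {
  private static final long serialVersionUID = 1L;

  @Override
  public void reduce(String key, ReducerInput<Integer> values) {
    int total = 0;
    while (values.hasNext()) {  // at least one value is guaranteed
      total += values.next();
    }
    emit(KeyValue.of(key, total));
  }
}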
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/Input.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 | package com.google.appengine.tools.mapreduce;
3 |
4 | import java.io.IOException;
5 | import java.io.Serializable;
6 | import java.util.List;
7 |
8 | /**
9 | * Input is the data source specification for the job. Input simply defines the data source, while
10 | * {@link InputReader} handles the reading itself.
11 | *
12 | * <p>This class is really an interface that might be evolving. In order to avoid breaking
13 | * users when we change the interface, we made it an abstract class.
14 | *
15 | *
16 | * @param <I> type of values produced by this input
17 | */
18 | public abstract class Input<I> implements Serializable {
19 |
20 | private static final long serialVersionUID = 8796820298129705263L;
21 |
22 | private transient Context context;
23 |
24 | void setContext(Context context) {
25 | this.context = context;
26 | }
27 |
28 | /**
29 | * Returns the current context, or null if none.
30 | */
31 | public Context getContext() {
32 | return context;
33 | }
34 |
35 | /**
36 | * Returns a list of readers for this input. It is the {@code Input}'s
37 | * responsibility to determine an appropriate number of readers to split into.
38 | * This could be specified by the user or determined algorithmically.
39 | *
40 | * The number of input readers returned determines the number of map shards.
41 | */
42 | public abstract List<? extends InputReader<I>> createReaders() throws IOException;
43 | }
44 |
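For comparison with the built-in inputs elsewhere in this listing, here is a hedged sketch of a custom Input. The class and field names are invented; it serves a pre-split in-memory list, one reader per shard, and a real job would need the lists to be Serializable:

import com.google.common.collect.ImmutableList;
import java.util.List;
import java.util.NoSuchElementException;

// Illustrative sketch: each sub-list becomes one shard.
public class InMemoryListInput<I> extends Input<I> {
  private static final long serialVersionUID = 1L;

  private final List<List<I>> shards;

  public InMemoryListInput(List<List<I>> shards) {
    this.shards = shards;
  }

  private static class ListReader<I> extends InputReader<I> {
    private static final long serialVersionUID = 1L;

    private final List<I> values;
    private int next;

    ListReader(List<I> values) {
      this.values = values;
    }

    @Override
    public I next() {
      if (next >= values.size()) {
        throw new NoSuchElementException();
      }
      return values.get(next++);
    }

    @Override
    public Double getProgress() {
      return values.isEmpty() ? 1.0 : (double) next / values.size();
    }
  }

  @Override
  public List<? extends InputReader<I>> createReaders() {
    ImmutableList.Builder<ListReader<I>> out = ImmutableList.builder();
    for (List<I> shard : shards) {
      out.add(new ListReader<>(shard));
    }
    return out.build();
  }
}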
--------------------------------------------------------------------------------
/python/src/mapreduce/pipeline_base.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Copyright 2015 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Base pipelines."""
16 |
17 |
18 | import pipeline
19 |
20 | from mapreduce import parameters
21 |
22 | # pylint: disable=g-bad-name
23 |
24 |
25 | class PipelineBase(pipeline.Pipeline):
26 | """Base class for all pipelines within mapreduce framework.
27 |
28 | Rewrites base path to use pipeline library bundled with mapreduce.
29 | """
30 |
31 | def start(self, **kwargs):
32 | if "base_path" not in kwargs:
33 | kwargs["base_path"] = parameters._DEFAULT_PIPELINE_BASE_PATH
34 | return pipeline.Pipeline.start(self, **kwargs)
35 |
36 |
37 | class _OutputSlotsMixin(object):
38 | """Defines common output slots for all MR user facing pipelines.
39 |
40 | result_status: one of model.MapreduceState._RESULTS. When a MR pipeline
41 | finishes, user should check this for the status of the MR job.
42 | """
43 |
44 | output_names = ["result_status"]
45 |
--------------------------------------------------------------------------------
/python/demo/static/js/custom.js:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | /**
4 | * @fileoverview A JavaScript helper file that performs miscellaneous
5 | * functions - right now, it just keeps the form that runs MR jobs in sync with
6 | * the user's selection outside of the form.
7 | */
8 |
9 | /*
10 | * Updates the form that runs MapReduce jobs once the user selects their input
11 | * data from the list of input files. Exists because we have two separate forms
12 | * on our HTML - one that allows users to upload new input files, and one that
13 | * allows users to run MapReduce jobs given a certain input file. Since the
14 | * latter form cannot see which input file has been selected (that button is
15 | * out of this form's scope), we throw some quick JavaScript in to sync the
16 | * value of the user's choice with a hidden field in the form as well as a
17 | * visible label displaying the input file's name for the user to see.
18 | * @param {string} filekey The internal key that the Datastore uses to reference
19 | * this input file.
20 | * @param {string} blobkey The Blobstore key associated with the input file
21 | * whose key is filekey.
22 | * @param {string} filename The name that the user has chosen to give this input
23 | * file upon uploading it.
24 | */
25 | function updateForm(filekey, blobkey, filename) {
26 | $('#jobName').text(filename);
27 | $('#filekey').val(filekey);
28 | $('#blobkey').val(blobkey);
29 |
30 | $('#word_count').removeAttr('disabled');
31 | $('#index').removeAttr('disabled');
32 | $('#phrases').removeAttr('disabled');
33 | }
34 |
35 |
--------------------------------------------------------------------------------
/java/example/mapreduce/WEB-INF/web.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <web-app xmlns="http://java.sun.com/xml/ns/javaee" version="2.5">
3 |
4 |   <servlet>
5 |     <servlet-name>PipelineServlet</servlet-name>
6 |     <servlet-class>
7 |       com.google.appengine.tools.pipeline.impl.servlets.PipelineServlet
8 |     </servlet-class>
9 |   </servlet>
10 |   <servlet-mapping>
11 |     <servlet-name>PipelineServlet</servlet-name>
12 |     <url-pattern>/_ah/pipeline/*</url-pattern>
13 |   </servlet-mapping>
14 |
15 |   <servlet>
16 |     <servlet-name>mapreduce</servlet-name>
17 |     <servlet-class>
18 |       com.google.appengine.tools.mapreduce.MapReduceServlet
19 |     </servlet-class>
20 |   </servlet>
21 |   <servlet-mapping>
22 |     <servlet-name>mapreduce</servlet-name>
23 |     <url-pattern>/mapreduce/*</url-pattern>
24 |   </servlet-mapping>
25 |
26 |   <security-constraint>
27 |     <web-resource-collection>
28 |       <url-pattern>/*</url-pattern>
29 |     </web-resource-collection>
30 |     <auth-constraint>
31 |       <role-name>admin</role-name>
32 |     </auth-constraint>
33 |   </security-constraint>
34 |
35 | </web-app>
--------------------------------------------------------------------------------
/java/example/src/com/google/appengine/demos/mapreduce/entitycount/EntityCreator.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.demos.mapreduce.entitycount;
4 |
5 | import static com.google.common.base.Preconditions.checkNotNull;
6 |
7 | import com.google.appengine.api.datastore.Entity;
8 | import com.google.appengine.api.datastore.Text;
9 | import com.google.appengine.tools.mapreduce.MapOnlyMapper;
10 |
11 | import java.util.Random;
12 |
13 | /**
14 | * Creates random entities.
15 | *
16 | * @author ohler@google.com (Christian Ohler)
17 | */
18 | class EntityCreator extends MapOnlyMapper<Long, Entity> {
19 |
20 | private static final long serialVersionUID = 409204195454478863L;
21 |
22 | private final String kind;
23 | private final int payloadBytesPerEntity;
24 | private final Random random = new Random();
25 |
26 | public EntityCreator(String kind, int payloadBytesPerEntity) {
27 | this.kind = checkNotNull(kind, "Null kind");
28 | this.payloadBytesPerEntity = payloadBytesPerEntity;
29 | }
30 |
31 | private String randomString(int length) {
32 | StringBuilder out = new StringBuilder(length);
33 | for (int i = 0; i < length; i++) {
34 | out.append((char) ('a' + random.nextInt(26)));
35 | }
36 | return out.toString();
37 | }
38 |
39 | @Override
40 | public void map(Long value) {
41 | String name = getContext().getShardNumber() + "_" + value;
42 | Entity entity = new Entity(kind, name);
43 | entity.setProperty("foo", "bar");
44 | entity.setProperty("payload", new Text(randomString(payloadBytesPerEntity)));
45 | emit(entity);
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/inputs/DatastoreInput.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 | package com.google.appengine.tools.mapreduce.inputs;
3 |
4 | import com.google.appengine.api.datastore.Entity;
5 | import com.google.appengine.api.datastore.Query;
6 |
7 | /**
8 | * An input to read entities of a specified kind from the datastore.
9 | *
10 | */
11 | public final class DatastoreInput extends BaseDatastoreInput<Entity, DatastoreInputReader> {
12 |
13 | private static final long serialVersionUID = -106587199386345409L;
14 |
15 | /**
16 | * @param entityKind entity kind to read from the datastore.
17 | * @param shardCount number of parallel shards for the input.
18 | */
19 | public DatastoreInput(String entityKind, int shardCount) {
20 | this(entityKind, shardCount, null);
21 | }
22 |
23 | /**
24 | * @param entityKind entity kind to read from the datastore.
25 | * @param shardCount number of parallel shards for the input.
26 | * @param namespace the namespace of the entities (if null will use current).
27 | */
28 | public DatastoreInput(String entityKind, int shardCount, String namespace) {
29 | this(createQuery(namespace, entityKind), shardCount);
30 | }
31 |
32 | /**
33 | * @param query the query to read from the datastore.
34 | * @param shardCount the number for parallel shards for the input.
35 | */
36 | public DatastoreInput(Query query, int shardCount) {
37 | super(query, shardCount);
38 | }
39 |
40 | @Override
41 | protected DatastoreInputReader createReader(Query query) {
42 | return new DatastoreInputReader(query);
43 | }
44 | }
45 |
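Typical construction is a one-liner; the kind name, shard count, and namespace below are purely illustrative:

// Read every "Person" entity using 10 parallel map shards.
DatastoreInput people = new DatastoreInput("Person", 10);

// The same, restricted to one namespace.
DatastoreInput scoped = new DatastoreInput("Person", 10, "demo");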
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/pipeline/DeleteFilesJob.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.pipeline;
2 |
3 | import static com.google.appengine.tools.mapreduce.impl.MapReduceConstants.GCS_RETRY_PARAMETERS;
4 |
5 | import com.google.appengine.tools.cloudstorage.GcsFilename;
6 | import com.google.appengine.tools.cloudstorage.GcsService;
7 | import com.google.appengine.tools.cloudstorage.GcsServiceFactory;
8 | import com.google.appengine.tools.cloudstorage.RetriesExhaustedException;
9 | import com.google.appengine.tools.pipeline.Job1;
10 | import com.google.appengine.tools.pipeline.Value;
11 |
12 | import java.io.IOException;
13 | import java.util.List;
14 | import java.util.logging.Level;
15 | import java.util.logging.Logger;
16 |
17 | /**
18 | * A job which deletes all the files in the provided GoogleCloudStorageFileSet
19 | */
20 | public class DeleteFilesJob extends Job1<Void, List<GcsFilename>> {
21 |
22 | private static final long serialVersionUID = 4821135390816992131L;
23 | private static final GcsService gcs = GcsServiceFactory.createGcsService(GCS_RETRY_PARAMETERS);
24 | private static final Logger log = Logger.getLogger(DeleteFilesJob.class.getName());
25 |
26 | /**
27 | * Deletes the files in the provided GoogleCloudStorageFileSet
28 | */
29 | @Override
30 | public Value<Void> run(List<GcsFilename> files) throws Exception {
31 | for (GcsFilename file : files) {
32 | try {
33 | gcs.delete(file);
34 | } catch (RetriesExhaustedException | IOException e) {
35 | log.log(Level.WARNING, "Failed to cleanup file: " + file, e);
36 | }
37 | }
38 | return null;
39 | }
40 | }
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/outputs/ForwardingOutputWriter.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.outputs;
4 |
5 | import com.google.appengine.tools.mapreduce.OutputWriter;
6 | import com.google.appengine.tools.mapreduce.ShardContext;
7 |
8 | import java.io.IOException;
9 |
10 | /**
11 | * @author ohler@google.com (Christian Ohler)
12 | *
13 | * @param <O> type of values accepted by this output
14 | */
15 | public abstract class ForwardingOutputWriter<O> extends OutputWriter<O> {
16 | private static final long serialVersionUID = 738487653896786084L;
17 |
18 | protected abstract OutputWriter<?> getDelegate();
19 |
20 | @Override
21 | public void beginShard() throws IOException {
22 | getDelegate().beginShard();
23 | }
24 |
25 | @Override
26 | public void beginSlice() throws IOException {
27 | getDelegate().beginSlice();
28 | }
29 |
30 | @Override
31 | public void endSlice() throws IOException {
32 | getDelegate().endSlice();
33 | }
34 |
35 | @Override
36 | public void endShard() throws IOException {
37 | getDelegate().endShard();
38 | }
39 |
40 | @Override
41 | public long estimateMemoryRequirement() {
42 | return getDelegate().estimateMemoryRequirement();
43 | }
44 |
45 | @Override
46 | public void setContext(ShardContext context) {
47 | getDelegate().setContext(context);
48 | }
49 |
50 | @Override
51 | public ShardContext getContext() {
52 | return getDelegate().getContext();
53 | }
54 |
55 | @Override
56 | public boolean allowSliceRetry() {
57 | return getDelegate().allowSliceRetry();
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/outputs/BigQueryStoreResult.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.outputs;
2 |
3 | import com.google.api.services.bigquery.model.TableSchema;
4 | import com.google.appengine.tools.mapreduce.GoogleCloudStorageFileSet;
5 | import com.google.appengine.tools.mapreduce.Marshallers;
6 | import com.google.appengine.tools.mapreduce.Output;
7 | import com.google.appengine.tools.mapreduce.impl.util.SerializableValue;
8 |
9 | import java.io.Serializable;
10 |
11 | /**
12 | * Result of the BigQuery staging process. Currently BigQuery can only load data from files
13 | * stored in Google Cloud Storage (GCS), so GCS serves as the staging area; for GCS, R is
14 | * {@link GoogleCloudStorageFileSet}.
15 | *
16 | * @param <R> type of result produced by the staging process {@link Output}.
17 | */
18 | public final class BigQueryStoreResult<R> implements Serializable {
19 |
20 | private static final long serialVersionUID = 3843348927621484947L;
21 | private final R result;
22 | private final SerializableValue<TableSchema> serializableSchema;
23 |
24 | /**
25 | * @param result of writing data to the staging area.
26 | * @param schema a wrapper around {@link TableSchema} to make it serializable.
27 | */
28 | public BigQueryStoreResult(R result, TableSchema schema) {
29 | this.result = result;
30 | this.serializableSchema =
31 | SerializableValue.of(Marshallers.getGenericJsonMarshaller(TableSchema.class), schema);
32 | }
33 |
34 | public R getResult() {
35 | return result;
36 | }
37 |
38 | public TableSchema getSchema() {
39 | return serializableSchema.getValue();
40 | }
41 | }
42 |
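A hedged construction sketch follows. The schema fields are invented for illustration, TableFieldSchema comes from the same BigQuery model package as TableSchema, and stagedFiles stands in for the GoogleCloudStorageFileSet produced by the staging Output:

TableSchema schema = new TableSchema().setFields(Arrays.asList(
    new TableFieldSchema().setName("name").setType("STRING"),
    new TableFieldSchema().setName("age").setType("INTEGER")));

BigQueryStoreResult<GoogleCloudStorageFileSet> staged =
    new BigQueryStoreResult<>(stagedFiles, schema);

TableSchema restored = staged.getSchema();  // still available after a Java serialization round trip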
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/pipeline/ShardedJob.java:
--------------------------------------------------------------------------------
1 | // Copyright 2013 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.impl.pipeline;
4 |
5 | import com.google.appengine.tools.mapreduce.impl.shardedjob.IncrementalTask;
6 | import com.google.appengine.tools.mapreduce.impl.shardedjob.ShardedJobController;
7 | import com.google.appengine.tools.mapreduce.impl.shardedjob.ShardedJobServiceFactory;
8 | import com.google.appengine.tools.mapreduce.impl.shardedjob.ShardedJobSettings;
9 | import com.google.appengine.tools.pipeline.Job0;
10 | import com.google.appengine.tools.pipeline.Value;
11 |
12 | import java.util.List;
13 |
14 | /**
15 | * ShardedJob pipeline.
16 | *
17 | *
18 | * @param <T> type of task
19 | */
20 | public class ShardedJob<T extends IncrementalTask> extends Job0<Void> {
21 |
22 | private static final long serialVersionUID = -6595147973116356334L;
23 |
24 | private final String jobId;
25 | private final List<? extends T> workers;
26 | private final ShardedJobController<T> controller;
27 | private final ShardedJobSettings settings;
28 |
29 | public ShardedJob(String shardedJobId, List<? extends T> workers,
30 | ShardedJobController<T> controller, ShardedJobSettings shardedJobSettings) {
31 | this.jobId = shardedJobId;
32 | this.workers = workers;
33 | this.controller = controller;
34 | this.settings = shardedJobSettings;
35 | }
36 |
37 | @Override
38 | public Value<Void> run() {
39 | ShardedJobServiceFactory.getShardedJobService().startJob(jobId, workers, controller, settings);
40 | setStatusConsoleUrl(settings.getMapReduceStatusUrl());
41 | return null;
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/inputs/DatastoreKeyInput.java:
--------------------------------------------------------------------------------
1 | // Copyright 2014 Google Inc. All Rights Reserved.
2 | package com.google.appengine.tools.mapreduce.inputs;
3 |
4 | import com.google.appengine.api.datastore.Key;
5 | import com.google.appengine.api.datastore.Query;
6 |
7 | /**
8 | * An input to read entity keys of a specified kind from the datastore.
9 | */
10 | public final class DatastoreKeyInput extends BaseDatastoreInput<Key, DatastoreKeyInputReader> {
11 |
12 | private static final long serialVersionUID = -106587199386345409L;
13 |
14 | /**
15 | * @param entityKind entity kind to read from the datastore.
16 | * @param shardCount number of parallel shards for the input.
17 | */
18 | public DatastoreKeyInput(String entityKind, int shardCount) {
19 | this(entityKind, shardCount, null);
20 | }
21 |
22 | /**
23 | * @param entityKind entity kind to read from the datastore.
24 | * @param shardCount the number of parallel shards to divide the input into.
25 | * @param namespace the namespace of the entities (if null will use current).
26 | */
27 | public DatastoreKeyInput(String entityKind, int shardCount, String namespace) {
28 | this(createQuery(namespace, entityKind), shardCount);
29 | }
30 |
31 | /**
32 | * @param query The query to map read from the datastore
33 | * @param shardCount the number of parallel shards to divide the input into.
34 | */
35 | public DatastoreKeyInput(Query query, int shardCount) {
36 | super(query.setKeysOnly(), shardCount);
37 | }
38 |
39 | @Override
40 | protected DatastoreKeyInputReader createReader(Query query) {
41 | return new DatastoreKeyInputReader(query);
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/inputs/UnmarshallingInput.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.inputs;
2 |
3 | import static com.google.common.base.Preconditions.checkNotNull;
4 |
5 | import com.google.appengine.tools.mapreduce.Input;
6 | import com.google.appengine.tools.mapreduce.InputReader;
7 | import com.google.appengine.tools.mapreduce.Marshaller;
8 |
9 | import java.io.IOException;
10 | import java.nio.ByteBuffer;
11 | import java.util.ArrayList;
12 | import java.util.List;
13 |
14 | /**
15 | * An {@link Input} that unmarshalls records.
16 | *
17 | * @param <I> type of values produced by this input
18 | */
19 | public final class UnmarshallingInput<I> extends Input<I> {
20 |
21 | private static final long serialVersionUID = 6893854789021758519L;
22 |
23 | private final Input<ByteBuffer> input;
24 | private final Marshaller<I> marshaller;
25 |
26 | /**
27 | * @param input The input producing values to unmarshall.
28 | * @param marshaller The marshaller to use for unmarshalling the input values.
29 | */
30 | public UnmarshallingInput(Input<ByteBuffer> input, Marshaller<I> marshaller) {
31 | this.input = checkNotNull(input, "Null input");
32 | this.marshaller = checkNotNull(marshaller, "Null marshaller");
33 | }
34 |
35 | @Override
36 | public List<InputReader<I>> createReaders() throws IOException {
37 | List<? extends InputReader<ByteBuffer>> readers = input.createReaders();
38 | List<InputReader<I>> result = new ArrayList<>(readers.size());
39 | for (InputReader<ByteBuffer> reader : readers) {
40 | result.add(new UnmarshallingInputReader<>(reader, marshaller));
41 | }
42 | return result;
43 | }
44 | }
45 |
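Usage sketch: any Input<ByteBuffer> can be wrapped. NoInput (the next file in this listing) is used here only because it needs no external resources, and Marshallers.getLongMarshaller() is assumed to be the library's stock Long marshaller:

Input<ByteBuffer> bytes = NoInput.create(4);  // four shards, no records
Input<Long> longs = new UnmarshallingInput<>(bytes, Marshallers.getLongMarshaller());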
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/inputs/NoInput.java:
--------------------------------------------------------------------------------
1 | // Copyright 2012 Google Inc. All Rights Reserved.
2 |
3 | package com.google.appengine.tools.mapreduce.inputs;
4 |
5 | import com.google.appengine.tools.mapreduce.Input;
6 | import com.google.appengine.tools.mapreduce.InputReader;
7 | import com.google.common.collect.ImmutableList;
8 |
9 | import java.util.List;
10 | import java.util.NoSuchElementException;
11 |
12 | /**
13 | * An {@link Input} that does not produce any values.
14 | *
15 | * @author ohler@google.com (Christian Ohler)
16 | *
17 | * @param <I> the type of input values formally (but not actually) produced by
18 | * this input
19 | */
20 | public final class NoInput<I> extends Input<I> {
21 |
22 | private static final long serialVersionUID = 214109122708935335L;
23 |
24 | public static <I> NoInput<I> create(int numShards) {
25 | return new NoInput<>(numShards);
26 | }
27 |
28 | private static class Reader<I> extends InputReader<I> {
29 |
30 | private static final long serialVersionUID = 171763263195134256L;
31 |
32 | @Override
33 | public Double getProgress() {
34 | return 1.0;
35 | }
36 |
37 | @Override
38 | public I next() {
39 | throw new NoSuchElementException();
40 | }
41 | }
42 |
43 | private final int numShards;
44 |
45 | public NoInput(int numShards) {
46 | this.numShards = numShards;
47 | }
48 |
49 | @Override
50 | public List<? extends InputReader<I>> createReaders() {
51 | ImmutableList.Builder<InputReader<I>> out = ImmutableList.builder();
52 | for (int i = 0; i < numShards; i++) {
53 | out.add(new Reader<I>());
54 | }
55 | return out.build();
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/impl/util/SerializableValue.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.impl.util;
2 |
3 | import com.google.appengine.tools.mapreduce.Marshaller;
4 |
5 | import java.io.IOException;
6 | import java.io.ObjectInputStream;
7 | import java.io.ObjectOutputStream;
8 | import java.io.Serializable;
9 | import java.nio.ByteBuffer;
10 |
11 | /**
12 | * A {@link Serializable} instance of any given value and its {@link Marshaller}.
13 | * @param <T> type of the value being wrapped
14 | */
15 | public final class SerializableValue<T> implements Serializable {
16 |
17 | private static final long serialVersionUID = -5188676157133889956L;
18 |
19 | private transient T value;
20 | private final Marshaller<T> marshaller;
21 |
22 | private SerializableValue(Marshaller<T> marshaller, T value) {
23 | this.marshaller = marshaller;
24 | this.value = value;
25 | }
26 |
27 | public static <T> SerializableValue<T> of(Marshaller<T> marshaller, T value) {
28 | return new SerializableValue<>(marshaller, value);
29 | }
30 |
31 | public T getValue() {
32 | return value;
33 | }
34 |
35 | private void readObject(ObjectInputStream aInputStream) throws ClassNotFoundException,
36 | IOException {
37 | aInputStream.defaultReadObject();
38 | value = marshaller.fromBytes(ByteBuffer.wrap((byte[]) aInputStream.readObject()));
39 | }
40 |
41 | private void writeObject(ObjectOutputStream aOutputStream) throws IOException {
42 | aOutputStream.defaultWriteObject();
43 | ByteBuffer byteBuffer = marshaller.toBytes(value);
44 | aOutputStream.writeObject(SerializationUtil.getBytes(byteBuffer.slice()));
45 | // In case marshalling modified the item
46 | value = marshaller.fromBytes(byteBuffer);
47 | }
48 | }
49 |
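A short usage sketch, mirroring how BigQueryStoreResult earlier in this listing wraps a TableSchema; Marshallers.getStringMarshaller() is assumed to be the library's stock String marshaller:

SerializableValue<String> wrapped =
    SerializableValue.of(Marshallers.getStringMarshaller(), "hello");
String value = wrapped.getValue();  // "hello", before or after a serialization round trip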
--------------------------------------------------------------------------------
/java/src/main/java/com/google/appengine/tools/mapreduce/outputs/SizeSegmentingOutputWriter.java:
--------------------------------------------------------------------------------
1 | package com.google.appengine.tools.mapreduce.outputs;
2 |
3 | import com.google.appengine.tools.mapreduce.OutputWriter;
4 |
5 | import java.io.IOException;
6 | import java.nio.ByteBuffer;
7 |
8 | /**
9 | * Segments the output based on the specified size limit. A new writer is created if the bytes
10 | * already written by the current writer plus the bytes about to be written would exceed the segment size limit.
11 | */
12 | public abstract class SizeSegmentingOutputWriter extends ItemSegmentingOutputWriter<ByteBuffer> {
13 |
14 | private static final long serialVersionUID = 7900756955061379581L;
15 | private final long segmentSizeLimit;
16 | private long bytesWritten;
17 |
18 | public SizeSegmentingOutputWriter(long segmentSizeLimit) {
19 | this.segmentSizeLimit = segmentSizeLimit;
20 | }
21 |
22 | @Override
23 | public void beginShard() throws IOException {
24 | bytesWritten = 0;
25 | super.beginShard();
26 | }
27 |
28 | @Override
29 | protected boolean shouldSegment(ByteBuffer value) {
30 | if (bytesWritten + value.remaining() > segmentSizeLimit) {
31 | return true;
32 | }
33 | return false;
34 | }
35 |
36 | @Override
37 | public void write(ByteBuffer value) throws IOException {
38 | long numOfBytesToWrite = value.remaining();
39 | super.write(value);
40 | bytesWritten += numOfBytesToWrite - value.remaining();
41 | }
42 |
43 | @Override
44 | protected final OutputWriter<ByteBuffer> createNextWriter(int fileNum) {
45 | OutputWriter<ByteBuffer> nextWriter = createWriter(fileNum);
46 | bytesWritten = 0;
47 | return nextWriter;
48 | }
49 |
50 | protected abstract OutputWriter<ByteBuffer> createWriter(int fileNum);
51 | }
52 |
--------------------------------------------------------------------------------
/java/src/main/resources/ui/overview.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | MapReduce Overview
5 |
6 |
7 |
8 |
9 |
12 |
13 |
14 |
15 |
16 |
17 |