├── .kokoro ├── common.cfg ├── scala_spark_tf_unit_test.cfg ├── tests │ └── scala_unit_tests.sh └── trampoline.sh ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── benchmarks ├── README.MD ├── benchmarkUtil.py ├── bigbench.sh ├── cfg.yaml.tmpl ├── examples │ ├── bigbench-dataproc-1-3.yaml │ ├── bigbench-hive-scenarios.yaml │ ├── bigbench-spark-sql-scenarios.yaml │ └── hibench-dataproc-1-2-scenarios.yaml ├── hibench.sh ├── initial_benchmark │ ├── cfg-1.yaml │ └── cpu-testing-scenarios.yaml ├── requirements.txt ├── runPerformanceTesting.py ├── testing_scenarios.yaml.tmpl ├── trigger_bigbench_benchmark.py ├── trigger_hibench_benchmark.py └── unittests.py ├── codelabs ├── opencv-haarcascade │ ├── .gitignore │ ├── FeatureDetector.scala │ ├── build.sbt │ └── project │ │ └── plugins.sbt ├── spark-bigquery │ ├── backfill.py │ ├── backfill.sh │ └── counts_by_subreddit.py ├── spark-hudi │ └── pyspark_hudi_example.py └── spark-nlp │ └── topic_model.py ├── gcloud ├── .gitignore ├── README.md ├── autoscaling-policy.yaml ├── bin │ ├── create-dpgce │ ├── create-dpgke │ ├── destroy-dpgce │ ├── destroy-dpgke │ ├── recreate-dpgce │ ├── recreate-dpgke │ ├── scp-master │ ├── scp-worker │ ├── ssh-master │ └── ssh-worker ├── env.json.sample ├── init │ ├── startup-script.pl │ └── startup-script.sh └── lib │ ├── database-functions.sh │ ├── env.sh │ ├── net-functions.sh │ ├── secure-boot │ └── create-key-pair.sh │ └── shared-functions.sh ├── notebooks ├── README.md ├── ai-ml │ ├── HuggingFaceEmbeddingGenerationInSpark.ipynb │ ├── ImageClassificationInSpark.ipynb │ ├── README.md │ └── SparkXGBoostCustomerChurn.ipynb └── python │ ├── 1.1. BigQuery Storage & Spark DataFrames - Python.ipynb │ ├── 1.2. BigQuery Storage & Spark SQL - Python.ipynb │ ├── 1.3. BigQuery Storage & Spark MLlib - Python.ipynb │ ├── 2.1. Google Cloud Storage (CSV) & Spark DataFrames - Python.ipynb │ ├── 3.1. Spark DataFrame & Pandas Plotting - Python.ipynb │ └── neo4j │ ├── 1.1. Similarity Example, Graph Data Science Client - Python.ipynb │ ├── 1.2. Similarity Example, Python Client - Python.ipynb │ ├── 1.3. 
Similarity Example, BigQuery Storage & Spark Connector & GDS Client - Python.ipynb │ ├── README.md │ ├── data │ └── census_demographics_by_metro_area.csv │ └── images │ ├── neo4j_dataproc_api_selector.png │ ├── neo4j_dataproc_mkture.png │ ├── neo4j_dataproc_process.png │ └── neo4j_dataproc_stack.png ├── spark-hbase ├── README.md ├── pom.xml ├── scripts │ └── pyspark-hbase.py └── src │ └── main │ └── java │ └── hbase │ └── SparkHBaseMain.java ├── spark-tensorflow ├── .gitignore ├── README.md ├── config-standard.yaml ├── doc │ ├── ml-pipeline.png │ ├── prereqs.md │ ├── spark-package.md │ ├── spark-submit.md │ ├── tf-serve.md │ └── tf-train.md ├── gcloud-tests │ ├── config.yaml │ ├── request.json │ └── test-tf-tsv.sh ├── prepare │ ├── Dockerfile │ ├── build.sbt │ ├── create-cluster.sh │ ├── project │ │ ├── build.properties │ │ └── plugins.sbt │ ├── scalastyle-config.xml │ ├── src │ │ ├── main │ │ │ └── scala │ │ │ │ └── com │ │ │ │ └── google │ │ │ │ └── cloud │ │ │ │ └── ml │ │ │ │ └── samples │ │ │ │ └── criteo │ │ │ │ ├── ArtifactExporter.scala │ │ │ │ ├── CriteoAnalyzer.scala │ │ │ │ ├── CriteoExporter.scala │ │ │ │ ├── CriteoFeatures.scala │ │ │ │ ├── CriteoImporter.scala │ │ │ │ ├── CriteoIndexer.scala │ │ │ │ ├── CriteoMissingReplacer.scala │ │ │ │ ├── CriteoPreprocessingApplication.scala │ │ │ │ └── CriteoTransformer.scala │ │ └── test │ │ │ ├── resources │ │ │ └── test_train.csv │ │ │ └── scala │ │ │ └── com │ │ │ └── google │ │ │ └── cloud │ │ │ └── ml │ │ │ └── samples │ │ │ └── criteo │ │ │ ├── CriteoImporterTest.scala │ │ │ ├── CriteoIndexerTest.scala │ │ │ ├── CriteoMissingReplacerTest.scala │ │ │ ├── CriteoTransformerTest.scala │ │ │ └── SparkSpec.scala │ ├── submit-gcloud.sh │ └── submit-local.sh ├── setup.py ├── test-tf.sh ├── train-gcloud.sh └── trainer │ ├── __init__.py │ ├── data.py │ ├── model.py │ ├── preprocess_artifacts_gcs.py │ ├── preprocess_artifacts_local.py │ ├── requirements.txt │ ├── task.py │ ├── test │ ├── __init__.py │ ├── artifacts │ │ ├── categorical-feature-1 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-10 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-11 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-12 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-13 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-14 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-15 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-16 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-17 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-18 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-19 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-2 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-20 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-21 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-22 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-23 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-24 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-25 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-26 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-3 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-4 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-5 │ │ │ ├── count.txt │ │ │ └── index.txt 
│ │ ├── categorical-feature-6 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-7 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-8 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── categorical-feature-9 │ │ │ ├── count.txt │ │ │ └── index.txt │ │ ├── integer-feature-1 │ │ │ └── mean.txt │ │ ├── integer-feature-10 │ │ │ └── mean.txt │ │ ├── integer-feature-11 │ │ │ └── mean.txt │ │ ├── integer-feature-12 │ │ │ └── mean.txt │ │ ├── integer-feature-13 │ │ │ └── mean.txt │ │ ├── integer-feature-2 │ │ │ └── mean.txt │ │ ├── integer-feature-3 │ │ │ └── mean.txt │ │ ├── integer-feature-4 │ │ │ └── mean.txt │ │ ├── integer-feature-5 │ │ │ └── mean.txt │ │ ├── integer-feature-6 │ │ │ └── mean.txt │ │ ├── integer-feature-7 │ │ │ └── mean.txt │ │ ├── integer-feature-8 │ │ │ └── mean.txt │ │ └── integer-feature-9 │ │ │ └── mean.txt │ ├── data.tfrecords │ ├── test.csv │ ├── test1.expr │ ├── test2.expr │ ├── test3.expr │ ├── train.csv │ └── train.tsv │ └── tests.py ├── spark-translate ├── .gitignore ├── README.md ├── maven │ ├── pom.xml │ └── src │ │ └── main │ │ └── scala │ │ └── demo │ │ └── TranslateExample.scala ├── sbt │ ├── build.sbt │ ├── project │ │ └── assembly.sbt │ └── src │ │ └── main │ │ └── scala │ │ └── demo │ │ └── TranslateExample.scala └── words.txt └── workshops └── social-media ├── README.md ├── slides.pdf └── social-media.txt /.kokoro/common.cfg: -------------------------------------------------------------------------------- 1 | # Download trampoline resources. These will be in ${KOKORO_GFILE_DIR} 2 | gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" 3 | 4 | # All builds use the trampoline script to run in docker. 5 | build_file: "cloud-dataproc/.kokoro/trampoline.sh" 6 | 7 | # Use the Python worker docker iamge. 8 | env_vars: { 9 | key: "TRAMPOLINE_IMAGE" 10 | value: "gcr.io/cloud-devrel-kokoro-resources/spark-tf-unit-tests" 11 | } 12 | -------------------------------------------------------------------------------- /.kokoro/scala_spark_tf_unit_test.cfg: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # Tell the trampoline which build file to use. 17 | 18 | env_vars: { 19 | key: "TRAMPOLINE_BUILD_FILE" 20 | value: "github/cloud-dataproc/.kokoro/tests/scala_unit_tests.sh" 21 | } 22 | 23 | -------------------------------------------------------------------------------- /.kokoro/tests/scala_unit_tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -eo pipefail 18 | 19 | cd github/cloud-dataproc/spark-tensorflow/prepare 20 | sbt test 21 | -------------------------------------------------------------------------------- /.kokoro/trampoline.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/bin/bash 3 | 4 | # Copyright 2017 Google Inc. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | python3 "${KOKORO_GFILE_DIR}/trampoline_v1.py" 19 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to become a contributor and submit your own code 2 | 3 | ## Contributor License Agreements 4 | 5 | We'd love to accept your patches! Before we can take them, we 6 | have to jump a couple of legal hurdles. 7 | 8 | Please fill out either the individual or corporate Contributor License Agreement 9 | (CLA). 10 | 11 | * If you are an individual writing original source code and you're sure you 12 | own the intellectual property, then you'll need to sign an [individual CLA] 13 | (https://developers.google.com/open-source/cla/individual). 14 | * If you work for a company that wants to allow you to contribute your work, 15 | then you'll need to sign a [corporate CLA] 16 | (https://developers.google.com/open-source/cla/corporate). 17 | 18 | Follow either of the two links above to access the appropriate CLA and 19 | instructions for how to sign and return it. Once we receive it, we'll be able to 20 | accept your pull requests. 21 | 22 | ## Contributing A Patch 23 | 24 | 1. Submit an issue describing your proposed change to the repo in question. 25 | 1. The repo owner will respond to your issue promptly. 26 | 1. If your proposed change is accepted, and you haven't already done so, sign a 27 | Contributor License Agreement (see details above). 28 | 1. Fork the desired repo, develop and test your code changes. 29 | 1. Ensure that your code adheres to the existing style in the sample to which 30 | you are contributing. Refer to the 31 | [Google Cloud Platform Samples Style Guide] 32 | (https://github.com/GoogleCloudPlatform/Template/wiki/style.html) for the 33 | recommended coding standards for this organization. 34 | 1. Ensure that your code has an appropriate set of unit tests which all pass. 35 | 1. Submit a pull request. 
36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Cloud Dataproc 2 | 3 | This repository contains code and documentation for use with 4 | [Google Cloud Dataproc](https://cloud.google.com/dataproc/). 5 | 6 | ## Samples in this Repository 7 | * `codelabs/opencv-haarcascade` provides the source code for the [OpenCV Dataproc Codelab](https://codelabs.developers.google.com/codelabs/cloud-dataproc-opencv/index.html), which demonstrates a Spark job that adds facial detection to a set of images. 8 | * `codelabs/spark-bigquery` provides the source code for the [PySpark for Preprocessing BigQuery Data Codelab](https://codelabs.developers.google.com/codelabs/pyspark-bigquery/index.html), which demonstrates using PySpark on Cloud Dataproc to process data from BigQuery. 9 | * `codelabs/spark-nlp` provides the source code for the [PySpark for Natural Language Processing Codelab](https://codelabs.developers.google.com/codelabs/spark-nlp/index.html), which demonstrates using the [spark-nlp](https://github.com/JohnSnowLabs/spark-nlp) library for Natural Language Processing. 10 | * `notebooks/ai-ml/` provides source code for Spark for AI/ML use cases, including a [PyTorch](https://pytorch.org/) sample for image classification. 11 | * `notebooks/python` provides example Jupyter notebooks that demonstrate using PySpark with the [BigQuery Storage Connector](https://github.com/GoogleCloudPlatform/spark-bigquery-connector) and the [Spark GCS Connector](https://github.com/GoogleCloudPlatform/bigdata-interop/tree/master/gcs). 12 | * `spark-tensorflow` provides an example of using Spark as a preprocessing toolchain for TensorFlow jobs. Optionally, 13 | it demonstrates the [spark-tensorflow-connector](https://github.com/tensorflow/ecosystem/tree/master/spark/spark-tensorflow-connector) to convert CSV files to TFRecords. 14 | * `spark-translate` provides a simple demo Spark application that translates words using Google's Translation API and runs on Cloud Dataproc. 15 | * `gcloud` provides a set of scripts to provision Dataproc clusters for exercising arbitrary initialization actions. 16 | 17 | See each directory's README for more information.
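For example, a PySpark sample such as `codelabs/spark-bigquery/backfill.py` can be submitted to an existing Dataproc cluster through the Dataproc Jobs API, as its companion `backfill.sh` does; the cluster name, bucket name, and year/month arguments below are placeholders:

```bash
# Submit a PySpark sample to an existing cluster (names are placeholders).
# backfill.py expects a year, a month, and an output bucket as job arguments.
gcloud dataproc jobs submit pyspark \
    --cluster "${CLUSTER_NAME}" \
    --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \
    backfill.py \
    -- 2019 01 "${BUCKET_NAME}"
```

Depending on your gcloud configuration, a `--region` flag may also be required.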
18 | 19 | 20 | ## Additional Dataproc Repositories 21 | 22 | You can find more Dataproc resources in these github repositories: 23 | 24 | ### Dataproc projects 25 | * [Dataproc initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) 26 | * [GCP Token Broker](https://github.com/GoogleCloudPlatform/gcp-token-broker) 27 | * [Dataproc Custom Images](https://github.com/GoogleCloudDataproc/custom-images) 28 | * [JupyterHub Dataproc Spawner](https://github.com/GoogleCloudDataproc/jupyterhub-dataprocspawner) 29 | 30 | ### Connectors 31 | * [Hadoop/Spark GCS Connector](https://github.com/GoogleCloudDataproc/hadoop-connectors/tree/master/gcs) 32 | * [Spark BigTable Connector](https://github.com/GoogleCloudDataproc/spark-bigtable-connector) 33 | * [Spark BigQuery Connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector) 34 | * [Flink BigQuery Connector](https://github.com/GoogleCloudDataproc/flink-bigquery-connector) 35 | * [Spark Spanner Connector](https://github.com/GoogleCloudDataproc/spark-spanner-connector) 36 | * [Hive BigQuery Connector](https://github.com/GoogleCloudDataproc/hive-bigquery-connector) 37 | * [Hive Bigquery Storage Handler](https://github.com/GoogleCloudDataproc/hive-bigquery-storage-handler) [No Longer Maintained] 38 | * [Dataproc JDBC Connector](https://github.com/GoogleCloudDataproc/dataproc-jdbc-connector) 39 | 40 | ### Examples 41 | * [Dataproc Python examples](https://github.com/GoogleCloudPlatform/python-docs-samples/tree/main/dataproc/snippets) 42 | * [Dataproc Pubsub Spark Streaming example](https://github.com/GoogleCloudPlatform/dataproc-pubsub-spark-streaming) 43 | * [Dataproc Java Bigtable sample](https://github.com/GoogleCloudPlatform/cloud-bigtable-examples/tree/main/java/dataproc-wordcount) 44 | * [Dataproc Spark-Bigtable samples](https://github.com/GoogleCloudPlatform/java-docs-samples/tree/main/bigtable/spark) 45 | 46 | ## For more information 47 | For more information, review the [Dataproc 48 | documentation](https://cloud.google.com/dataproc/docs/). You can also 49 | pose questions to the [Stack 50 | Overflow](http://stackoverflow.com/questions/tagged/google-cloud-dataproc) community 51 | with the tag `google-cloud-dataproc`. 52 | See our other [Google Cloud Platform github 53 | repos](https://github.com/GoogleCloudPlatform) for sample applications and 54 | scaffolding for other frameworks and use cases. 
55 | 56 | ## Contributing changes 57 | 58 | * See [CONTRIBUTING.md](CONTRIBUTING.md) 59 | 60 | ## Licensing 61 | 62 | * See [LICENSE](LICENSE) 63 | -------------------------------------------------------------------------------- /benchmarks/benchmarkUtil.py: -------------------------------------------------------------------------------- 1 | import json 2 | import yaml 3 | import subprocess 4 | import os 5 | import re 6 | 7 | 8 | class Scenario: 9 | def __init__(self, name, config_file_path): 10 | self.name = name 11 | self.config_file_name = config_file_path 12 | 13 | 14 | def execute_shell(cmd): 15 | print("Running command {}".format(cmd)) 16 | p = subprocess.Popen( 17 | cmd, 18 | shell=True, 19 | stdout=subprocess.PIPE, 20 | stderr=subprocess.PIPE 21 | ) 22 | stdout, stderr = p.communicate() 23 | return_code = p.returncode 24 | return stdout, stderr, return_code 25 | 26 | 27 | class Benchmark: 28 | clusterName = "" 29 | cluster_template_path = "cfg.yaml.tmpl" 30 | scenario_file_name = "testing_scenarios.yaml.tmpl" 31 | scenarios = [] 32 | 33 | def run_workflow(self, scenario): 34 | command = "gcloud dataproc workflow-templates instantiate-from-file --file {} --format json".format( 35 | scenario.config_file_name) 36 | stdout, stderr, return_code = execute_shell(command) 37 | if return_code != 0: 38 | print("Workflow execution failed - Error code is {}".format(return_code)) 39 | print("STDOUT:\n {}\n".format(stdout)) 40 | print("STDERR:\n {}\n".format(stderr)) 41 | yaml_config = yaml.safe_load(open(scenario.config_file_name, 'r')) 42 | prefix = yaml_config['placement']['managedCluster']['clusterName'] 43 | pattern = re.compile(r"{}-.*.".format(prefix)) 44 | stderr = stderr.split() 45 | for element in stderr: 46 | if pattern.findall(str(element)): 47 | self.clusterName = re.compile(r"{}-.*.".format(prefix)).findall(str(element))[0][:-2] 48 | break 49 | else: 50 | data = json.loads(stdout.decode('utf-8')) 51 | self.clusterName = data['metadata']['clusterName'] 52 | 53 | def upload_config_to_gs(self, scenario): 54 | yaml_config = yaml.safe_load(open(scenario.config_file_name, 'r')) 55 | experiment_name = yaml_config['jobs'][0]['pysparkJob']['args'][0] 56 | scenario_destination_bucket_path = "{}/{}/{}/cfg.yaml".format( 57 | experiment_name, 58 | scenario.name, 59 | self.clusterName) 60 | command = "gsutil cp {} {} ".format(scenario.config_file_name, 61 | scenario_destination_bucket_path) 62 | execute_shell(command) 63 | 64 | print('File {} uploaded to {}. as cfg.yml'.format( 65 | scenario.config_file_name, 66 | scenario_destination_bucket_path)) 67 | 68 | def set_config_template_file(self, tmpl): 69 | self.cluster_template_path = tmpl 70 | 71 | def set_scenarios_file(self, scenarios): 72 | self.scenario_file_name = scenarios 73 | 74 | def read_scenarios_yaml(self): 75 | with open(self.scenario_file_name, 'r') as stream: 76 | return yaml.safe_load(stream) 77 | 78 | def read_template_yaml(self): 79 | with open(self.cluster_template_path, 'r') as stream: 80 | return yaml.safe_load(stream) 81 | 82 | def write_scenarios_yaml(self, data, scenario, scenario_file): 83 | with open(scenario_file, 'w') as stream: 84 | yaml.dump(data, stream, default_flow_style=False) 85 | 86 | def merge_dicts(self, original, override): 87 | """ 88 | Recursively overrides items in original based on keys/values from override. 89 | Input dictionaries must have common scheme. 
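Example (illustrative values): merge_dicts({'a': 1, 'b': {'c': 2}}, {'b': {'c': 3}}) updates the first dict in place to {'a': 1, 'b': {'c': 3}}.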
90 | """ 91 | for override_key, override_item in override.items(): 92 | if override_key in original: 93 | if isinstance(override_item, dict): 94 | self.merge_dicts(original[override_key], override_item) 95 | elif isinstance(override_item, list): 96 | original_list = original[override_key] 97 | if original_list is None: 98 | original[override_key] = override_item 99 | else: 100 | for idx, item in enumerate(override_item): 101 | if isinstance(item, dict): 102 | self.merge_dicts(original_list[idx], item) 103 | else: 104 | original_list[idx] = item 105 | else: 106 | original[override_key] = override_item 107 | else: 108 | original[override_key] = override_item 109 | 110 | def merge_configs(self): 111 | """Apply config from scenario to template.""" 112 | scenarios = self.read_scenarios_yaml() 113 | for scenario in scenarios: 114 | base_config = self.read_template_yaml() 115 | # Iterate over scenarios_file 116 | scenario_dict = scenarios[scenario] 117 | self.merge_dicts(base_config, scenario_dict) 118 | scenario_file = "{}/{}-{}".format("/tmp",scenario, "cfg.yaml") 119 | if os.path.exists(scenario_file): 120 | print(scenario_file + " already exists will be overwritten.") 121 | self.write_scenarios_yaml(base_config, scenario, scenario_file) 122 | self.scenarios.append(Scenario(scenario, scenario_file)) 123 | -------------------------------------------------------------------------------- /benchmarks/bigbench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This init action installs bigbench benchmark tool on a dataproc cluster 4 | 5 | set -euxo pipefail 6 | 7 | function install_components(){ 8 | apt-get install -y python-pip 9 | pip install --upgrade google-cloud-storage 10 | apt-get install -y pssh 11 | } 12 | 13 | function install_bigbench(){ 14 | local benchmark_engine 15 | benchmark_engine="$(/usr/share/google/get_metadata_value attributes/benchmark_engine || true)" 16 | git clone https://github.com/takeon8/Big-Data-Benchmark-for-Big-Bench 17 | cd Big-Data-Benchmark-for-Big-Bench 18 | if [ "${benchmark_engine}" == 'spark_sql' ] ; then 19 | git checkout spark-sql 20 | 21 | if ! grep -q '^spark\.sql\.warehouse\.dir=' '/etc/spark/conf/spark-defaults.conf'; then 22 | echo 'spark.sql.warehouse.dir=/root/spark-warehouse' >> '/etc/spark/conf/spark-defaults.conf' 23 | fi 24 | else 25 | git checkout spark2 26 | fi 27 | 28 | if [[ "${dataproc_version}" == 'dataproc-1-3' || "${dataproc_version}" == 'dataproc-1-4' ]] ; then 29 | # On Dataproc 1.3 and 1.4 we need to provide updated bigbench-ml-spark-2x.jar 30 | git cherry-pick d543e9a748a6a17524c7420726f3dbb6bc19c108 31 | fi 32 | } 33 | 34 | function configure_bigbench(){ 35 | local old_hadoop_conf='export BIG_BENCH_HADOOP_CONF="/etc/hadoop/conf.cloudera.hdfs"' 36 | local new_hadoop_conf='export BIG_BENCH_HADOOP_CONF="/etc/hadoop/conf"' 37 | local old_libs='export BIG_BENCH_HADOOP_LIBS_NATIVE="/opt/cloudera/parcels/CDH/lib/hadoop/lib/native"' 38 | local new_libs='export BIG_BENCH_HADOOP_LIBS_NATIVE=...' 
39 | 40 | sed -i "s#${old_hadoop_conf}#${new_hadoop_conf}#" conf/userSettings.conf 41 | sed -i "s#${old_libs}#${new_libs}#" conf/userSettings.conf 42 | sed -i "s/export BIG_BENCH_PSSH_BINARY=\"pssh\"/export BIG_BENCH_PSSH_BINARY=\"parallel-ssh\"/g" \ 43 | conf/userSettings.conf 44 | 45 | echo 'IS_EULA_ACCEPTED=true' >> \ 46 | data-generator/Constants.properties 47 | echo 'export BIG_BENCH_ENGINE_HIVE_ML_FRAMEWORK_SPARK_BINARY="spark-submit --deploy-mode cluster --master yarn"' >> \ 48 | engines/hive/conf/engineSettings.conf 49 | } 50 | 51 | function set_system_properties(){ 52 | local file_handle_limit 53 | file_handle_limit="$(/usr/share/google/get_metadata_value attributes/file_handle_limit || true)" 54 | local swap_size 55 | swap_size="$(/usr/share/google/get_metadata_value attributes/swap_size || true)" 56 | 57 | if [ "${file_handle_limit}" != '' ] ; then 58 | echo "${file_handle_limit}" > /proc/sys/fs/file-max 59 | fi 60 | 61 | if [ "${swap_size}" != '' ] ; then 62 | fallocate -l "${swap_size}" /swapfile 63 | chmod 600 /swapfile 64 | mkswap /swapfile 65 | swapon /swapfile 66 | fi 67 | } 68 | 69 | function main(){ 70 | local role 71 | role="$(/usr/share/google/get_metadata_value attributes/dataproc-role)" 72 | local dataproc_version 73 | dataproc_version="$(/usr/share/google/get_metadata_value image | grep -o 'dataproc-[0-9]-[0-9]' )" 74 | 75 | if [[ "${role}" == 'Master' ]] ; then 76 | install_bigbench 77 | configure_bigbench 78 | install_components 79 | fi 80 | set_system_properties 81 | } 82 | 83 | main 84 | -------------------------------------------------------------------------------- /benchmarks/cfg.yaml.tmpl: -------------------------------------------------------------------------------- 1 | jobs: 2 | - pysparkJob: 3 | mainPythonFileUri: gs://dataproc-benchmarking/benchmarks/trigger_bigbench_benchmark.py 4 | args: 5 | stepId: bigbench 6 | placement: 7 | managedCluster: 8 | clusterName: benchmark 9 | config: 10 | gceClusterConfig: 11 | zoneUri: us-central1-f 12 | masterConfig: 13 | numInstances: 1 14 | machineTypeUri: n1-standard-8 15 | diskConfig: 16 | bootDiskType: pd-standard 17 | bootDiskSizeGb: 500 18 | workerConfig: 19 | numInstances: 2 20 | machineTypeUri: n1-standard-8 21 | diskConfig: 22 | bootDiskType: pd-standard 23 | bootDiskSizeGb: 500 24 | softwareConfig: 25 | imageVersion: "1.2.37" 26 | -------------------------------------------------------------------------------- /benchmarks/examples/bigbench-dataproc-1-3.yaml: -------------------------------------------------------------------------------- 1 | jobs: 2 | - pysparkJob: 3 | mainPythonFileUri: gs://dataproc-benchmarking/benchmarks/trigger_bigbench_benchmark.py 4 | args: 5 | - gs://dataproc-1-3-hive 6 | - scenario_1 7 | - -f 1 8 | stepId: a111 9 | placement: 10 | managedCluster: 11 | clusterName: bigbench 12 | config: 13 | gceClusterConfig: 14 | zoneUri: us-central1-f 15 | masterConfig: 16 | numInstances: 1 17 | machineTypeUri: custom-6-30720 18 | diskConfig: 19 | bootDiskType: pd-standard 20 | bootDiskSizeGb: 300 21 | workerConfig: 22 | numInstances: 5 23 | machineTypeUri: custom-32-102400 24 | diskConfig: 25 | bootDiskType: pd-standard 26 | bootDiskSizeGb: 500 27 | numLocalSsds: 5 28 | softwareConfig: 29 | imageVersion: "1.3.2" 30 | initializationActions: 31 | - executableFile: gs://dataproc-benchmarking/init-actions/bigbench.sh 32 | -------------------------------------------------------------------------------- /benchmarks/examples/bigbench-hive-scenarios.yaml: 
-------------------------------------------------------------------------------- 1 | q01: 2 | jobs: 3 | - pysparkJob: 4 | args: 5 | - gs://dataproc-1-3-hive 6 | - q01 7 | - -f 1 -j 1 8 | 9 | q02: 10 | jobs: 11 | - pysparkJob: 12 | args: 13 | - gs://dataproc-1-3-hive 14 | - q02 15 | - -f 1 -j 2 16 | 17 | q03: 18 | jobs: 19 | - pysparkJob: 20 | args: 21 | - gs://dataproc-1-3-hive 22 | - q03 23 | - -f 1 -j 3 24 | 25 | q04: 26 | jobs: 27 | - pysparkJob: 28 | args: 29 | - gs://dataproc-1-3-hive 30 | - q04 31 | - -f 1 -j 4 32 | 33 | q05: 34 | jobs: 35 | - pysparkJob: 36 | args: 37 | - gs://dataproc-1-3-hive 38 | - q05 39 | - -f 1 -j 5 40 | 41 | q06: 42 | jobs: 43 | - pysparkJob: 44 | args: 45 | - gs://dataproc-1-3-hive 46 | - q06 47 | - -f 1 -j 6 48 | 49 | q07: 50 | jobs: 51 | - pysparkJob: 52 | args: 53 | - gs://dataproc-1-3-hive 54 | - q07 55 | - -f 1 -j 7 56 | 57 | q08: 58 | jobs: 59 | - pysparkJob: 60 | args: 61 | - gs://dataproc-1-3-hive 62 | - q08 63 | - -f 1 -j 8 64 | 65 | q09: 66 | jobs: 67 | - pysparkJob: 68 | args: 69 | - gs://dataproc-1-3-hive 70 | - q09 71 | - -f 1 -j 9 72 | 73 | q10: 74 | jobs: 75 | - pysparkJob: 76 | args: 77 | - gs://dataproc-1-3-hive 78 | - q10 79 | - -f 1 -j 10 80 | 81 | q11: 82 | jobs: 83 | - pysparkJob: 84 | args: 85 | - gs://dataproc-1-3-hive 86 | - q11 87 | - -f 1 -j 11 88 | 89 | q12: 90 | jobs: 91 | - pysparkJob: 92 | args: 93 | - gs://dataproc-1-3-hive 94 | - q12 95 | - -f 1 -j 12 96 | 97 | q13: 98 | jobs: 99 | - pysparkJob: 100 | args: 101 | - gs://dataproc-1-3-hive 102 | - q13 103 | - -f 1 -j 13 104 | 105 | q14: 106 | jobs: 107 | - pysparkJob: 108 | args: 109 | - gs://dataproc-1-3-hive 110 | - q14 111 | - -f 1 -j 14 112 | 113 | q15: 114 | jobs: 115 | - pysparkJob: 116 | args: 117 | - gs://dataproc-1-3-hive 118 | - q15 119 | - -f 1 -j 15 120 | 121 | q16: 122 | jobs: 123 | - pysparkJob: 124 | args: 125 | - gs://dataproc-1-3-hive 126 | - q16 127 | - -f 1 -j 16 128 | 129 | q17: 130 | jobs: 131 | - pysparkJob: 132 | args: 133 | - gs://dataproc-1-3-hive 134 | - q17 135 | - -f 1 -j 17 136 | 137 | q18: 138 | jobs: 139 | - pysparkJob: 140 | args: 141 | - gs://dataproc-1-3-hive 142 | - q18 143 | - -f 1 -j 18 144 | 145 | q19: 146 | jobs: 147 | - pysparkJob: 148 | args: 149 | - gs://dataproc-1-3-hive 150 | - q19 151 | - -f 1 -j 19 152 | 153 | q20: 154 | jobs: 155 | - pysparkJob: 156 | args: 157 | - gs://dataproc-1-3-hive 158 | - q20 159 | - -f 1 -j 20 160 | 161 | q21: 162 | jobs: 163 | - pysparkJob: 164 | args: 165 | - gs://dataproc-1-3-hive 166 | - q21 167 | - -f 1 -j 21 168 | 169 | q22: 170 | jobs: 171 | - pysparkJob: 172 | args: 173 | - gs://dataproc-1-3-hive 174 | - q22 175 | - -f 1 -j 22 176 | 177 | q23: 178 | jobs: 179 | - pysparkJob: 180 | args: 181 | - gs://dataproc-1-3-hive 182 | - q23 183 | - -f 1 -j 23 184 | 185 | q24: 186 | jobs: 187 | - pysparkJob: 188 | args: 189 | - gs://dataproc-1-3-hive 190 | - q24 191 | - -f 1 -j 24 192 | 193 | q25: 194 | jobs: 195 | - pysparkJob: 196 | args: 197 | - gs://dataproc-1-3-hive 198 | - q25 199 | - -f 1 -j 25 200 | 201 | q26: 202 | jobs: 203 | - pysparkJob: 204 | args: 205 | - gs://dataproc-1-3-hive 206 | - q26 207 | - -f 1 -j 26 208 | 209 | q27: 210 | jobs: 211 | - pysparkJob: 212 | args: 213 | - gs://dataproc-1-3-hive 214 | - q27 215 | - -f 1 -j 27 216 | 217 | q28: 218 | jobs: 219 | - pysparkJob: 220 | args: 221 | - gs://dataproc-1-3-hive 222 | - q28 223 | - -f 1 -j 28 224 | 225 | q29: 226 | jobs: 227 | - pysparkJob: 228 | args: 229 | - gs://dataproc-1-3-hive 230 | - q29 231 | - -f 1 -j 29 232 | 233 | q30: 234 | jobs: 
235 | - pysparkJob: 236 | args: 237 | - gs://dataproc-1-3-hive 238 | - q30 239 | - -f 1 -j 30 240 | -------------------------------------------------------------------------------- /benchmarks/examples/hibench-dataproc-1-2-scenarios.yaml: -------------------------------------------------------------------------------- 1 | tiny: 2 | jobs: 3 | pysparkJob: 4 | mainPythonFileUri: gs://dataproc-benchmarking/benchmarks/trigger_hibench_benchmark.py 5 | args: 6 | - gs://dataproc-1-2-hibench 7 | - tiny_scale_factor_scenario 8 | - tiny 9 | placement: 10 | managedCluster: 11 | config: 12 | softwareConfig: 13 | imageVersion: "1.2.45" 14 | initializationActions: 15 | executableFile: gs://dataproc-benchmarking/init-actions/hibench.sh 16 | -------------------------------------------------------------------------------- /benchmarks/hibench.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | function err() { 4 | echo "[$(date +'%Y-%m-%dT%H:%M:%S%z')]: $@" >&2 5 | return 1 6 | } 7 | 8 | function install_dependencies(){ 9 | apt-get -y install bc || err 'Unable to install bc' 10 | git clone https://github.com/takeon8/HiBench 11 | wget -qO- http://archive.apache.org/dist/maven/maven-3/3.5.4/binaries/apache-maven-3.5.4-bin.tar.gz \ 12 | | tar zxv 13 | export M2_HOME=/apache-maven-3.5.4 14 | echo "PATH=/apache-maven-3.5.4/bin:$PATH" >> /etc/profile 15 | update-alternatives --install "/usr/bin/mvn" "mvn" "/apache-maven-3.5.4/bin/mvn" 0 16 | update-alternatives --set mvn /apache-maven-3.5.3/bin/mvn 17 | echo 'export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64' >> /etc/hadoop/conf/hadoop-env.sh 18 | } 19 | 20 | function configure_benchmark(){ 21 | local namenode=$(bdconfig get_property_value \ 22 | --configuration_file '/etc/hadoop/conf/core-site.xml' \ 23 | --name 'fs.default.name') 24 | local cluster_name=$(echo "${namenode}" | replace 'hdfs://' '') 25 | local num_masters=$(/usr/share/google/get_metadata_value attributes/dataproc-master \ 26 | | grep -o "${cluster_name%%-*}" \ 27 | | wc -l) 28 | local num_masters_additional=$(/usr/share/google/get_metadata_value \ 29 | attributes/dataproc-master-additional \ 30 | | grep -o "${cluster_name%%-*}" \ 31 | | wc -l) 32 | local master_cpus=$(nproc -all) 33 | local num_workers=$(/usr/share/google/get_metadata_value attributes/dataproc-worker-count) 34 | local name=$(/usr/share/google/get_metadata_value name) 35 | local workers_list=$(yarn node -list -all | grep "${name%%-*}-w-0") 36 | local worker_id=$(echo "${workers_list%%:*}") 37 | local worker_cpus=$(yarn node -list -showDetails ${worker_id} \ 38 | | grep -m1 vCores \ 39 | | sed -e 's/vCores:\(.*\)>/\1/' \ 40 | | sed 's/.*,//' \ 41 | | tr -d '[:space:]') 42 | sed -i '/micro.sleep/d' ./HiBench/conf/benchmarks.lst 43 | cat << 'EOF' > ./HiBench/conf/hadoop.conf 44 | # Hadoop home 45 | hibench.hadoop.home /usr/lib/hadoop 46 | 47 | # The path of hadoop executable 48 | hibench.hadoop.executable ${hibench.hadoop.home}/bin/hadoop 49 | 50 | # Hadoop configraution directory 51 | hibench.hadoop.configure.dir ${hibench.hadoop.home}/etc/hadoop 52 | 53 | # Hadoop release provider. 
Supported value: apache, cdh5, hdp 54 | hibench.hadoop.release apache 55 | 56 | hibench.hadoop.examples.test.jar /usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar 57 | EOF 58 | echo "# The root HDFS path to store HiBench data" >> ./HiBench/conf/hadoop.conf 59 | echo "hibench.hdfs.master ${namenode}:8020" >> ./HiBench/conf/hadoop.conf 60 | cat << 'EOF' > ./HiBench/conf/spark.conf 61 | # Spark home 62 | hibench.spark.home /usr/lib/spark 63 | 64 | # Spark master 65 | # standalone mode: spark://xxx:7077 66 | # YARN mode: yarn-client 67 | hibench.spark.master yarn-client 68 | EOF 69 | yarn_executors_num=$(python -c \ 70 | "print (${num_masters} + ${num_masters_additional}) * ${master_cpus} + ${num_workers} * ${worker_cpus}") 71 | echo -e "export YARN_NUM_EXECUTORS=${yarn_executors_num}\n$(cat /HiBench/bin/functions/workload_functions.sh)" \ 72 | > /HiBench/bin/functions/workload_functions.sh 73 | } 74 | 75 | function set_system_properties(){ 76 | local file_handle_limit 77 | file_handle_limit="$(/usr/share/google/get_metadata_value attributes/file_handle_limit || true)" 78 | local swap_size 79 | swap_size="$(/usr/share/google/get_metadata_value attributes/swap_size || true)" 80 | 81 | if [ "${file_handle_limit}" != '' ] ; then 82 | echo "${file_handle_limit}" > /proc/sys/fs/file-max 83 | fi 84 | 85 | if [ "${swap_size}" != '' ] ; then 86 | fallocate -l "${swap_size}" /swapfile 87 | chmod 600 /swapfile 88 | mkswap /swapfile 89 | swapon /swapfile 90 | fi 91 | } 92 | 93 | 94 | function main() { 95 | local role=$(/usr/share/google/get_metadata_value attributes/dataproc-role) 96 | local master=$(/usr/share/google/get_metadata_value attributes/dataproc-master) 97 | if [[ "${role}" == 'Master' ]]; then 98 | # Only run on the master node 99 | update_apt_get || err 'Unable to update apt-get' 100 | install_dependencies || err 'Installing dependencies for HiBench failed' 101 | configure_benchmark || err 'Configuration failed' 102 | bash ./HiBench/bin/build_all.sh || echo "Retry build step" && \ 103 | bash ./HiBench/bin/build_all.sh \ 104 | || err "Build step failed" 105 | fi 106 | set_system_properties 107 | } 108 | 109 | main 110 | 111 | -------------------------------------------------------------------------------- /benchmarks/initial_benchmark/cfg-1.yaml: -------------------------------------------------------------------------------- 1 | jobs: 2 | pysparkJob: 3 | mainPythonFileUri: gs://dataproc-benchmarking/benchmarks/trigger_bigbench_benchmark.py 4 | args: 5 | - gs://bucket-name/bigbench 6 | - scenario_1 7 | - -f 1 8 | stepId: a111 9 | id: test 10 | name: "projects/gcp-project-name/regions/global/workflowTemplates/id" 11 | placement: 12 | managedCluster: 13 | clusterName: example 14 | config: 15 | gceClusterConfig: 16 | zoneUri: us-central1-f 17 | masterConfig: 18 | numInstances: 1 19 | machineTypeUri: custom-6-30720 20 | diskConfig: 21 | bootDiskType: pd-standard 22 | bootDiskSizeGb: 300 23 | imageUri: 24 | workerConfig: 25 | numInstances: 10 26 | machineTypeUri: custom-32-51200 27 | diskConfig: 28 | bootDiskType: pd-standard 29 | bootDiskSizeGb: 500 30 | numLocalSsds: 5 31 | imageUri: 32 | softwareConfig: 33 | imageVersion: "1.2.37" 34 | initializationActions: 35 | executableFile: gs://dataproc-benchmarking/init-actions/bigbench.sh 36 | -------------------------------------------------------------------------------- /benchmarks/initial_benchmark/cpu-testing-scenarios.yaml: -------------------------------------------------------------------------------- 1 | # Memory size for a 4 vCPU 
instance must be between 3840MiB and 26624MiB, while 30720MiB was provided. 2 | # Test below will fail 3 | #scenario_cpu_4: 4 | # jobs: 5 | # pysparkJob: 6 | # args: 7 | # - gs://bucket-name/bigbench 8 | # - scenario_cpu_4 9 | # - -f 1 10 | # placement: 11 | # managedCluster: 12 | # config: 13 | # masterConfig: 14 | # machineTypeUri: custom-4-30720 15 | # workerConfig: 16 | # machineTypeUri: custom-4-51200 17 | # 18 | 19 | scenario_cpu_8: 20 | jobs: 21 | pysparkJob: 22 | args: 23 | - gs://bucket-name/bigbench 24 | - scenario_cpu_8 25 | - -f 1 26 | placement: 27 | managedCluster: 28 | config: 29 | masterConfig: 30 | machineTypeUri: custom-8-30720 31 | workerConfig: 32 | machineTypeUri: custom-8-51200 33 | 34 | scenario_cpu_16: 35 | jobs: 36 | pysparkJob: 37 | args: 38 | - gs://bucket-name/bigbench 39 | - scenario_cpu_16 40 | - -f 1 41 | placement: 42 | managedCluster: 43 | config: 44 | masterConfig: 45 | machineTypeUri: custom-16-30720 46 | workerConfig: 47 | machineTypeUri: custom-16-51200 48 | 49 | scenario_cpu_32: 50 | jobs: 51 | pysparkJob: 52 | args: 53 | - gs://bucket-name/bigbench 54 | - scenario_cpu_16 55 | - -f 1 56 | placement: 57 | managedCluster: 58 | config: 59 | masterConfig: 60 | machineTypeUri: custom-32-30720 61 | workerConfig: 62 | machineTypeUri: custom-32-51200 63 | 64 | # Memory size for a 64 vCPU instance must be between 58880MiB and 425984MiB 65 | # Test below will fail 66 | #scenario_cpu_64: 67 | # jobs: 68 | # pysparkJob: 69 | # args: 70 | # - gs://bucket-name/bigbench 71 | # - scenario_cpu_64 72 | # - -f 1 73 | # placement: 74 | # managedCluster: 75 | # config: 76 | # masterConfig: 77 | # machineTypeUri: custom-64-30720 78 | # workerConfig: 79 | # machineTypeUri: custom-64-51200 -------------------------------------------------------------------------------- /benchmarks/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-storage>=1.10.0 2 | pyspark>=2.3.2 3 | pyyaml>=4.2b1 4 | mock>=2.0.0 5 | -------------------------------------------------------------------------------- /benchmarks/runPerformanceTesting.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3.6 2 | 3 | ''' 4 | This script triggers performance testing 5 | ''' 6 | 7 | import argparse 8 | import threading 9 | from benchmarkUtil import Benchmark 10 | 11 | 12 | def trigger_workflow(scenario, benchmark): 13 | benchmark.run_workflow(scenario) 14 | print("Scenario {} has finished".format(scenario.name)) 15 | benchmark.upload_config_to_gs(scenario) 16 | 17 | 18 | if __name__ == '__main__': 19 | PARSER = argparse.ArgumentParser() 20 | PARSER.add_argument("--scenarios", action="store", 21 | help="File contains scenarios definition \ 22 | for running Bigbench/Highbench tests \ 23 | on various configurations") 24 | PARSER.add_argument("--template", action="store", 25 | help="File that contains a default config template file \ 26 | that would be overriden by scenarios") 27 | PARSER.add_argument("--dry_run", action="store_true", 28 | help="If enabled it allows for config files preparation \ 29 | without workflow execution") 30 | PARSER.add_argument("--parallel", action="store_true", 31 | help="If enabled all scenarios will be executed \ 32 | in parallel mode") 33 | 34 | ARGS = PARSER.parse_args() 35 | BENCHMARK = Benchmark() 36 | if ARGS.scenarios: 37 | BENCHMARK.set_scenarios_file(ARGS.scenarios) 38 | if ARGS.template: 39 | BENCHMARK.set_config_template_file(ARGS.template) 40 | 
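# merge_configs() overlays each scenario from the scenarios file onto the base
# template and writes a per-scenario config to /tmp/<scenario>-cfg.yaml; each
# workflow is then instantiated from that file with
# "gcloud dataproc workflow-templates instantiate-from-file" (see benchmarkUtil.py).
# Illustrative invocation, using the default file names from benchmarkUtil.py:
#   python3 runPerformanceTesting.py --scenarios testing_scenarios.yaml.tmpl \
#       --template cfg.yaml.tmpl --dry_run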
BENCHMARK.merge_configs() 41 | if not ARGS.dry_run: 42 | print("---Starting performance testing---") 43 | for scenario in BENCHMARK.scenarios: 44 | print("Starting performance testing for {} using {} configuration file" 45 | .format(scenario.name, scenario.config_file_name)) 46 | if ARGS.parallel: 47 | threading.Thread(target=trigger_workflow, args=(scenario, BENCHMARK)).start() 48 | else: 49 | trigger_workflow(scenario, BENCHMARK) 50 | -------------------------------------------------------------------------------- /benchmarks/testing_scenarios.yaml.tmpl: -------------------------------------------------------------------------------- 1 | scenario_1: 2 | jobs: 3 | - pysparkJob: 4 | args: 5 | - gs://bucket-name/bigbench 6 | - scenario_1 7 | - -f 1 8 | placement: 9 | managedCluster: 10 | config: 11 | gceClusterConfig: 12 | metadata: 13 | file_handle_limit: "10000000" 14 | swap_size: 5G 15 | workerConfig: 16 | numInstances: 10 17 | initializationActions: 18 | executableFile: gs://dataproc-benchmarking/init-actions/bigbench.sh 19 | 20 | scenario_2: 21 | jobs: 22 | - pysparkJob: 23 | args: 24 | - gs://bucket-name/bigbench 25 | - scenario_2 26 | - -f 1 27 | placement: 28 | managedCluster: 29 | config: 30 | workerConfig: 31 | numInstances: 2 32 | softwareConfig: 33 | properties: 34 | mapred:mapreduce.map.cpu.vcores: '1' 35 | yarn:yarn.app.mapreduce.am.resource.mb: '2048' 36 | initializationActions: 37 | executableFile: gs://dataproc-benchmarking/init-actions/bigbench.sh 38 | -------------------------------------------------------------------------------- /benchmarks/trigger_bigbench_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script defines spark-job steps for bigbench performance testing 3 | """ 4 | 5 | import os 6 | import sys 7 | import subprocess 8 | from pyspark import SparkConf, SparkContext 9 | 10 | # Configure the environment 11 | if 'SPARK_HOME' not in os.environ: 12 | os.environ['SPARK_HOME'] = '/usr/lib/spark' 13 | 14 | CONF = SparkConf().setAppName('pubmed_open_access').setMaster('local[32]') 15 | SC = SparkContext(conf=CONF) 16 | 17 | 18 | def execute_shell(cmd): 19 | """ 20 | :param cmd: 21 | :return: stdout, err, return_code 22 | """ 23 | popen_instance = subprocess.Popen( 24 | cmd, 25 | shell=True, 26 | stdout=subprocess.PIPE, 27 | stderr=subprocess.PIPE 28 | ) 29 | stdout, err = popen_instance.communicate() 30 | return_code = popen_instance.returncode 31 | return stdout, err, return_code 32 | 33 | 34 | def run_benchmark(arg): 35 | """ 36 | This method trigger benchmark execution 37 | """ 38 | command = 'bash /Big-Data-Benchmark-for-Big-Bench/bin/bigBench runBenchmark {}'.format(arg) 39 | execute_shell(command) 40 | 41 | 42 | def upload_results(): 43 | """ 44 | This method upload results to the bucket 45 | """ 46 | cluster_name, err, return_code = execute_shell("/usr/share/google/get_metadata_value \ 47 | attributes/dataproc-cluster-name") 48 | for file in os.listdir('/Big-Data-Benchmark-for-Big-Bench/logs/'): 49 | if file.__contains__(".csv") or file.__contains__(".zip"): 50 | output_path = "{}/{}/{}/".format(sys.argv[1], sys.argv[2], cluster_name) 51 | command = "gsutil cp /Big-Data-Benchmark-for-Big-Bench/logs/{} {}{}" \ 52 | .format(file, output_path, file) 53 | execute_shell(command) 54 | 55 | 56 | def main(): 57 | """ 58 | This method execute required benchmark actions and uploads results to the bucket 59 | """ 60 | if len(sys.argv) > 3: 61 | if sys.argv[3] == "non-benchmark": 62 | print("Running 
non-benchmark option. \ 63 | Empty times.csv file should be uploaded to specified bucket") 64 | command = "mkdir -p /Big-Data-Benchmark-for-Big-Bench/logs/ \ 65 | && touch /Big-Data-Benchmark-for-Big-Bench/logs/sample.csv" 66 | execute_shell(command) 67 | else: 68 | print('benchmark with optional argument {}'.format(sys.argv[3])) 69 | run_benchmark(sys.argv[3]) 70 | else: 71 | print("Running benchmark testing for scenario named {}".format(sys.argv[2])) 72 | run_benchmark(None) 73 | upload_results() 74 | 75 | 76 | if __name__ == '__main__': 77 | main() 78 | -------------------------------------------------------------------------------- /benchmarks/trigger_hibench_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script defines spark-job steps for hibench performance testing 3 | """ 4 | import os 5 | import sys 6 | import subprocess 7 | import fileinput 8 | from pyspark import SparkConf, SparkContext 9 | 10 | # Configure the environment 11 | if 'SPARK_HOME' not in os.environ: 12 | os.environ['SPARK_HOME'] = '/usr/lib/spark' 13 | 14 | CONF = SparkConf().setAppName('pubmed_open_access').setMaster('local[32]') 15 | SC = SparkContext(conf=CONF) 16 | 17 | 18 | def execute_shell(cmd): 19 | """ 20 | :param cmd: 21 | :return: stdout, err, return_code 22 | """ 23 | popen_instance = subprocess.Popen( 24 | cmd, 25 | shell=True, 26 | stdout=subprocess.PIPE, 27 | stderr=subprocess.PIPE 28 | ) 29 | stdout, err = popen_instance.communicate() 30 | return_code = popen_instance.returncode 31 | return stdout, err, return_code 32 | 33 | 34 | def run_benchmark(): 35 | """ 36 | This method trigger benchmark execution 37 | """ 38 | cmd = 'bash /HiBench/bin/run_all.sh' 39 | execute_shell(cmd) 40 | 41 | 42 | def upload_results(): 43 | """ 44 | This method upload results to the bucket 45 | """ 46 | cluster_name = os.popen("/usr/share/google/get_metadata_value \ 47 | attributes/dataproc-cluster-name") \ 48 | .read() 49 | output_path = "{}/{}/{}/hibench.report".format(sys.argv[1], sys.argv[2], cluster_name) 50 | cmd = "gsutil cp /HiBench/report/hibench.report {}".format(output_path) 51 | execute_shell(cmd) 52 | 53 | 54 | def set_hibench_scale_profile(profile): 55 | """ 56 | This method replace a default scale profile inside hibench.conf file and set given value 57 | Available value is tiny, small, large, huge, gigantic and bigdata. 58 | :param profile: scale factor 59 | """ 60 | for n, line in enumerate(fileinput.input('/HiBench/conf/hibench.conf', inplace=True), start=1): 61 | if 'hibench.scale.profile' in line: 62 | line = "hibench.scale.profile {}\n".format(profile) 63 | print(line) 64 | 65 | 66 | def main(): 67 | """ 68 | This method execute required benchmark actions and uploads results to the bucket 69 | """ 70 | if len(sys.argv) > 3: 71 | if sys.argv[3] == "non-benchmark": 72 | print("Running non-benchmark option. 
\ 73 | Empty hibench.report file should be uploaded to specified bucket") 74 | os.mkdir('/HiBench/report/') 75 | open('/HiBench/report/hibench.report', 'w+').close() 76 | else: 77 | print('benchmark with optional argument {}'.format(sys.argv[3])) 78 | set_hibench_scale_profile(sys.argv[3]) 79 | run_benchmark() 80 | else: 81 | print("Running benchmark testing for scenario named {}".format(sys.argv[2])) 82 | run_benchmark() 83 | upload_results() 84 | 85 | 86 | if __name__ == '__main__': 87 | main() 88 | -------------------------------------------------------------------------------- /benchmarks/unittests.py: -------------------------------------------------------------------------------- 1 | """ This module run unit tests """ 2 | import os 3 | import unittest 4 | import yaml 5 | from mock import patch 6 | from benchmarkUtil import Benchmark 7 | 8 | 9 | TESTFILE1="/tmp/scenario_1-cfg.yaml" 10 | TESTFILE2="/tmp/scenario_2-cfg.yaml" 11 | 12 | 13 | class VerifyBenchmarking(unittest.TestCase): 14 | """ This class wrap all unit tests in single scenario.""" 15 | 16 | def setUp(self): 17 | print("Test name:", self._testMethodName) 18 | 19 | def tearDown(self): 20 | if os.path.exists(TESTFILE1): 21 | os.remove(TESTFILE1) 22 | if os.path.exists(TESTFILE2): 23 | os.remove(TESTFILE2) 24 | 25 | def read_yaml(self, yaml_name): 26 | """Safely open, read and close any yaml file.""" 27 | with open(yaml_name, 'r') as stream: 28 | return yaml.safe_load(stream) 29 | 30 | def test_job_is_merged_with_good_arguments(self): 31 | """Testing passing PySpark job and its args.""" 32 | Benchmark.merge_configs(Benchmark()) 33 | result_config = self.read_yaml(TESTFILE1) 34 | job = result_config['jobs'][0]["pysparkJob"] 35 | self.assertEqual(job["mainPythonFileUri"], "gs://dataproc-benchmarking/benchmarks/trigger_bigbench_benchmark.py") 36 | self.assertEqual(len(job["args"]), 3) 37 | 38 | @patch('benchmarkUtil.Benchmark.read_scenarios_yaml', 39 | return_value={'scenario_1': 40 | {'placement': 41 | {'managedCluster': 42 | {'config': 43 | {'masterConfig': 44 | {'numInstances': 15}}}}}}) 45 | def test_yaml_master_number_passing(self, *args): 46 | """Testing passing custom number of instances to cluster settings""" 47 | Benchmark.merge_configs(Benchmark()) 48 | result_config = self.read_yaml(TESTFILE1) 49 | self.assertEqual(result_config['placement']['managedCluster']['config'] 50 | ['masterConfig']['numInstances'], 15) 51 | 52 | @patch('benchmarkUtil.Benchmark.read_scenarios_yaml', 53 | return_value={'scenario_2': 54 | {'placement': 55 | {'managedCluster': 56 | {'config': 57 | {'initializationActions': 58 | {'executableFile': 59 | 'gs://bucket-name/hibench.sh'}}}}}}) 60 | def test_yaml_init_action_passing(self, *args): 61 | """Testing passing init action to cluster settings""" 62 | Benchmark.merge_configs(Benchmark()) 63 | result_config = self.read_yaml(TESTFILE2) 64 | self.assertEqual(result_config['placement']['managedCluster']['config'] 65 | ['initializationActions']['executableFile'], 66 | "gs://bucket-name/hibench.sh") 67 | 68 | @patch('benchmarkUtil.Benchmark.read_scenarios_yaml', 69 | return_value={'scenario_2': 70 | {'placement': 71 | {'managedCluster': 72 | {'config': 73 | {'softwareConfig': 74 | {'properties': 75 | {'mapreduce.map.cpu.vcores': 8}}}}}}}) 76 | def test_software_prop_passing(self, *args): 77 | """Testing software properties passing""" 78 | Benchmark.merge_configs(Benchmark()) 79 | result_config = self.read_yaml(TESTFILE2) 80 | self.assertEqual(result_config['placement']['managedCluster']['config'] 81 | 
['softwareConfig']['properties']['mapreduce.map.cpu.vcores'], 8) 82 | 83 | 84 | if __name__ == '__main__': 85 | unittest.main() 86 | -------------------------------------------------------------------------------- /codelabs/opencv-haarcascade/.gitignore: -------------------------------------------------------------------------------- 1 | project/project 2 | project/target 3 | target 4 | -------------------------------------------------------------------------------- /codelabs/opencv-haarcascade/build.sbt: -------------------------------------------------------------------------------- 1 | lazy val root = (project in file(".")). 2 | settings( 3 | name := "feature_detector", 4 | version := "1.0", 5 | scalaVersion := "2.10.6" 6 | ) 7 | 8 | libraryDependencies += "org.apache.spark" %% "spark-core" % "1.6.2" % "provided" 9 | 10 | libraryDependencies += "org.bytedeco" % "javacv" % "1.2" 11 | 12 | libraryDependencies += "org.bytedeco.javacpp-presets" % "opencv" % "3.1.0-1.2" classifier "linux-x86_64" 13 | 14 | libraryDependencies += "org.bytedeco.javacpp-presets" % "opencv" % "3.1.0-1.2" 15 | 16 | classpathTypes += "maven-plugin" 17 | 18 | // EclipseKeys.withSource := true 19 | 20 | // EclipseKeys.withJavadoc := true 21 | -------------------------------------------------------------------------------- /codelabs/opencv-haarcascade/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 2 | -------------------------------------------------------------------------------- /codelabs/spark-bigquery/backfill.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This code accompanies this codelab: 16 | # https://codelabs.developers.google.com/codelabs/pyspark-bigquery/ 17 | 18 | # This script for backfills data loads from Reddit into Google Cloud Storage. 19 | 20 | # Python imports 21 | import re 22 | import time 23 | import sys 24 | 25 | # A Spark Session is how we interact with Spark SQL to create Dataframes 26 | from pyspark.sql import SparkSession 27 | 28 | # PySpark function for replacing characters using a regex. 29 | # Use this to remove newline characters. 30 | from pyspark.sql.functions import regexp_replace, col 31 | 32 | # Library for interacting with Google Cloud Storage 33 | from google.cloud import storage 34 | 35 | # This will help catch some PySpark errors 36 | from py4j.protocol import Py4JJavaError 37 | 38 | # Create a SparkSession under the name "reddit". 
Viewable via the Spark UI 39 | spark = SparkSession.builder.appName("reddit").getOrCreate() 40 | 41 | # Establish a set of years and months to iterate over 42 | year = sys.argv[1] 43 | month = sys.argv[2] 44 | bucket_name = sys.argv[3] 45 | 46 | # Establish a subreddit to process 47 | subreddit = 'food' 48 | 49 | # Set Google Cloud Storage temp location 50 | path = "tmp" + str(time.time()) 51 | 52 | # Keep track of all tables accessed via the job 53 | tables_read = [] 54 | 55 | # In the form of .. 56 | table = f"fh-bigquery.reddit_posts.{year}_{month}" 57 | 58 | # If the table doesn't exist simply continue and not 59 | # log it into our "tables_read" list 60 | try: 61 | df = spark.read.format('bigquery').option('table', table).load() 62 | except Py4JJavaError: 63 | print(f"{table} does not exist. ") 64 | sys.exit(0) 65 | 66 | print(f"Processing {table}.") 67 | 68 | # Select the "title", "selftext" and "created_utc" columns of the designated 69 | # subreddit and replace newline characters with a single space 70 | subreddit_timestamps = ( 71 | df 72 | .select( 73 | regexp_replace(col("title"), "\n", " "), 74 | regexp_replace(col("selftext"), "\n", " "), 75 | "created_utc" 76 | ) 77 | .where(df.subreddit == subreddit) 78 | ) 79 | 80 | tmp_output_path = "gs://" + bucket_name + "/" + path + "/" + year + "/" + month 81 | # Write output to a temp GCS bucket. Spark jobs can be written out to multiple 82 | # files and partitions. By using coalesce, you can ensure that the output is 83 | # consolidated to a single file. Use .options to tell Spark to write out in a 84 | # gzip format, and .csv to do the write. 85 | ( 86 | subreddit_timestamps 87 | # Data can get written out to multiple files / partition. 88 | # This ensures it will only write to 1. 89 | .coalesce(1) 90 | .write 91 | # Gzip the output file 92 | .options(codec="org.apache.hadoop.io.compress.GzipCodec") 93 | # Write out to csv 94 | .csv(tmp_output_path) 95 | ) 96 | # Lastly, move the temp file to a new bucket and delete the temp directory. 97 | regex = "part-[0-9a-zA-Z\-]*.csv.gz" 98 | new_path = "/".join(["reddit_posts", year, month, subreddit + ".csv.gz"]) 99 | 100 | # Create the storage client 101 | storage_client = storage.Client() 102 | 103 | # Create an object representing the original bucket 104 | source_bucket = storage_client.get_bucket(bucket_name) 105 | 106 | # Grab all files in the source bucket. Typically there is also a _SUCCESS file 107 | # inside of the directory, so make sure to find just a single csv file. 108 | blobs = list(source_bucket.list_blobs(prefix=path)) 109 | 110 | # Locate the file that represents our partition. Copy to new location. 111 | for blob in blobs: 112 | if re.search(regex, blob.name): 113 | source_bucket.copy_blob(blob, source_bucket, new_path) 114 | 115 | # Lastly, delete the temp directory. 116 | for blob in blobs: 117 | blob.delete() 118 | -------------------------------------------------------------------------------- /codelabs/spark-bigquery/backfill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2019 Google Inc. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script is associated with the codelab found at https://codelabs.developers.google.com/codelabs/pyspark-bigquery/. 18 | # This script kicks off a series of Cloud Dataproc PySpark Jobs. 19 | # This script must be provided with a Cloud Dataproc Cluster name and Bucket name in positions arguments 1 and 2 respectively. 20 | 21 | # Starting year and all months 22 | base_year=2017 23 | months=(01 02 03 04 05 06 07 08 09 10 11 12) 24 | 25 | # Grab list of existing BigQuery tables 26 | tables=$(bq ls fh-bigquery:reddit_posts) 27 | 28 | year=${base_year} 29 | warm_up=true 30 | 31 | # Set the name of the output bucket 32 | CLUSTER_NAME=${1} 33 | BUCKET_NAME=${2} 34 | 35 | # Iterate for every year / month pair starting from January 2016 up through the current year. 36 | while [[ ${year} -le $(date +%Y) ]] 37 | do 38 | for month in "${months[@]}" 39 | do 40 | # If the YYYY_MM table doesn't exist, we skip over it. 41 | exists="$(echo "${tables}" | grep " ${year}_${month} ")" 42 | if [ -z "${exists}" ]; then 43 | continue 44 | fi 45 | echo "${year}_${month}" 46 | 47 | # Submit a PySpark job via the Cloud Dataproc Jobs API 48 | gcloud dataproc jobs submit pyspark \ 49 | --cluster ${CLUSTER_NAME} \ 50 | --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar \ 51 | --driver-log-levels root=FATAL \ 52 | backfill.py \ 53 | -- ${year} ${month} ${BUCKET_NAME} & 54 | sleep 5 55 | 56 | if ${warm_up}; then 57 | sleep 10 58 | warm_up=false 59 | fi 60 | done 61 | ((year ++)) 62 | done 63 | 64 | wait 65 | -------------------------------------------------------------------------------- /codelabs/spark-bigquery/counts_by_subreddit.py: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This script accompanies this codelab: 16 | # https://codelabs.developers.google.com/codelabs/pyspark-bigquery/ 17 | 18 | # This script outputs subreddits counts for a given set of years and month 19 | # This data comes from BigQuery via the dataset "fh-bigquery.reddit_comments" 20 | 21 | # These allow us to create a schema for our data 22 | from pyspark.sql.types import StructField, StructType, StringType, LongType 23 | 24 | # A Spark Session is how we interact with Spark SQL to create Dataframes 25 | from pyspark.sql import SparkSession 26 | 27 | # This will help catch some PySpark errors 28 | from py4j.protocol import Py4JJavaError 29 | 30 | # Create a SparkSession under the name "reddit". 
Viewable via the Spark UI 31 | spark = SparkSession.builder.appName("reddit").getOrCreate() 32 | 33 | # Create a two column schema consisting of a string and a long integer 34 | fields = [StructField("subreddit", StringType(), True), 35 | StructField("count", LongType(), True)] 36 | schema = StructType(fields) 37 | 38 | # Create an empty DataFrame. We will continuously union our output with this 39 | subreddit_counts = spark.createDataFrame([], schema) 40 | 41 | # Establish a set of years and months to iterate over 42 | years = ['2017', '2018', '2019'] 43 | months = ['01', '02', '03', '04', '05', '06', 44 | '07', '08', '09', '10', '11', '12'] 45 | 46 | # Keep track of all tables accessed via the job 47 | tables_read = [] 48 | for year in years: 49 | for month in months: 50 | 51 | # In the form of ..
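# The table id built below follows BigQuery's "project.dataset.table"
# convention, e.g. "fh-bigquery.reddit_posts.2019_08".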
52 | table = f"fh-bigquery.reddit_posts.{year}_{month}" 53 | 54 | # If the table doesn't exist we will simply continue and not 55 | # log it into our "tables_read" list 56 | try: 57 | table_df = (spark.read.format('bigquery').option('table', table) 58 | .load()) 59 | tables_read.append(table) 60 | except Py4JJavaError as e: 61 | if f"Table {table} not found" in str(e): 62 | continue 63 | else: 64 | raise 65 | 66 | # We perform a group-by on subreddit, aggregating by the count and then 67 | # unioning the output to our base dataframe 68 | subreddit_counts = ( 69 | table_df 70 | .groupBy("subreddit") 71 | .count() 72 | .union(subreddit_counts) 73 | ) 74 | 75 | print("The following list of tables will be accounted for in our analysis:") 76 | for table in tables_read: 77 | print(table) 78 | 79 | # From our base table, we perform a group-by, summing over the counts. 80 | # We then rename the column and sort in descending order both for readability. 81 | # show() will collect the table into memory output the table to std out. 82 | ( 83 | subreddit_counts 84 | .groupBy("subreddit") 85 | .sum("count") 86 | .withColumnRenamed("sum(count)", "count") 87 | .sort("count", ascending=False) 88 | .show() 89 | ) 90 | -------------------------------------------------------------------------------- /codelabs/spark-hudi/pyspark_hudi_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Pyspark Hudi test.""" 3 | 4 | import sys 5 | from pyspark.sql import SparkSession 6 | 7 | 8 | def create_hudi_table(spark, table_name, table_uri): 9 | """Creates Hudi table.""" 10 | create_table_sql = f""" 11 | CREATE TABLE IF NOT EXISTS {table_name} ( 12 | uuid string, 13 | begin_lat double, 14 | begin_lon double, 15 | end_lat double, 16 | end_lon double, 17 | driver string, 18 | rider string, 19 | fare double, 20 | partitionpath string, 21 | ts long 22 | ) USING hudi 23 | LOCATION '{table_uri}' 24 | TBLPROPERTIES ( 25 | type = 'cow', 26 | primaryKey = 'uuid', 27 | preCombineField = 'ts' 28 | ) 29 | PARTITIONED BY (partitionpath) 30 | """ 31 | spark.sql(create_table_sql) 32 | 33 | 34 | def generate_test_dataframe(spark, n_rows): 35 | """Generates test dataframe with Hudi's built-in data generator.""" 36 | sc = spark.sparkContext 37 | utils = sc._jvm.org.apache.hudi.QuickstartUtils 38 | data_generator = utils.DataGenerator() 39 | inserts = utils.convertToStringList(data_generator.generateInserts(n_rows)) 40 | return spark.read.json(sc.parallelize(inserts, 2)) 41 | 42 | 43 | def write_hudi_table(table_name, table_uri, df): 44 | """Writes Hudi table.""" 45 | hudi_options = { 46 | 'hoodie.table.name': table_name, 47 | 'hoodie.datasource.write.recordkey.field': 'uuid', 48 | 'hoodie.datasource.write.partitionpath.field': 'partitionpath', 49 | 'hoodie.datasource.write.table.name': table_name, 50 | 'hoodie.datasource.write.operation': 'upsert', 51 | 'hoodie.datasource.write.precombine.field': 'ts', 52 | 'hoodie.upsert.shuffle.parallelism': 2, 53 | 'hoodie.insert.shuffle.parallelism': 2, 54 | } 55 | df.write.format('hudi').options(**hudi_options).mode('append').save(table_uri) 56 | 57 | 58 | def query_commit_history(spark, table_name, table_uri): 59 | tmp_table = f'{table_name}_commit_history' 60 | spark.read.format('hudi').load(table_uri).createOrReplaceTempView(tmp_table) 61 | query = f""" 62 | SELECT DISTINCT(_hoodie_commit_time) 63 | FROM {tmp_table} 64 | ORDER BY _hoodie_commit_time 65 | DESC 66 | """ 67 | return spark.sql(query) 68 | 69 | 70 | def 
read_hudi_table(spark, table_name, table_uri, commit_ts=''): 71 | """Reads Hudi table at the given commit timestamp.""" 72 | if commit_ts: 73 | options = {'as.of.instant': commit_ts} 74 | else: 75 | options = {} 76 | tmp_table = f'{table_name}_snapshot' 77 | spark.read.format('hudi').options(**options).load( 78 | table_uri 79 | ).createOrReplaceTempView(tmp_table) 80 | query = f""" 81 | SELECT _hoodie_commit_time, begin_lat, begin_lon, 82 | driver, end_lat, end_lon, fare, partitionpath, 83 | rider, ts, uuid 84 | FROM {tmp_table} 85 | """ 86 | return spark.sql(query) 87 | 88 | 89 | def main(): 90 | """Test create write and read Hudi table.""" 91 | if len(sys.argv) != 3: 92 | raise Exception('Expected arguments: ') 93 | 94 | table_name = sys.argv[1] 95 | table_uri = sys.argv[2] 96 | 97 | app_name = f'pyspark-hudi-test_{table_name}' 98 | print(f'Creating Spark session {app_name} ...') 99 | spark = SparkSession.builder.appName(app_name).getOrCreate() 100 | spark.sparkContext.setLogLevel('WARN') 101 | 102 | print(f'Creating Hudi table {table_name} at {table_uri} ...') 103 | create_hudi_table(spark, table_name, table_uri) 104 | 105 | print('Generating test data batch 1...') 106 | n_rows1 = 10 107 | input_df1 = generate_test_dataframe(spark, n_rows1) 108 | input_df1.show(truncate=False) 109 | 110 | print('Writing Hudi table, batch 1 ...') 111 | write_hudi_table(table_name, table_uri, input_df1) 112 | 113 | print('Generating test data batch 2...') 114 | n_rows2 = 10 115 | input_df2 = generate_test_dataframe(spark, n_rows2) 116 | input_df2.show(truncate=False) 117 | 118 | print('Writing Hudi table, batch 2 ...') 119 | write_hudi_table(table_name, table_uri, input_df2) 120 | 121 | print('Querying commit history ...') 122 | commits_df = query_commit_history(spark, table_name, table_uri) 123 | commits_df.show(truncate=False) 124 | previous_commit_ts = commits_df.collect()[1]._hoodie_commit_time 125 | 126 | print('Reading the Hudi table snapshot at the latest commit ...') 127 | output_df1 = read_hudi_table(spark, table_name, table_uri) 128 | output_df1.show(truncate=False) 129 | 130 | print(f'Reading the Hudi table snapshot at {previous_commit_ts} ...') 131 | output_df2 = read_hudi_table(spark, table_name, table_uri, previous_commit_ts) 132 | output_df2.show(truncate=False) 133 | 134 | print('Stopping Spark session ...') 135 | spark.stop() 136 | 137 | print('All done') 138 | 139 | 140 | main() 141 | -------------------------------------------------------------------------------- /gcloud/.gitignore: -------------------------------------------------------------------------------- 1 | init/*/ -------------------------------------------------------------------------------- /gcloud/autoscaling-policy.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2021 Google LLC and contributors 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS-IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
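#
# The policy defined below is a standard Dataproc autoscaling policy file.
# A minimal usage sketch (the policy id, cluster name, and region here are
# illustrative, not taken from this repository):
#
#   gcloud dataproc autoscaling-policies import my-autoscaling-policy \
#       --source=autoscaling-policy.yaml --region=us-west4
#
#   gcloud dataproc clusters create my-cluster \
#       --autoscaling-policy=my-autoscaling-policy --region=us-west4
#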
15 | # 16 | 17 | workerConfig: 18 | minInstances: 2 19 | maxInstances: 5 20 | weight: 1 21 | secondaryWorkerConfig: 22 | maxInstances: 5 23 | weight: 1 24 | basicAlgorithm: 25 | cooldownPeriod: 120s 26 | yarnConfig: 27 | scaleUpFactor: 0.05 28 | scaleDownFactor: 0.8 29 | gracefulDecommissionTimeout: 300s 30 | -------------------------------------------------------------------------------- /gcloud/bin/create-dpgce: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Exit on failure 19 | set -e 20 | 21 | source lib/env.sh 22 | 23 | create_project 24 | 25 | configure_gcloud 26 | 27 | set_cluster_name 28 | 29 | enable_services 30 | 31 | create_bucket 32 | 33 | create_vpc_network 34 | 35 | #create_ip_allocation 36 | 37 | #create_vpc_peering 38 | 39 | # Create a cloud router 40 | 41 | create_router 42 | 43 | add_nat_policy 44 | 45 | # create subnet 46 | 47 | create_subnet 48 | 49 | create_firewall_rules 50 | 51 | create_service_account 52 | 53 | # Create logging firewall rules 54 | 55 | #create_logging_firewall_rules 56 | 57 | #create_bigtable_instance 58 | 59 | #create_mysql_instance 60 | #create_legacy_mssql_instance 61 | 62 | # Create PHS dataproc cluster 63 | 64 | #create_phs_cluster 65 | 66 | # Create normal dataproc cluster 67 | 68 | create_autoscaling_policy 69 | 70 | create_dpgce_cluster 71 | 72 | # Perform some connectivity tests 73 | 74 | #perform_connectivity_tests 75 | 76 | -------------------------------------------------------------------------------- /gcloud/bin/create-dpgke: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
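#
# Usage sketch for these wrapper scripts (create-dpgce above, create-dpgke
# below), assuming they are run from the gcloud/ directory with an env.json
# filled in from env.json.sample; the functions they call live under lib/:
#
#   cp env.json.sample env.json   # then edit the values for your project
#   bin/create-dpgce              # provision network, service account, cluster
#   bin/destroy-dpgce             # tear everything down again when finished
#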
16 | # 17 | 18 | 19 | # Exit on failure 20 | set -e 21 | 22 | 23 | source lib/env.sh 24 | 25 | set_cluster_name 26 | 27 | create_bucket 28 | 29 | enable_services 30 | 31 | create_vpc_network 32 | 33 | # create_ip_allocation 34 | # create_vpc_peering 35 | 36 | # Create a cloud router 37 | 38 | create_router 39 | 40 | add_nat_policy 41 | 42 | create_subnet 43 | 44 | create_firewall_rules 45 | 46 | create_service_account 47 | 48 | grant_gke_roles 49 | 50 | create_gke_cluster 51 | 52 | # Create gke dataproc cluster 53 | 54 | create_dpgke_cluster 55 | 56 | # Perform some connectivity tests 57 | 58 | # perform_connectivity_tests 59 | 60 | -------------------------------------------------------------------------------- /gcloud/bin/destroy-dpgce: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | #set -e 19 | 20 | source lib/env.sh 21 | 22 | delete_dpgce_cluster 23 | 24 | delete_service_account 25 | 26 | delete_autoscaling_policy 27 | 28 | #delete_phs_cluster() 29 | 30 | #delete_mysql_instance 31 | #delete_legacy_mssql_instance 32 | 33 | delete_router 34 | 35 | delete_firewall_rules 36 | 37 | #delete_logging_firewall_rules 38 | 39 | #delete_ip_allocation 40 | 41 | delete_subnet 42 | 43 | delete_vpc_network 44 | 45 | #delete_vpc_peering 46 | 47 | delete_bucket 48 | 49 | set +x 50 | -------------------------------------------------------------------------------- /gcloud/bin/destroy-dpgke: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | source lib/env.sh 19 | 20 | # perform_connectivity_tests 21 | 22 | # Perform some connectivity tests 23 | 24 | delete_gke_cluster 25 | 26 | # Delete gke dataproc cluster 27 | 28 | delete_service_account 29 | 30 | delete_firewall_rules 31 | 32 | delete_subnet 33 | 34 | # Delete a cloud router 35 | delete_router 36 | 37 | # create_vpc_peering 38 | # create_ip_allocation 39 | delete_vpc_network 40 | 41 | #enable_services 42 | delete_bucket 43 | 44 | -------------------------------------------------------------------------------- /gcloud/bin/recreate-dpgce: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | source lib/env.sh 19 | 20 | # Copy latest initialization action scripts 21 | #echo -n "copying actions to gcs bucket..." 22 | #gsutil -m cp \ 23 | # -L action-update.log \ 24 | # -r init/* gs://${BUCKET}/dataproc-initialization-actions 25 | #if [[ $? == 0 ]]; then 26 | # echo "done" 27 | #else 28 | # echo "fail" 29 | # exit 1 30 | #fi 31 | 32 | # re-create dpgce dataproc cluster 33 | delete_dpgce_cluster 34 | create_dpgce_cluster 35 | 36 | echo "========================================" 37 | echo "General Purpose DPGCE Cluster re-created" 38 | echo "========================================" 39 | 40 | -------------------------------------------------------------------------------- /gcloud/bin/recreate-dpgke: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | source env.sh 19 | 20 | # Copy latest initialization action scripts 21 | gsutil -m cp -r init/* gs://${BUCKET}/dataproc-initialization-actions 22 | 23 | # re-create normal dataproc cluster 24 | delete_gke_cluster 25 | create_gke_cluster 26 | 27 | echo "========================" 28 | echo "DPGKE Cluster re-created" 29 | echo "========================" 30 | 31 | -------------------------------------------------------------------------------- /gcloud/bin/scp-master: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | source lib/env.sh 18 | 19 | if [[ -z "$1" ]]; then 20 | echo "$0 [master number] " 21 | exit 1 22 | fi 23 | 24 | # If the first argument is a number, assume it indicates which master to send file to 25 | MASTER_HOSTNAME="${CLUSTER_NAME}-m" 26 | if [[ $(perl -e "print 1 if q{$1} =~ /^\d+$/") == "1" ]]; then 27 | master_num="$1" 28 | echo "master_num: $master_num" 29 | MASTER_HOSTNAME="${MASTER_HOSTNAME}-${master_num}" 30 | shift 1 31 | else 32 | MASTER_HOSTNAME="${MASTER_HOSTNAME}" 33 | fi 34 | 35 | date 36 | 37 | gcloud compute scp --recurse "$*" \ 38 | --zone ${ZONE} \ 39 | ${MASTER_HOSTNAME}:/tmp \ 40 | --tunnel-through-iap \ 41 | --project ${PROJECT_ID} 42 | -------------------------------------------------------------------------------- /gcloud/bin/scp-worker: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | source lib/env.sh 18 | 19 | set -x 20 | 21 | if [[ -z "$1" ]]; then 22 | echo "$0 [worker number] " 23 | exit 1 24 | fi 25 | 26 | # If the first argument is a number, assume it indicates which worker to send file to 27 | WORKER_HOSTNAME="${CLUSTER_NAME}-w" 28 | if [[ $(perl -e "print 1 if q{$1} =~ /^\d+$/") == "1" ]]; then 29 | worker_num="$1" 30 | echo "worker_num: $worker_num" 31 | WORKER_HOSTNAME="${WORKER_HOSTNAME}-${worker_num}" 32 | shift 1 33 | else 34 | WORKER_HOSTNAME="${WORKER_HOSTNAME}-0" 35 | fi 36 | 37 | gcloud compute scp --recurse "$*" \ 38 | --zone ${ZONE} \ 39 | ${WORKER_HOSTNAME}:/tmp \ 40 | --tunnel-through-iap \ 41 | --project ${PROJECT_ID} 42 | 43 | set +x 44 | -------------------------------------------------------------------------------- /gcloud/bin/ssh-master: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
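#
# Usage sketch for the scp-master/scp-worker helpers above (both copy files to
# the target instance's /tmp over an IAP tunnel); an optional leading number
# selects which master or worker to address, e.g. (file names illustrative):
#
#   bin/scp-master ./my-archive.tgz      # copies to ${CLUSTER_NAME}-m:/tmp
#   bin/scp-worker 1 ./my-archive.tgz    # copies to ${CLUSTER_NAME}-w-1:/tmp
#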
16 | # 17 | source lib/env.sh 18 | 19 | master_num="$1" 20 | 21 | MASTER_HOSTNAME="${CLUSTER_NAME}-m" 22 | if [[ -n "$master_num" ]]; then 23 | echo "master_num: $master_num" 24 | MASTER_HOSTNAME="${MASTER_HOSTNAME}-${master_num}" 25 | fi 26 | 27 | gcloud compute ssh --zone ${ZONE} ${MASTER_HOSTNAME} --tunnel-through-iap --project ${PROJECT_ID} -- -o ConnectTimeout=360 28 | -------------------------------------------------------------------------------- /gcloud/bin/ssh-worker: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | source lib/env.sh 18 | 19 | worker_num="$1" 20 | 21 | WORKER_HOSTNAME="${CLUSTER_NAME}-w" 22 | if [[ -n "$worker_num" ]]; then 23 | echo "worker_num: $worker_num" 24 | WORKER_HOSTNAME="${WORKER_HOSTNAME}-${worker_num}" 25 | else 26 | WORKER_HOSTNAME="${WORKER_HOSTNAME}-0" 27 | fi 28 | 29 | gcloud compute ssh --zone ${ZONE} ${WORKER_HOSTNAME} --tunnel-through-iap --project ${PROJECT_ID} -- -o ConnectTimeout=360 30 | -------------------------------------------------------------------------------- /gcloud/env.json.sample: -------------------------------------------------------------------------------- 1 | { 2 | "PROJECT_ID":"ldap-example-yyyy-nn", 3 | "ORG_NUMBER":"100000000001", 4 | "USER": "project-owner", 5 | "DOMAIN": "your-domain-goes-here.com", 6 | "PRIV_USER": "privileged", 7 | "PRIV_DOMAIN": "privileged-domain-here.com", 8 | "BILLING_ACCOUNT": "100000-000000-000001", 9 | "FOLDER_NUMBER": "100000000001", 10 | "REGION": "us-west4", 11 | "RANGE": "10.00.01.0/24", 12 | "IDLE_TIMEOUT": "30m", 13 | "ASN_NUMBER": "65531", 14 | "IMAGE_VERSION": "2.2", 15 | "BUCKET": "myproject-dataproc-repro-bucket", 16 | "TEMP_BUCKET": "myproject-dataproc-repro-temp-bucket", 17 | "CLUSTER_NAME": "cluster-name-here", 18 | "BIGTABLE_INSTANCE": "my-bigtable", 19 | "BIGTABLE_DISPLAY_NAME": "my-bigtable-cluster", 20 | "BIGTABLE_CLUSTER_CONFIG": "null" 21 | } 22 | -------------------------------------------------------------------------------- /gcloud/init/startup-script.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
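#
# The env.json.sample file above is a template for gcloud/env.json, which the
# library scripts are expected to read. A rough sketch of pulling a single
# value out of it from a shell, assuming jq is installed (illustrative only):
#
#   PROJECT_ID="$(jq -r '.PROJECT_ID' env.json)"
#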
16 | # 17 | 18 | use v5.38; 19 | use POSIX qw(strftime); 20 | 21 | # 22 | # Dataproc Pre-Startup 23 | # AKA GCE Linux VM Startup script 24 | # Example script follows 25 | # 26 | # https://cloud.google.com/compute/docs/instances/startup-scripts/linux 27 | # 28 | # startup-script-perl 29 | # 30 | 31 | use File::Temp qw/ tempdir /; 32 | my $template = q{/tmp/prestartup-script-XXXXXXXX}; 33 | my $TMPDIR = tempdir ( $template, CLEANUP => 0 ); 34 | my $LOGFILE = qq{${TMPDIR}/prestartup-script.log}; 35 | 36 | open(my($log_fh), q{>}, qq{${LOGFILE}}) or warn qq{cannot open logfile for write}; 37 | sub log_msg { 38 | print $log_fh strftime(q{%F %T %z> }, localtime), qq{@_}, $/; 39 | } 40 | 41 | log_msg qq{script start}; 42 | 43 | # script goes here 44 | 45 | log_msg qq{script end}; 46 | 47 | # 48 | # Startup script ends 49 | # 50 | -------------------------------------------------------------------------------- /gcloud/init/startup-script.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # 19 | # Example Dataproc Initialization Action 20 | # AKA GCE Linux VM Startup script 21 | # Example script follows 22 | # 23 | # https://cloud.google.com/compute/docs/instances/startup-scripts/linux 24 | # 25 | # startup-script-bash 26 | # 27 | 28 | set -x 29 | 30 | readonly master_node=$(/usr/share/google/get_metadata_value attributes/dataproc-master) 31 | readonly ROLE="$(/usr/share/google/get_metadata_value attributes/dataproc-role)" 32 | if [[ "${ROLE}" == 'Master' ]]; then 33 | if [[ "${HOSTNAME}" == "$master_node" ]]; then 34 | echo "this instance will go first" 35 | else 36 | sleep 15s 37 | fi 38 | fi 39 | 40 | echo "PATH: ${PATH}" > /tmp/startup-script-path.log 41 | which gcloud > /tmp/gcloud-path.log 42 | 43 | set +x 44 | -------------------------------------------------------------------------------- /gcloud/lib/database-functions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2021 Google LLC 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
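#
# The startup-script.sh above is written as a Dataproc initialization action.
# One way to attach it when creating a cluster, assuming it has already been
# copied to the bucket as bin/recreate-dpgke does (values illustrative):
#
#   gcloud dataproc clusters create "${CLUSTER_NAME}" \
#       --region "${REGION}" \
#       --initialization-actions \
#       "gs://${BUCKET}/dataproc-initialization-actions/startup-script.sh"
#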
16 | # 17 | 18 | function create_mysql_instance() { 19 | # Create SQL Instance 20 | set -x 21 | gcloud sql instances create ${MYSQL_INSTANCE} \ 22 | --no-assign-ip \ 23 | --project=${PROJECT_ID} \ 24 | --network=${NETWORK_URI_PARTIAL} \ 25 | --database-version="MYSQL_5_7" \ 26 | --activation-policy=ALWAYS \ 27 | --zone ${ZONE} 28 | set +x 29 | 30 | echo "======================" 31 | echo "MySQL Instance Created" 32 | echo "======================" 33 | } 34 | 35 | function delete_mysql_instance() { 36 | set -x 37 | gcloud sql instances delete --quiet ${MYSQL_INSTANCE} 38 | set +x 39 | 40 | echo "MySQL instance deleted" 41 | } 42 | 43 | function create_legacy_mssql_instance() { 44 | set -x 45 | # Create legacy SQL Server Instance 46 | # Microsoft SQL Server 2014 Enterprise 47 | # Windows Server 2012 48 | # 64-bit 49 | local METADATA="kdc-root-passwd=${INIT_ACTIONS_ROOT}/${KDC_ROOT_PASSWD_KEY}.encrypted" 50 | METADATA="${METADATA},kms-keyring=${KMS_KEYRING}" 51 | METADATA="${METADATA},kdc-root-passwd-key=${KDC_ROOT_PASSWD_KEY}" 52 | METADATA="${METADATA},startup-script-url=${INIT_ACTIONS_ROOT}/kdc-server.sh" 53 | METADATA="service-account-user=${GSA}" 54 | gcloud compute instances create ${MSSQL_INSTANCE} \ 55 | --zone ${ZONE} \ 56 | --subnet ${SUBNET} \ 57 | --service-account=${GSA} \ 58 | --boot-disk-type pd-ssd \ 59 | --image-family=${MSSQL_IMAGE_FAMILY} \ 60 | --image-project=${MSSQL_IMAGE_PROJECT} \ 61 | --machine-type=${MSSQL_MACHINE_TYPE} \ 62 | --scopes='cloud-platform' \ 63 | --metadata ${METADATA} 64 | set +x 65 | } 66 | 67 | function delete_legacy_mssql_instance() { 68 | set -x 69 | gcloud compute instances delete ${MSSQL_INSTANCE} \ 70 | --quiet 71 | set +x 72 | echo "mssql legacy instance deleted" 73 | } 74 | 75 | function create_mssql_instance() { 76 | # Create CloudSQL Instance 77 | set -x 78 | # This only works for: 79 | 80 | # SQLSERVER_2017_ENTERPRISE, SQLSERVER_2017_EXPRESS, SQLSERVER_2017_STANDARD, 81 | # SQLSERVER_2017_WEB, SQLSERVER_2019_ENTERPRISE, SQLSERVER_2019_EXPRESS, 82 | # SQLSERVER_2019_STANDARD, SQLSERVER_2019_WEB 83 | 84 | gcloud sql instances create ${MSSQL_INSTANCE} \ 85 | --no-assign-ip \ 86 | --project=${PROJECT_ID} \ 87 | --network=${NETWORK_URI_PARTIAL} \ 88 | --database-version=${MSSQL_DATABASE_VERSION} \ 89 | --activation-policy=ALWAYS \ 90 | --zone ${ZONE} 91 | set +x 92 | 93 | echo "======================" 94 | echo "mssql Instance Created" 95 | echo "======================" 96 | } 97 | 98 | function delete_mssql_instance() { 99 | set -x 100 | gcloud sql instances delete --quiet ${MSSQL_INSTANCE} 101 | set +x 102 | 103 | echo "mssql instance deleted" 104 | } 105 | 106 | function create_pgsql_instance() { 107 | # Create CloudSQL Instance 108 | set -x 109 | 110 | gcloud sql instances create ${PGSQL_INSTANCE} \ 111 | --no-assign-ip \ 112 | --project=${PROJECT_ID} \ 113 | --network=${NETWORK_URI_PARTIAL} \ 114 | --database-version=${PGSQL_DATABASE_VERSION} \ 115 | --activation-policy=ALWAYS \ 116 | --root-password="${PGSQL_ROOT_PASSWORD}" \ 117 | --zone ${ZONE} 118 | set +x 119 | 120 | echo "======================" 121 | echo "pgsql Instance Created" 122 | echo "======================" 123 | } 124 | 125 | function delete_pgsql_instance() { 126 | set -x 127 | gcloud sql instances delete --quiet ${PGSQL_INSTANCE} 128 | set +x 129 | 130 | echo "pgsql instance deleted" 131 | } 132 | 133 | -------------------------------------------------------------------------------- /gcloud/lib/secure-boot/create-key-pair.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # This script creates a key pair and publishes to cloud secrets or 16 | # fetches an already published key pair from cloud secrets 17 | 18 | set -e 19 | 20 | # https://github.com/glevand/secure-boot-utils 21 | 22 | # https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#adding-shielded-image 23 | 24 | # https://cloud.google.com/compute/shielded-vm/docs/creating-shielded-images#generating-security-keys-certificates 25 | 26 | # https://wiki.archlinux.org/title/Unified_Extensible_Firmware_Interface/Secure_Boot#Creating_keys 27 | 28 | ITERATION=042 29 | 30 | CURRENT_PROJECT_ID="$(gcloud config get project)" 31 | if [[ -z "${CURRENT_PROJECT_ID}" ]]; then 32 | echo 'project is not set. please set with `gcloud config set project ${PROJECT_ID}`' >&2 33 | exit -1 34 | fi 35 | PROJECT_ID="${CURRENT_PROJECT_ID}" 36 | 37 | function create_key () { 38 | local EFI_VAR_NAME="$1" 39 | local CN_VAL="$2" 40 | local PRIVATE_KEY="tls/${EFI_VAR_NAME}.rsa" 41 | local CACERT="tls/${EFI_VAR_NAME}.pem" 42 | local CACERT_DER="tls/${EFI_VAR_NAME}.der" 43 | CA_KEY_SECRET_NAME="efi-${EFI_VAR_NAME}-priv-key-${ITERATION}" 44 | CA_CERT_SECRET_NAME="efi-${EFI_VAR_NAME}-pub-key-${ITERATION}" 45 | # If the secrets exist in secret manager, populate the tls/ directory 46 | if [[ ! -f "${PRIVATE_KEY}" ]] && gcloud secrets describe "${CA_CERT_SECRET_NAME}" > /dev/null ; then 47 | mkdir -p tls 48 | 49 | gcloud secrets versions access "1" \ 50 | --project="${PROJECT_ID}" \ 51 | --secret="${CA_KEY_SECRET_NAME}" \ 52 | | dd of="${PRIVATE_KEY}" status=none 53 | 54 | gcloud secrets versions access "1" \ 55 | --project="${PROJECT_ID}" \ 56 | --secret="${CA_CERT_SECRET_NAME}" \ 57 | | base64 --decode \ 58 | | dd of="${CACERT_DER}" status=none 59 | 60 | # Create a PEM-format version of the cert 61 | openssl x509 \ 62 | -inform DER \ 63 | -in "${CACERT_DER}" \ 64 | -outform PEM \ 65 | -out "${CACERT}" 66 | 67 | MS_UEFI_CA="tls/MicCorUEFCA2011_2011-06-27.crt" 68 | curl -s -L -o "${MS_UEFI_CA}" 'https://go.microsoft.com/fwlink/p/?linkid=321194' 69 | 70 | echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt 71 | echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt 72 | modulus_md5sum="$(openssl rsa -noout -modulus -in ${PRIVATE_KEY} | openssl md5 | awk '{print $2}' | tee tls/modulus-md5sum.txt)" 73 | return 74 | fi 75 | 76 | if [[ -f "${PRIVATE_KEY}" ]]; then 77 | # echo "key already exists. Skipping generation." 
>&2 78 | modulus_md5sum="$(cat tls/modulus-md5sum.txt)" 79 | return 80 | fi 81 | mkdir -p tls 82 | 83 | echo "generating '${CN_VAL}' '${CACERT}', '${CACERT_DER}' and '${PRIVATE_KEY}'" >&2 84 | # Generate new x.509 key and cert 85 | openssl req \ 86 | -newkey rsa:3072 \ 87 | -nodes \ 88 | -keyout "${PRIVATE_KEY}" \ 89 | -new \ 90 | -x509 \ 91 | -sha256 \ 92 | -days 3650 \ 93 | -subj "/CN=${CN_VAL}/" \ 94 | -out "${CACERT}" 95 | 96 | # Create a DER-format version of the cert 97 | openssl x509 \ 98 | -outform DER \ 99 | -in "${CACERT}" \ 100 | -outform DER \ 101 | -in "${CACERT}" \ 102 | -out "${CACERT_DER}" 103 | 104 | # Create a new secret containing private key 105 | gcloud secrets create "${CA_KEY_SECRET_NAME}" \ 106 | --project="${PROJECT_ID}" \ 107 | --replication-policy="automatic" \ 108 | --data-file="${PRIVATE_KEY}" 109 | 110 | echo "Private key secret name: '${CA_KEY_SECRET_NAME}'" >&2 111 | echo "${CA_KEY_SECRET_NAME}" > tls/private-key-secret-name.txt 112 | 113 | # Create a new secret containing public key 114 | cat "${CACERT_DER}" | base64 > "${CACERT_DER}.base64" 115 | gcloud secrets create "${CA_CERT_SECRET_NAME}" \ 116 | --project="${PROJECT_ID}" \ 117 | --replication-policy="automatic" \ 118 | --data-file="${CACERT_DER}.base64" 119 | 120 | modulus_md5sum="$(openssl x509 -noout -modulus -in ${CACERT} | openssl md5 | awk '{print $2}')" 121 | echo "modulus-md5sum: ${modulus_md5sum}" >&2 122 | echo "${modulus_md5sum}" > tls/modulus-md5sum.txt 123 | echo "Public key secret name: '${CA_CERT_SECRET_NAME}'" >&2 124 | echo "${CA_CERT_SECRET_NAME}" > tls/public-key-secret-name.txt 125 | 126 | } 127 | 128 | EFI_VAR_NAME=db 129 | 130 | create_key "${EFI_VAR_NAME}" "Cloud Dataproc Custom Image CA ${ITERATION}" 131 | 132 | echo "modulus_md5sum=${modulus_md5sum}" 133 | echo "private_secret_name=${CA_KEY_SECRET_NAME}" 134 | echo "public_secret_name=${CA_CERT_SECRET_NAME}" 135 | echo "secret_project=${PROJECT_ID}" 136 | echo "secret_version=1" 137 | -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Spark and Jupyter Notebooks on Dataproc 2 | 3 | [Google Cloud Dataproc](https://cloud.google.com/dataproc) allows you to use [JupyterLab](https://jupyterlab.readthedocs.io/en/stable/) or [Classic Jupyter Notebooks](https://jupyter-notebook.readthedocs.io/en/stable/) on your cluster by enabling the [Jupyter component](https://cloud.google.com/dataproc/docs/concepts/components/jupyter) and [Component Gateway](https://cloud.google.com/dataproc/docs/concepts/accessing/dataproc-gateways). 4 | 5 | This folder contains example notebooks for using Spark with the [BigQuery Storage connector](https://cloud.google.com/dataproc/docs/concepts/connectors/bigquery) and [Google Cloud Storage connector](https://cloud.google.com/dataproc/docs/concepts/connectors/cloud-storage), and other common use cases. 6 | 7 | ### Create Dataproc Cluster with Jupyter 8 | 9 | These notebooks are designed to be run on Google Cloud Dataproc. 10 | 11 | Follow the links below for instructions on how to create a Dataproc Cluster with the Juypter component installed. 
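For orientation, a minimal cluster with the Jupyter component and Component Gateway enabled can be created with a command along these lines (cluster name, region, and image version are illustrative; see the links below for full instructions):

```
gcloud dataproc clusters create my-jupyter-cluster \
    --region=us-central1 \
    --image-version=2.2-debian12 \
    --optional-components=JUPYTER \
    --enable-component-gateway
```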
12 | 13 | * [Tutorial - Install and run a Jupyter notebook on a Dataproc cluster](https://cloud.google.com/dataproc/docs/tutorials/jupyter-notebook) 14 | * [Blog post - Apache Spark and Jupyter Notebooks made easy with Dataproc component gateway](https://medium.com/google-cloud/apache-spark-and-jupyter-notebooks-made-easy-with-dataproc-component-gateway-fa91d48d6a5a) 15 | 16 | ## Notebooks 17 | 18 | ### Python 3 Kernel (PySpark) 19 | 20 | * 1.1. BigQuery Storage & Spark DataFrames 21 | * 1.2. BigQuery Storage & Spark SQL 22 | * 1.3. BigQuery Storage & Spark MLlib 23 | * 2.1. Google Cloud Storage (CSV) & Spark DataFrames 24 | * 3.1. Spark DataFrames & Pandas Plotting 25 | 26 | ### Neo4j examples (Python and PySpark) 27 | 28 | * / neo4j / 1.1. Similarity Example, Graph Data Science Client - Python 29 | * / neo4j / 1.2. Similarity Example, Python Client 30 | * / neo4j / 1.3. Similarity Example, BigQuery Storage & Spark Connector & GDS Client - Python -------------------------------------------------------------------------------- /notebooks/ai-ml/README.md: -------------------------------------------------------------------------------- 1 | There are three ways to run notebooks inside Dataproc: 2 | 1. [Vertex workbench](https://cloud.google.com/vertex-ai/docs/workbench/instances/create-dataproc-enabled) using [serverless sessions](https://cloud.google.com/dataproc-serverless/docs/quickstarts/jupyterlab-sessions) (Recommended) 3 | 2. Colab enterprises using serverless session (Preview) 4 | 3. [Jupyter optional component](https://cloud.google.com/dataproc/docs/concepts/components/jupyter) 5 | 6 | ## Contributing 7 | Please follow this [template](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/notebook_template.ipynb) to contribute more scenarios 8 | -------------------------------------------------------------------------------- /notebooks/python/neo4j/images/neo4j_dataproc_api_selector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudDataproc/cloud-dataproc/cb0ced6a30bc88a8ddc58d2870c217273be4958c/notebooks/python/neo4j/images/neo4j_dataproc_api_selector.png -------------------------------------------------------------------------------- /notebooks/python/neo4j/images/neo4j_dataproc_mkture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudDataproc/cloud-dataproc/cb0ced6a30bc88a8ddc58d2870c217273be4958c/notebooks/python/neo4j/images/neo4j_dataproc_mkture.png -------------------------------------------------------------------------------- /notebooks/python/neo4j/images/neo4j_dataproc_process.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudDataproc/cloud-dataproc/cb0ced6a30bc88a8ddc58d2870c217273be4958c/notebooks/python/neo4j/images/neo4j_dataproc_process.png -------------------------------------------------------------------------------- /notebooks/python/neo4j/images/neo4j_dataproc_stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudDataproc/cloud-dataproc/cb0ced6a30bc88a8ddc58d2870c217273be4958c/notebooks/python/neo4j/images/neo4j_dataproc_stack.png -------------------------------------------------------------------------------- /spark-hbase/README.md: -------------------------------------------------------------------------------- 1 | # Google Cloud 
Dataproc 2 | 3 | This folder contains code and documentation to connect to HBase from Spark job in Dataproc following steps below: 4 | 5 | ## Create a Dataproc cluster with HBase, Zookeeper installed 6 | 7 | Create a Dataproc cluster using the –optional-components flag to install the HBase, Zookeeper optional component on the cluster and the enable-component-gateway flag to enable the Component Gateway to allow you to access the HBase Console from the Cloud Console. 8 | 9 | * Set environment variables: 10 | * PROJECT: Your GCP project ID. 11 | * REGION: Region where the cluster used in this tutorial will be created, for example, "us-west1". 12 | * WORKERS: 3 - 5 workers are recommended for this tutorial. 13 | * IMAGE_VERSION: Your Dataproc image version. 14 | * CLUSTER: Your cluster name. 15 | 16 | * For example: 17 | 18 | ``` 19 | PROJECT=my-projectId 20 | WORKERS=3 21 | REGION=us-west1 22 | IMAGE_VERSION=2.0.30-debian10 23 | CLUSTER=dataproc-hbase-cluster 24 | ``` 25 | 26 | * Run the Google Cloud CLI on your local machine to create the cluster: 27 | 28 | ``` 29 | gcloud dataproc clusters create ${CLUSTER} \ 30 | --project=${PROJECT} \ 31 | --image-version=${IMAGE_VERSION} \ 32 | --region=${REGION} \ 33 | --num-workers=${WORKERS} \ 34 | --optional-components=ZOOKEEPER,HBASE \ 35 | --enable-component-gateway 36 | ``` 37 | 38 | ## Create HBase table via HBase Shell 39 | 40 | * These steps are to create the HBase table: 41 | * SSH to the master node of Dataproc cluster 42 | * Run the command to access to HBase shell: 43 | 44 | ``` 45 | hbase shell 46 | ``` 47 | * Create HBase table with a column family: 48 | 49 | ``` 50 | create 'my_table','cf' 51 | ``` 52 | 53 | ## Spark code to write, read the data to HBase in Java/Python 54 | 55 | To build the code, run the following commands from the main directory: 56 | 57 | ``` 58 | mvn clean package 59 | ``` 60 | 61 | > Note that build requires Java 8 and Maven installed 62 | 63 | ## Submit the Spark job 64 | 65 | ### Submit the Java Spark job 66 | 67 | To submit the Java Spark job, using the command below: 68 | 69 | ``` 70 | gcloud dataproc jobs submit spark --class=hbase.SparkHBaseMain \ 71 | --jars=spark-hbase-1.0-SNAPSHOT.jar \ 72 | --project=${PROJECT} \ 73 | --region=${REGION} \ 74 | --cluster=${CLUSTER} \ 75 | --properties='spark.driver.extraClassPath=/etc/hbase/conf:/usr/lib/hbase/*,spark.executor.extraClassPath=/etc/hbase/conf:/usr/lib/hbase/*' 76 | ``` 77 | 78 | ### Submit the PySpark job 79 | 80 | To submit the PySpark job, using the command below: 81 | 82 | ``` 83 | gcloud dataproc jobs submit pyspark --project=${PROJECT} \ 84 | --region=${REGION} \ 85 | --cluster=${CLUSTER} \ 86 | --properties='spark.driver.extraClassPath=/etc/hbase/conf:/usr/lib/hbase/*,spark.executor.extraClassPath=/etc/hbase/conf:/usr/lib/hbase/*' \ 87 | pyspark-hbase.py 88 | ``` 89 | 90 | > You need to pass the Spark properties “spark.driver.extraClassPath” and “spark.executor.extraClassPath” to add HBase configuration and HBase library to classpath. Otherwise, the job will fail with exceptions. 91 | 92 | Alternatively, you can add above properties when creating the Dataproc cluster so that you do not need to pass properties to the Spark submit command. 
For example, this command to add the properties when creating the Dataproc cluster: 93 | 94 | ``` 95 | gcloud dataproc clusters create ${CLUSTER} \ 96 | --project=${PROJECT} \ 97 | --image-version=${IMAGE_VERSION} \ 98 | --region=${REGION} \ 99 | --num-workers=${WORKERS} \ 100 | --optional-components=ZOOKEEPER,HBASE \ 101 | --enable-component-gateway \ 102 | --properties='spark:spark.driver.extraClassPath=/etc/hbase/conf:/usr/lib/hbase/*,spark:spark.executor.extraClassPath=/etc/hbase/conf:/usr/lib/hbase/*’ 103 | ``` 104 | 105 | ## Verify the data in HBase shell 106 | 107 | After the Spark job is successfully, you can follow these steps to verify the data in HBase: 108 | 1. SSH to the master node of Dataproc cluster 109 | 2. Run the command to access to HBase Shell: 110 | 111 | ``` 112 | hbase shell 113 | ``` 114 | 3. Scan the table to view the data using command: 115 | 116 | ``` 117 | scan 'my_table' 118 | ``` 119 | 120 | It should return the data: 121 | 122 | ``` 123 | ROW COLUMN+CELL 124 | key1 column=cf:status, timestamp=1645043111980, value=foo 125 | key2 column=cf:status, timestamp=1645043111980, value=bar 126 | ``` -------------------------------------------------------------------------------- /spark-hbase/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | Spark Hbase Tutorial 8 | com.google.cloud.dataproc.spark.hbase 9 | spark-hbase 10 | 1.0-SNAPSHOT 11 | https://github.com/GoogleCloudDataproc/cloud-dataproc 12 | 13 | 14 | 2.12.14 15 | 2.12 16 | 3.1.2 17 | 2.2.7 18 | 1.0.0 19 | 20 | 21 | 22 | 23 | org.scala-lang 24 | scala-library 25 | ${scala.version} 26 | 27 | 28 | 29 | org.apache.spark 30 | spark-core_${scala.main.version} 31 | ${spark.version} 32 | compile 33 | 34 | 35 | 36 | org.apache.spark 37 | spark-sql_${scala.main.version} 38 | ${spark.version} 39 | compile 40 | 41 | 42 | 43 | org.apache.hbase 44 | hbase-client 45 | ${hbase.client.version} 46 | provided 47 | 48 | 49 | 50 | org.apache.hbase.connectors.spark 51 | hbase-spark 52 | ${hbase-spark.version} 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /spark-hbase/scripts/pyspark-hbase.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | # Initialize Spark Session 4 | spark = SparkSession \ 5 | .builder \ 6 | .master('yarn') \ 7 | .appName('spark-hbase-tutorial') \ 8 | .getOrCreate() 9 | 10 | data_source_format = '' 11 | 12 | # Create some test data 13 | df = spark.createDataFrame( 14 | [ 15 | ("key1", "foo"), 16 | ("key2", "bar"), 17 | ], 18 | ["key", "name"] 19 | ) 20 | 21 | # Define the schema for catalog 22 | catalog = ''.join("""{ 23 | "table":{"namespace":"default", "name":"my_table"}, 24 | "rowkey":"key", 25 | "columns":{ 26 | "key":{"cf":"rowkey", "col":"key", "type":"string"}, 27 | "name":{"cf":"cf", "col":"name", "type":"string"} 28 | } 29 | }""".split()) 30 | 31 | # Write to HBase 32 | df.write.format('org.apache.hadoop.hbase.spark').options(catalog=catalog).option("hbase.spark.use.hbasecontext", "false").mode("overwrite").save() 33 | 34 | # Read from HBase 35 | result = spark.read.format('org.apache.hadoop.hbase.spark').options(catalog=catalog).option("hbase.spark.use.hbasecontext", "false").load() 36 | result.show() -------------------------------------------------------------------------------- /spark-hbase/src/main/java/hbase/SparkHBaseMain.java: 
-------------------------------------------------------------------------------- 1 | package hbase; 2 | 3 | import org.apache.hadoop.hbase.spark.datasources.HBaseTableCatalog; 4 | import org.apache.spark.sql.Dataset; 5 | import org.apache.spark.sql.Row; 6 | import org.apache.spark.sql.SparkSession; 7 | 8 | import java.io.Serializable; 9 | import java.util.Arrays; 10 | import java.util.HashMap; 11 | import java.util.Map; 12 | 13 | public class SparkHBaseMain { 14 | public static class SampleData implements Serializable { 15 | private String key; 16 | private String name; 17 | 18 | 19 | public SampleData(String key, String name) { 20 | this.key = key; 21 | this.name = name; 22 | } 23 | 24 | public SampleData() { 25 | } 26 | 27 | public String getName() { 28 | return name; 29 | } 30 | 31 | public void setName(String name) { 32 | this.name = name; 33 | } 34 | 35 | public String getKey() { 36 | return key; 37 | } 38 | 39 | public void setKey(String key) { 40 | this.key = key; 41 | } 42 | } 43 | public static void main(String[] args) { 44 | // Init SparkSession 45 | SparkSession spark = SparkSession 46 | .builder() 47 | .master("yarn") 48 | .appName("spark-hbase-tutorial") 49 | .getOrCreate(); 50 | 51 | // Data Schema 52 | String catalog = "{"+"\"table\":{\"namespace\":\"default\", \"name\":\"my_table\"}," + 53 | "\"rowkey\":\"key\"," + 54 | "\"columns\":{" + 55 | "\"key\":{\"cf\":\"rowkey\", \"col\":\"key\", \"type\":\"string\"}," + 56 | "\"name\":{\"cf\":\"cf\", \"col\":\"name\", \"type\":\"string\"}" + 57 | "}" + 58 | "}"; 59 | 60 | Map optionsMap = new HashMap(); 61 | optionsMap.put(HBaseTableCatalog.tableCatalog(), catalog); 62 | 63 | Dataset ds= spark.createDataFrame(Arrays.asList( 64 | new SampleData("key1", "foo"), 65 | new SampleData("key2", "bar")), SampleData.class); 66 | 67 | // Write to HBase 68 | ds.write() 69 | .format("org.apache.hadoop.hbase.spark") 70 | .options(optionsMap) 71 | .option("hbase.spark.use.hbasecontext", "false") 72 | .mode("overwrite") 73 | .save(); 74 | 75 | // Read from HBase 76 | Dataset dataset = spark.read() 77 | .format("org.apache.hadoop.hbase.spark") 78 | .options(optionsMap) 79 | .option("hbase.spark.use.hbasecontext", "false") 80 | .load(); 81 | dataset.show(); 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /spark-tensorflow/config-standard.yaml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | scaleTier: STANDARD_1 3 | -------------------------------------------------------------------------------- /spark-tensorflow/doc/ml-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudDataproc/cloud-dataproc/cb0ced6a30bc88a8ddc58d2870c217273be4958c/spark-tensorflow/doc/ml-pipeline.png -------------------------------------------------------------------------------- /spark-tensorflow/doc/prereqs.md: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | 3 | 1. You are familiar with [Python](https://www.python.org/). 4 | 5 | 2. You have [TensorFlow set up](https://www.tensorflow.org/install/). We will 6 | use the [Python API](https://www.tensorflow.org/api_docs/python/) to communicate 7 | with it. 8 | 9 | 3. You have [Spark set up](https://spark.apache.org/docs/latest/) and 10 | you know [the basics](https://spark.apache.org/docs/latest/quick-start.html). 11 | 12 | 4. 
You are familiar with [scala](http://scala-lang.org/) and have [SBT](http://www.scala-sbt.org/) installed. We will use scala to communicate with Spark. 13 | 14 | 5. You have access to the [Criteo Kaggle data set](http://labs.criteo.com/2014/02/kaggle-display-advertising-challenge-dataset/). Unless you are interested in 15 | stress testing Cloud products, I highly recommend making it smaller. 16 | 17 | 6. The data should be available to your Spark cluster. If you intend to run this 18 | example on [Google Cloud Platform](https://cloud.google.com/), this means 19 | putting it in Google Cloud Storage: 20 | a. [Sign up for free tier](https://cloud.google.com/free/) if you haven't already. 21 | b. [Create a project](https://cloud.google.com/resource-manager/docs/creating-managing-projects). 22 | c. Store your data set(s) into a [Google Cloud Storage 23 | bucket](https://cloud.google.com/storage/) associated with your project. 24 | 25 | 7. If you plan to do use [Dataproc](https://cloud.google.com/dataproc/docs/) for 26 | preprocessing or [Machine Learning 27 | Engine](https://cloud.google.com/ml-engine/docs/) for training/prediction, 28 | please [enable them in your 29 | project](https://support.google.com/cloud/answer/6158841?hl=en). 30 | 31 | - - - 32 | 33 | [Home](../README.md) 34 | -------------------------------------------------------------------------------- /spark-tensorflow/doc/spark-package.md: -------------------------------------------------------------------------------- 1 | # Packaging the Criteo preprocessor 2 | 3 | Before we can preprocess the Criteo data, we must package our Spark script into 4 | a JAR file, as well as the `spark-tensorflow-connector` and the 5 | [scopt](https://github.com/scopt/scopt) library, which we use to parse arguments 6 | to our Spark script. We will submit the JAR files for these three components to 7 | our Spark cluster together in order to run preprocessing jobs. 8 | 9 | This is as simple as going to the [preprocessor directory](../prepare) and 10 | running: 11 | `$ sbt clean package` 12 | 13 | This will create a JAR file whose path (relative to the root directory for this 14 | project) is `prepare/target/scala-2.11/criteo-prepare_2.11-1.0.jar`. Let us set 15 | the variable 16 | ``` 17 | $ CRITEO_JAR=prepare/target/scala-2.11/criteo-prepare_2.11-1.0.jar 18 | ``` 19 | 20 | ## spark-tensorflow-connector 21 | 22 | You will have to provide Spark with the JAR file to this component as, 23 | otherwise, it will not know how to write a DataFrame into `.tfrecords` files. 24 | 25 | To build the `spark-tensorflow-connector` JAR, you can [follow the 26 | instructions in the tensorflow/ecosystem 27 | docs](https://github.com/tensorflow/ecosystem/tree/master/spark/spark-tensorflow-connector). 28 | (Don't worry, this is painless.) 29 | 30 | Let us set the variable 31 | ``` 32 | $ SPARK_TF_CONNECTOR_JAR= 33 | ``` 34 | 35 | ## scopt 36 | 37 | We will use [scopt](https://github.com/scopt/scopt) to process arguments to our 38 | Spark script. We will provide the package information to Spark when we run the 39 | job. 
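For reference, if you prefer to bundle scopt yourself rather than pass it with `--packages`, the corresponding sbt dependency would look roughly like this; the version shown matches the one used in the submission examples, but check `prepare/build.sbt` for the authoritative setting:

```
libraryDependencies += "com.github.scopt" %% "scopt" % "3.6.0"
```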
40 |  41 | - - - 42 | 43 | [Home](../README.md) 44 | -------------------------------------------------------------------------------- /spark-tensorflow/doc/spark-submit.md: -------------------------------------------------------------------------------- 1 | # Submitting a Spark Preprocessing job 2 | 3 | Let us assume that the input data is stored under the path 4 | `$BASE_DIR/$CRITEO_DATASET` and that we want the `.tfrecords` files to be 5 | written to `$BASE_DIR/$OUTPUT_DIR`. 6 | 7 | Let us set either `MODE=analyze` or `MODE=transform`. 8 | 9 | Mode `analyze` uses the training data to create 10 | artifacts that are either consumed by the `transform` step or fed directly 11 | into the TF graph. The artifacts created are the average value 12 | of each integer feature (which is used to replace nulls) and the 13 | frequency rank of each categorical feature value, which 14 | is used as the categorical value in the TF graph. 15 | Mode `transform` applies these artifacts to convert the input data into `.tfrecords` files. 16 | 17 | ### Local preprocessing 18 | 19 | ``` 20 | $ spark-submit --master local --class \ 21 | com.google.cloud.ml.samples.criteo.CriteoPreprocessingApplication \ 22 | --packages com.github.scopt:scopt_2.11:3.6.0 --jars \ 23 | "$SPARK_TF_CONNECTOR_JAR" $CRITEO_JAR --base $BASE_DIR \ 24 | --in $CRITEO_DATASET --out $OUTPUT_DIR -m $MODE 25 | ``` 26 | 27 | ### Cloud preprocessing 28 | 29 | Alternatively, you can use [Dataproc](https://cloud.google.com/dataproc/docs/) 30 | to perform preprocessing in the cloud. 31 | 32 | Begin by [creating a Dataproc 33 | cluster](https://cloud.google.com/dataproc/docs/guides/create-cluster). Store 34 | its name as `CLUSTER=`. 35 | 36 | Make sure that your data lives in a Google Cloud Storage bucket accessible to 37 | your Dataproc cluster as described in the [Prerequisites](./prereqs.md). Store 38 | the bucket URL under the `BUCKET` variable, and let the `CRITEO_DATASET` and 39 | `OUTPUT_DIR` variables be defined as above, relative to the GCS bucket URL. 40 | 41 | Then: 42 | ``` 43 | $ gcloud dataproc jobs submit spark --cluster $CLUSTER \ 44 | --properties spark.jars.packages=com.github.scopt:scopt_2.11:3.6.0 \ 45 | --jars "$SPARK_TF_CONNECTOR_JAR,$CRITEO_JAR" \ 46 | --class "com.google.cloud.ml.samples.criteo.CriteoPreprocessingApplication" \ 47 | -- \ 48 | -b $BUCKET -i $CRITEO_DATASET -o $OUTPUT_DIR -m $MODE -x 49 | ``` 50 | 51 | - - - 52 | 53 | [Home](../README.md) 54 | -------------------------------------------------------------------------------- /spark-tensorflow/doc/tf-serve.md: -------------------------------------------------------------------------------- 1 | # Serving the trained classifier on Cloud ML Engine 2 | 3 | ## Finding saved models 4 | 5 | The trainer exports saved models to the following directory: 6 | ``` 7 | $JOB_DIR/export/Servo/ 8 | ``` 9 | where `$JOB_DIR` is the directory you passed as the `--job-dir` argument to the 10 | trainer (as in the previous section). 11 | 12 | Within the `Servo` directory, models are stored in folders 13 | named by the epoch time at which they were exported. 14 | 15 | ## Hosting on Cloud ML Engine 16 | 17 | There are two concepts you should be familiar with when hosting your TensorFlow 18 | models using Cloud ML Engine: 19 | 20 | 1. models, each of which corresponds to a single TensorFlow graph 21 | 22 | 2. versions, each of which corresponds to the result of training that graph 23 | 24 | A single model can potentially have multiple versions, each of which 25 | corresponds to a distinct prediction service.
The classifier we have trained 26 | will be a single version of a model. 27 | 28 | To begin with, then, we must create a Cloud ML Engine model to hold our 29 | classifier. Let us call our model `sparky`: 30 | ``` 31 | $ gcloud ml-engine models create "sparky" 32 | ``` 33 | 34 | Now let us upload our latest export of the trained model as version `alpha` of 35 | the `sparky` model. Assuming that 36 | `MODEL_EXPORT=$JOB_DIR/export/Servo/` is the location of 37 | the saved model, you can create version `alpha` from your command line by: 38 | ``` 39 | $ gcloud ml-engine versions create "alpha" --model=sparky --origin=$MODEL_EXPORT 40 | ``` 41 | 42 | ## Making predictions 43 | 44 | In order to make predictions using this hosted model, we have to apply the same 45 | preprocessing to the input data that we applied when we trained the model. In 46 | this version of the guide, we only show how to do this in the case of 47 | batch predictions. In the next version of the guide, we will show how this can 48 | be done for online predictions using structured streaming in Spark. 49 | 50 | As a stopgap, one could also use the model artifacts exported by the Spark 51 | training job (the CSV file of categorical value counts stored on the path 52 | specified by the `-x` argument to the Spark job) to manually preprocess a data 53 | stream and perform [online prediction using Cloud Machine Learning 54 | Engine](https://cloud.google.com/ml-engine/docs/concepts/prediction-overview#online_prediction_versus_batch_prediction). 55 | 56 | ### Batch prediction 57 | 58 | Begin by preprocessing your data in `prediction` mode following the instructions 59 | in the [Preprocessing](#preprocessing) section so that the preprocessed data 60 | meant for prediction is stored in `$OUTPUT_DIR`. If you used Dataproc to 61 | preprocess the data for prediction, `$OUTPUT_DIR` is a Cloud Storage bucket. If 62 | not, then you will have to move your data to Cloud Storage to make it available 63 | for batch prediction. You can do so using the [gsutil 64 | tool](https://cloud.google.com/storage/docs/gsutil). 65 | 66 | Once your data is on GCS, please set the `GCS_PATH` environment variable to 67 | reflect the path to the Cloud Storage directory containing the preprocessed data 68 | meant for prediction (with NO trailing `/`). We will store the predictions in the 69 | `$GCS_PATH/predictions` directory. 70 | 71 | Set the `PREDICTION_SERVICE_REGION` environment variable to the GCP region in 72 | which your prediction service is running (most likely 73 | `PREDICTION_SERVICE_REGION="us-central1"`).
74 |  75 | To queue up the batch prediction job: 76 | ``` 77 | $ gcloud ml-engine jobs submit prediction sparky_alpha_1 \ 78 | --model=sparky --version=alpha \ 79 | --data-format=TF_RECORD \ 80 | --input-paths="$GCS_PATH/part-*" \ 81 | --output-path="$GCS_PATH/predictions" \ 82 | --region="$PREDICTION_SERVICE_REGION" 83 | ``` 84 | 85 | The job runs asynchronously, so if you would like to monitor its status, 86 | you can either view the logs on the Google Cloud Platform web dashboard or you 87 | can use the following command at your command line: 88 | ``` 89 | $ gcloud ml-engine jobs stream-logs sparky_alpha_1 90 | ``` 91 | 92 | Alternatively, you can check on the status of the job using 93 | ``` 94 | $ gcloud ml-engine jobs describe sparky_alpha_1 95 | ``` 96 | 97 | Once the job has completed, the output will be written to the 98 | `$GCS_PATH/predictions` directory in parts, with each line of each part 99 | containing a distinct JSON object with two keys: 100 | 101 | 1. `classes` - 0 and 1, the two classes that our linear classifier chooses 102 | between, presented as the array `["0", "1"]` 103 | 104 | 2. `scores` - an array representing a probability vector, with the entry at 105 | index `i` representing the probability that the corresponding input belongs 106 | to class `i`. 107 | 108 | - - - 109 | 110 | [Home](../README.md) 111 | -------------------------------------------------------------------------------- /spark-tensorflow/doc/tf-train.md: -------------------------------------------------------------------------------- 1 | # Training the linear classifier 2 | 3 | Once you have preprocessed your training data in training mode and some 4 | evaluation data in evaluation mode, you can use them to train the 5 | TensorFlow classifier. For this, whether you run your training 6 | job locally or on Google Cloud Platform, you will be using [this 7 | code](../trainer/task.py). 8 | 9 | ## Preprocess your artifacts 10 | 11 | The artifact files as written by Spark need one more preprocessing step before 12 | they are ready for consumption by the TensorFlow graph. Specifically, the files 13 | need to be renamed, and a count of each categorical feature's values must 14 | be generated. 15 | 16 | This preprocessing can be done either on a local directory with the `preprocess_artifacts_local.py` command: 17 | 18 | ``` 19 | $ python preprocess_artifacts_local.py 20 | ``` 21 | 22 | or on a GCS bucket with the `preprocess_artifacts_gcs.py` command: 23 | 24 | ``` 25 | $ python preprocess_artifacts_gcs.py 26 | ``` 27 | 28 | ## Command line arguments 29 | 30 | Regardless of how you run the training job, you will *have* to pass it the 31 | following arguments: 32 | 33 | 1. `--job-dir`: Location where TensorFlow should store the model exported by 34 | the training job. 35 | 36 | 2. `--train-dir`: Directory containing training data. 37 | 3. `--eval-dir`: Directory containing evaluation data. 38 | 39 | Let us assume that the values for each of these arguments are stored under 40 | `$JOB_DIR, $TRAIN_DIR, $EVAL_DIR` respectively. 41 | 42 | You may also *optionally* pass the following arguments: 43 | 44 | 1. `--batch-size`: This is the prescribed size of each training batch. 45 | 46 | 2. `--train-steps`: This is the number of steps of training that should be 47 | performed. If you do not specify this parameter, the model will train 48 | indefinitely.
49 | 50 | ## Training locally 51 | 52 | Begin by setting up your environment with: 53 | ``` 54 | $ pip install -r requirements.txt 55 | ``` 56 | 57 | (It is highly recommended that you do this in a fresh Python 2 virtual 58 | environment.) 59 | 60 | With your environment set up, you can run the training job by: 61 | ``` 62 | $ python task.py --job-dir $JOB_DIR --train-dir $TRAIN_DIR --eval-dir $EVAL_DIR 63 | ``` 64 | 65 | If you want to monitor how your model is faring against your evaluation data, 66 | you can run: 67 | ``` 68 | $ tensorboard --logdir=$JOB_DIR 69 | ``` 70 | and go to `http://localhost:6006` in your browser. 71 | 72 | ## Training in the cloud 73 | 74 | Make sure that you have enabled MLEngine. Decide upon a name for your training 75 | job, and store it under the `JOB` environment variable. Then: 76 | ``` 77 | gcloud ml-engine jobs submit training $JOB --stream-logs --runtime-version 1.2 \ 78 | --job-dir $JOB_DIR \ 79 | --module-name trainer.task --package-path trainer --region "us-central1" \ 80 | -- --train-dir $TRAIN_DIR --eval-dir $EVAL_DIR --artifact-dir $ARTIFACT_DIR 81 | ``` 82 | 83 | (Note, the extra `--` on the last line of the command above is not a typo. That 84 | is the pattern used in the `gcloud` command to signify that all subsequent 85 | arguments are not for `gcloud` itself but are rather parameters to the program 86 | being executed by `gcloud`. In this case, `trainer.task`.) 87 | 88 | If you are using a sizable portion of the Kaggle challenge data set, you may 89 | want to run several batches in parallel to speed up training. To do so, you can 90 | specify MLEngine configuration as outlined 91 | [here](https://cloud.google.com/ml-engine/docs/concepts/training-overview). The 92 | TL;DR version is that you should make a YAML file representing a 93 | [TrainingInput](https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#traininginput) 94 | argument to the MLEngine API and pass this YAML file to `gcloud ml-engine jobs 95 | submit training` under the `--config` argument. 
96 | 97 | This project provides a [sample YAML file](./config-standard.yaml) which can 98 | be passed in the training command as follows: 99 | ``` 100 | gcloud ml-engine jobs submit training $JOB --stream-logs --runtime-version 1.2 \ 101 | --job-dir $JOB_DIR \ 102 | --module-name trainer.task --package-path trainer --region "us-central1" \ 103 | --config config-standard.yaml \ 104 | -- --train-dir $TRAIN_DIR --eval-dir $EVAL_DIR 105 | ``` 106 | 107 | - - - 108 | 109 | [Home](../README.md) 110 | -------------------------------------------------------------------------------- /spark-tensorflow/gcloud-tests/config.yaml: -------------------------------------------------------------------------------- 1 | trainingInput: 2 | scaleTier: BASIC 3 | -------------------------------------------------------------------------------- /spark-tensorflow/gcloud-tests/request.json: -------------------------------------------------------------------------------- 1 | {"categorical-feature-3": [""], "categorical-feature-2": [""], "categorical-feature-1": [""], "categorical-feature-7": [""], "categorical-feature-6": [""], "categorical-feature-5": [""], "categorical-feature-4": [""], "categorical-feature-9": [""], "categorical-feature-8": [""], "integer-feature-1": [0], "categorical-feature-22": [""], "integer-feature-3": [0], "integer-feature-2": [0], "integer-feature-5": [0], "integer-feature-4": [0], "integer-feature-7": [0], "integer-feature-6": [0], "integer-feature-9": [0], "integer-feature-8": [0], "categorical-feature-20": [""], "categorical-feature-21": [""], "categorical-feature-26": [""], "id": "1", "categorical-feature-24": [""], "categorical-feature-25": [""], "categorical-feature-13": [""], "categorical-feature-12": [""], "categorical-feature-11": [""], "categorical-feature-10": [""], "categorical-feature-17": [""], "categorical-feature-16": [""], "categorical-feature-15": [""], "categorical-feature-14": [""], "categorical-feature-19": [""], "categorical-feature-18": [""], "categorical-feature-23": [""], "integer-feature-11": [0], "integer-feature-10": [0], "integer-feature-13": [0], "integer-feature-12": [0]} -------------------------------------------------------------------------------- /spark-tensorflow/gcloud-tests/test-tf-tsv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright 2017 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # This script makes the following assumptions: 18 | # 1. gcloud is configured with credentials that have been set up to run these 19 | # tests. 20 | # 2. This means that the credentials should provide access to a GCS bucket 21 | # containing the data required to run the tests. 22 | # 3. The path to this GCS bucket will be provided using the GCS_BUCKET 23 | # environment variable. 24 | # 4. The path will be provided prefixed with a "gs://" and will have no trailing 25 | # backslash. 26 | # 5. 
The GCS bucket will contain a /preprocessed-data subpath containing the 27 | # files and subdirectories in the ../trainer/tests directory (path relative 28 | # to this file). 29 | 30 | DATA_DIR="$GCS_BUCKET/preprocessed-data" 31 | ARTIFACTS_DIR="$DATA_DIR/artifacts" 32 | 33 | TEST_TIME=$(date +%s) 34 | JOB_NAME=test_job_$TEST_TIME 35 | JOB_DIR="$GCS_BUCKET/$JOB_NAME" 36 | JOB_CONFIG="gcloud-tests/config.yaml" 37 | 38 | REGION="us-central1" 39 | 40 | 41 | echo "Submitting training job..." 42 | 43 | gcloud ml-engine jobs submit training $JOB_NAME \ 44 | --runtime-version=1.2 \ 45 | --job-dir=$JOB_DIR \ 46 | --module-name=trainer.task \ 47 | --package-path trainer \ 48 | --region $REGION \ 49 | --config=$JOB_CONFIG \ 50 | --quiet \ 51 | -- \ 52 | --data-format=tsv \ 53 | --train-dir=${DATA_DIR}/ \ 54 | --eval-dir=${DATA_DIR}/ \ 55 | --artifact-dir=${ARTIFACTS_DIR}/ \ 56 | --batch-size=2 \ 57 | --train-steps=10 \ 58 | --eval-steps=1 \ 59 | --learning-rate=0.5 \ 60 | --min-eval-frequency=0 61 | 62 | while : 63 | do 64 | sleep 30 65 | echo "Polling ML Engine for status of training job: $JOB_NAME" 66 | STATUS=$(gcloud ml-engine jobs list --filter="jobId=$JOB_NAME" --format="value(state)") 67 | echo "Status: $STATUS" 68 | if [[ $STATUS == "SUCCEEDED" || $STATUS == "FAILED" ]]; then 69 | break 70 | fi 71 | done 72 | 73 | if [[ $STATUS != "SUCCEEDED" ]]; then 74 | exit 1 75 | fi 76 | 77 | 78 | MODEL_NAME=test_model 79 | MODEL_VERSION=v$TEST_TIME 80 | 81 | ORIGIN=$(gsutil ls "$JOB_DIR/**/saved_model.pb" | sed 's/\(.\)saved_model.pb/\1/g') 82 | 83 | echo "Training succeeded. Creating model from saved model at $ORIGIN ..." 84 | 85 | gcloud ml-engine versions create $MODEL_VERSION \ 86 | --model=$MODEL_NAME \ 87 | --origin=$ORIGIN \ 88 | --runtime-version=1.2 89 | 90 | gcloud ml-engine predict \ 91 | --model $MODEL_NAME \ 92 | --version $MODEL_VERSION \ 93 | --json-instances ./gcloud-tests/request.json 94 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM hseeberger/scala-sbt 2 | ADD . /scala_app 3 | CMD cd /scala_app && sbt test 4 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/build.sbt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | name := "criteo-prepare" 18 | 19 | version := "1.0" 20 | 21 | scalaVersion := "2.11.11" 22 | 23 | libraryDependencies ++= Seq( 24 | "org.apache.spark" % "spark-core_2.11" % "2.1.1", 25 | "org.apache.spark" % "spark-sql_2.11" % "2.1.1", 26 | "org.apache.spark" % "spark-mllib_2.11" % "2.1.1", 27 | "com.github.scopt" % "scopt_2.11" % "3.6.0" 28 | ) 29 | 30 | libraryDependencies += "org.scalactic" %% "scalactic" % "3.0.1" 31 | libraryDependencies += "org.scalatest" %% "scalatest" % "3.0.1" % "test" 32 | 33 | parallelExecution in Test := false 34 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/create-cluster.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gcloud dataproc clusters create $CLUSTER --properties "yarn:yarn.nodemanager.vmem-check-enabled=false" --zone us-west1-c --master-machine-type n1-standard-8 --master-boot-disk-size 500 --num-workers 4 --worker-machine-type n1-standard-4 --worker-boot-disk-size 500 --project cloudml-spark-tf-connector 4 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.15 -------------------------------------------------------------------------------- /spark-tensorflow/prepare/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0") -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/main/scala/com/google/cloud/ml/samples/criteo/ArtifactExporter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.apache.spark.sql.{DataFrame, SparkSession} 20 | 21 | 22 | trait ArtifactExporter { 23 | def export(column: String, df: DataFrame) 24 | } 25 | 26 | 27 | class FileArtifactExporter (val outputPath: String) 28 | (implicit val spark: SparkSession) 29 | extends ArtifactExporter 30 | { 31 | 32 | def export(prefix: String, df: DataFrame): Unit = { 33 | val fullOutputPath = outputPath + "/" + prefix 34 | df.repartition(1).write.format("csv").save(fullOutputPath) 35 | } 36 | } 37 | 38 | class EmptyArtifactExporter 39 | extends ArtifactExporter { 40 | 41 | var exported: Option[Array[Seq[Any]]] = None 42 | 43 | override def export(column: String, df: DataFrame): Unit = { 44 | exported = Some(df.collect.map(_.toSeq.map(_.toString))) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/main/scala/com/google/cloud/ml/samples/criteo/CriteoAnalyzer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.apache.spark.sql.{DataFrame, SparkSession} 20 | import org.apache.spark.sql.types.StructType 21 | 22 | 23 | class CriteoAnalyzer(val inputPath: String, 24 | val schema: StructType, 25 | val features: CriteoFeatures, 26 | val numPartitions: Integer, 27 | val indexer: TrainingIndexer, 28 | val importer: CriteoImporter, 29 | val artifactExporter: ArtifactExporter) 30 | (implicit val spark: SparkSession) { 31 | 32 | def analyze() { 33 | val missingReplacer = new CriteoMissingReplacer() 34 | 35 | val cleanedDf = importer.criteoImport 36 | val noNonNullDf = cleanedDf.na.fill("null") 37 | val filledDf = noNonNullDf.na.replace(noNonNullDf.columns, Map("" -> "null")) 38 | 39 | val averages = missingReplacer.getAverageIntegerFeatures( 40 | filledDf, features.integerFeatureLabels) 41 | averages.foreach { 42 | case (col: String, df: DataFrame) => 43 | artifactExporter.export(col, df) 44 | } 45 | 46 | val valueCounts = indexer.getCategoricalFeatureValueCounts(filledDf) 47 | val vocabularies = indexer.getCategoricalColumnVocabularies(valueCounts) 48 | 49 | vocabularies.foreach { 50 | case (col: String, df: DataFrame) => 51 | artifactExporter.export(features.categoricalLabelMap(col), df) 52 | } 53 | 54 | } 55 | 56 | def apply(): Unit = analyze 57 | } 58 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/main/scala/com/google/cloud/ml/samples/criteo/CriteoExporter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.apache.spark.sql._ 20 | 21 | trait CriteoExporter { 22 | def criteoExport(df: DataFrame): Unit 23 | } 24 | 25 | 26 | class FileExporter(val outputPath: String, val format: String) 27 | (implicit val spark: SparkSession) 28 | extends CriteoExporter 29 | { 30 | def criteoExport(df: DataFrame): Unit = df.write.format(format).save(outputPath) 31 | } 32 | 33 | 34 | class FileStreamExporter(val outputPath: String, val format: String) 35 | (implicit val spark: SparkSession) 36 | extends CriteoExporter 37 | { 38 | def criteoExport(df: DataFrame): Unit = { 39 | df.writeStream.format(format). 40 | option("checkpointLocation", outputPath ++ "/checkpoints"). 41 | start(outputPath).awaitTermination() 42 | } 43 | } 44 | 45 | 46 | class TestExporter extends CriteoExporter { 47 | var exported: Option[Array[Seq[Any]]] = None 48 | 49 | def criteoExport(df: DataFrame): Unit = { 50 | exported = Some(df.collect.map(_.toSeq.map(_.toString))) 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/main/scala/com/google/cloud/ml/samples/criteo/CriteoFeatures.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 20 | 21 | /** 22 | * CriteoFeatures objects maintain information about the features to be preprocessed in a Criteo 23 | * data set. 24 | */ 25 | case class CriteoFeatures() { 26 | val integerFeatureLabels: Seq[String] = (1 to 13).map(index => s"integer-feature-$index") 27 | val categoricalFeatureLabels: Seq[String] = (1 to 26).map(index => s"categorical-feature-$index") 28 | val categoricalRawLabels: Seq[String] = categoricalFeatureLabels.map({label => label + "-raw"}) 29 | val clickedLabel = Seq("clicked") 30 | 31 | val inputLabels: Seq[String] = clickedLabel ++ integerFeatureLabels ++ categoricalRawLabels 32 | 33 | val integralColumns: Seq[String] = inputLabels. 
34 | filterNot(label => categoricalRawLabels.contains(label)) 35 | 36 | // Correspondence between labels in the input data and labels in the preprocessed data 37 | val categoricalLabelMap: Map[String, String] = 38 | Map(categoricalRawLabels.zip(categoricalFeatureLabels): _*) 39 | 40 | // DataFrame schema of the input data 41 | val inputSchema: StructType = StructType(inputLabels.map(StructField(_, StringType))) 42 | 43 | val outputLabels: Seq[String] = clickedLabel ++ integerFeatureLabels ++ categoricalFeatureLabels 44 | } 45 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/main/scala/com/google/cloud/ml/samples/criteo/CriteoImporter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.apache.spark.sql._ 20 | import org.apache.spark.sql.types._ 21 | 22 | 23 | trait CriteoImporter { 24 | def criteoImport: DataFrame 25 | 26 | } 27 | 28 | class CleanTSVImporter(val inputPath: String, 29 | val schema: StructType, 30 | val numPartitions: Int) 31 | (implicit val spark: SparkSession) 32 | extends CriteoImporter { 33 | def criteoImport: DataFrame = { 34 | val rawDf = spark.read.format("csv"). 35 | option("sep", "\t"). 36 | schema(schema). 37 | load(inputPath). 38 | repartition(numPartitions) 39 | 40 | rawDf.na.fill("", rawDf.columns) 41 | } 42 | } 43 | 44 | 45 | trait VocabularyImporter { 46 | def loadFeatureVocabularies(): Map[String, DataFrame] 47 | } 48 | 49 | class ArtifactVocabularyImporter(features: CriteoFeatures, 50 | inputPath: String) 51 | (implicit val spark: SparkSession) 52 | extends VocabularyImporter { 53 | 54 | def loadFeatureVocabularies(): Map[String, DataFrame] = { 55 | features.categoricalRawLabels.map(catFeature => { 56 | val schema = StructType(Seq( 57 | StructField("value-" ++ catFeature, StringType), 58 | StructField("index-" ++ catFeature, LongType))) 59 | (catFeature, spark.read.format("csv").schema(schema) 60 | .load(inputPath ++ "/" + features.categoricalLabelMap(catFeature) + 61 | "/*.csv" 62 | )) 63 | }).toMap 64 | } 65 | } 66 | 67 | class TestVocabularyImporter(vocabularies: Map[String, DataFrame])(implicit val spark: SparkSession) 68 | extends VocabularyImporter { 69 | override def loadFeatureVocabularies(): Map[String, DataFrame] = vocabularies 70 | } 71 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/main/scala/com/google/cloud/ml/samples/criteo/CriteoIndexer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.apache.spark.sql._ 20 | import org.apache.spark.sql.types._ 21 | 22 | object IndexerModelSchema { 23 | val schema = StructType(Seq(StructField("feature", StringType), 24 | StructField("value", StringType), 25 | StructField("count", LongType))) 26 | } 27 | 28 | trait CriteoIndexer { 29 | type IndexerResource 30 | 31 | def features: CriteoFeatures 32 | 33 | implicit val spark: SparkSession 34 | 35 | import spark.implicits._ 36 | 37 | /** 38 | * Creates a DataFrame containing the count of each value for each feature 39 | * 40 | * @return A DataFrame with three columns: "feature" (specifies categorical feature), "value" 41 | * (specifies a particular value for that feature), and "count" (specifies number of times 42 | * that value appeared for that feature in the training data). 43 | */ 44 | def getCategoricalFeatureValueCounts(resource: IndexerResource): DataFrame 45 | 46 | /** 47 | * Constructs an embedding from the set of feature values to the positive integers for each of 48 | * the feature columns in a Criteo data set. Expects to be provided with value counts for each 49 | * of the features. 50 | * 51 | * @param categoricalFeatureValueCounts Value counts as provided by the 52 | * `categoricalColumnValueCounts` method. 53 | * @return Map from feature name to embedding table DataFrame. Columns in each DataFrame are 54 | * "value", "index". 55 | */ 56 | def getCategoricalColumnVocabularies(categoricalFeatureValueCounts: DataFrame): 57 | Map[String, DataFrame] = 58 | features.categoricalRawLabels.map(label => { 59 | (label, spark.createDataFrame( 60 | categoricalFeatureValueCounts. 61 | filter($"feature" === label). 62 | rdd. 63 | map(row => row.get(1)). 64 | zipWithIndex.map(pair => Row(pair._1, pair._2)), 65 | StructType(Seq( 66 | StructField("value-" ++ label, StringType), 67 | StructField("index-" ++ label, LongType))) 68 | )) 69 | }).toMap 70 | } 71 | 72 | 73 | class TrainingIndexer(val features: CriteoFeatures) 74 | (implicit val spark: SparkSession) 75 | extends CriteoIndexer { 76 | 77 | import spark.implicits._ 78 | 79 | type IndexerResource = DataFrame 80 | 81 | def getCategoricalFeatureValueCounts(df: DataFrame): DataFrame = { 82 | val categoricalRawLabels = spark.sparkContext.broadcast(features.categoricalRawLabels) 83 | 84 | // categoricalValues tabulates each observed feature value tagged by feature, with repetition 85 | val categoricalValues = df.flatMap(row => { 86 | categoricalRawLabels.value. 87 | map { label => (label, row.getAs[String](label)) } 88 | }).toDF("feature", "value") 89 | 90 | val vocabularies = categoricalValues. 91 | groupBy("feature", "value"). 92 | count. 93 | toDF("feature", "value", "count"). 
94 | sort("count") 95 | 96 | vocabularies.cache() 97 | 98 | vocabularies 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/main/scala/com/google/cloud/ml/samples/criteo/CriteoMissingReplacer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.apache.spark.sql._ 20 | import org.apache.spark.sql.functions.avg 21 | 22 | /** 23 | * Missing replacer replaces all null values in a Dataframe with empty strings, 24 | * and replaces missing integer values with the average of all integer 25 | * in that column. 26 | * 27 | * @param spark Spark session 28 | */ 29 | class CriteoMissingReplacer()(implicit val spark: SparkSession) { 30 | 31 | import spark.implicits._ 32 | 33 | /** 34 | * Calculates a map of integer columns to their average values. 35 | * 36 | * @param to_average_df The DataFrame with integer columns to get averages of 37 | * @param features The column names of the integer features 38 | * @return A map from integer column names to their averages 39 | */ 40 | def getAverageIntegerFeatures(to_average_df: DataFrame, 41 | features: Seq[String]): Map[String, DataFrame] = { 42 | val integerFeaturesDf = to_average_df. 43 | select(features.head, features.tail: _*). 44 | toDF 45 | 46 | integerFeaturesDf. 47 | columns. 48 | map { col_name => 49 | val avg_col = integerFeaturesDf.select(avg($"$col_name")) 50 | (col_name, avg_col) 51 | }.toMap 52 | } 53 | 54 | /** 55 | * Replaces the integer values with their averages. 56 | * 57 | * @param toReplaceDf Dataframe with values to replace. 58 | * @param features Set of integer features column names. 59 | * @param averages Map of integer feature column names to their averages. 60 | * @return The DataFrame with null values replaced with the averages. 61 | */ 62 | def replaceIntegerFeatures(toReplaceDf: DataFrame, 63 | features: Seq[String], 64 | averages: Map[String, DataFrame]): DataFrame = { 65 | val filledDf = toReplaceDf.na.fill("", features) 66 | 67 | features.foldLeft(filledDf)((df, col) => { 68 | df.na.replace( 69 | col, 70 | Map("" -> averages(col).head().getDouble(0).toString) 71 | ) 72 | }) 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/main/scala/com/google/cloud/ml/samples/criteo/CriteoPreprocessingApplication.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.apache.spark.sql.{DataFrame, SparkSession} 20 | import scopt.OptionParser 21 | 22 | 23 | /** 24 | * Union of the different modes in which preprocessing can be done. 25 | */ 26 | sealed trait PreprocessingMode 27 | 28 | case object Analyze extends PreprocessingMode 29 | 30 | case object Transform extends PreprocessingMode 31 | 32 | /** 33 | * Converts string for mode into the appropriate `PreprocessingMode` object. 34 | */ 35 | object PreprocessingMode { 36 | def apply(specifier: String): Option[PreprocessingMode] = specifier.toLowerCase match { 37 | case "analyze" => Some(Analyze) 38 | case "transform" => Some(Transform) 39 | case _ => None 40 | } 41 | } 42 | 43 | 44 | case class NewClargConfig(basePath: String = "", 45 | relativeInputPath: String = "", 46 | relativeOutputPath: String = "", 47 | mode: PreprocessingMode = Analyze, 48 | numPartitions: Int = 500) 49 | 50 | object CriteoPreprocessingApplication { 51 | 52 | def main(args: Array[String]) { 53 | val parser = new OptionParser[NewClargConfig]("Criteo TFRecord Preprocessor") { 54 | head("CriteoPreprocessingApplication", "1.0.0") 55 | 56 | help("help").text("Prints this description of the CLI to the Criteo TFRecord Preprocessor") 57 | 58 | opt[String]('b', "base").required.action((b, c) => c.copy(basePath = b)).text( 59 | "The base path along which the application should find inputs and store outputs. Required." 60 | ) 61 | 62 | opt[String]('i', "in").required.action((i, c) => c.copy(relativeInputPath = i)).text( 63 | "The pattern relative to the base path which the input files match. Required." 64 | ) 65 | 66 | opt[String]('o', "out").required.action((o, c) => c.copy(relativeOutputPath = o)).text( 67 | "The relative path to the directory in which the resulting transformed TFRecord files" + 68 | " or analyze artifacts should be stored." 69 | ) 70 | 71 | opt[Int]('n', "numPartitions").action((n, c) => c.copy(numPartitions = n)).text( 72 | "The number of partitions in which to process the input file. Default is 500." 73 | ) 74 | 75 | opt[String]('m', "mode").action( 76 | (m, c) => { 77 | val mod = PreprocessingMode(m) 78 | c.copy(mode = 79 | mod match { 80 | case Some(mod) => mod 81 | case None => 82 | throw new Exception("Illegal mode passed under -m or --mode." + 83 | "Pass \"analyze\", \"transform\".") 84 | }) 85 | } 86 | ).text( 87 | "\"analyze\", \"transform\"" 88 | ) 89 | } 90 | parser.parse(args, NewClargConfig()) match { 91 | case Some(config) => 92 | implicit val spark = SparkSession.builder(). 93 | appName("Criteo TFRecord Preprocessor"). 
94 | getOrCreate() 95 | 96 | val inputPath = config.basePath ++ config.relativeInputPath 97 | val outputPath = config.basePath ++ config.relativeOutputPath 98 | val artifactPath = config.basePath ++ "artifacts/" 99 | 100 | val features = CriteoFeatures() 101 | 102 | val artifactExporter = config.mode match { 103 | case Analyze => new FileArtifactExporter(config.basePath ++ "artifacts/") 104 | case _ => new EmptyArtifactExporter() 105 | } 106 | 107 | val indexer = new TrainingIndexer(features) 108 | val importer = new CleanTSVImporter(inputPath, 109 | features.inputSchema, 110 | config.numPartitions) 111 | 112 | config.mode match { 113 | case Analyze => 114 | val analyzer = new CriteoAnalyzer(inputPath, features.inputSchema, 115 | features, config.numPartitions, indexer, importer, artifactExporter) 116 | analyzer() 117 | case Transform => 118 | val vocabularyImporter = new ArtifactVocabularyImporter(features, artifactPath) 119 | val exporter = new FileExporter(outputPath, "tfrecords") 120 | 121 | val transformer = new CriteoTransformer(inputPath, 122 | features, config.numPartitions, indexer, 123 | artifactPath, vocabularyImporter) 124 | 125 | val resultDf = transformer(importer.criteoImport) 126 | exporter.criteoExport(resultDf) 127 | } 128 | } 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/main/scala/com/google/cloud/ml/samples/criteo/CriteoTransformer.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.ml.samples.criteo 17 | 18 | import org.apache.spark.sql.{DataFrame, SparkSession} 19 | import org.apache.spark.sql.types._ 20 | 21 | 22 | class CriteoTransformer(inputPath: String, 23 | features: CriteoFeatures, 24 | numPartitions: Integer, 25 | indexer: TrainingIndexer, 26 | artifactPath: String, 27 | vocabularyImporter: VocabularyImporter) 28 | (implicit val spark: SparkSession) { 29 | 30 | def addRankFeatures(cleanedDf: DataFrame, 31 | vocabularies: Map[String, DataFrame]): DataFrame = { 32 | // add the ranking feature values to the cateogrical columns 33 | features.categoricalRawLabels. 34 | foldLeft(cleanedDf)((df, col) => { 35 | df.join(vocabularies(col), df(col) === vocabularies(col)("value-" ++ col)) 36 | .withColumnRenamed("index-" ++ col, features.categoricalLabelMap(col)) 37 | }) 38 | } 39 | 40 | /** 41 | * Transforms the input training data into a format appropriate for 42 | * training. This includes a conversion of categorical value to their 43 | * frequency rank for each values, and a replacement of missing numeric 44 | * features with the mean of that feature's value. 45 | * @param df The input DataFrame as read by the TSV. 
46 | * @return 47 | */ 48 | def transform(df: DataFrame): DataFrame = { 49 | val vocabularies = vocabularyImporter.loadFeatureVocabularies() 50 | val withCategoryRankings = addRankFeatures(df, vocabularies) 51 | 52 | // select just the output columns (removing the old categorical values) 53 | val withTargetFeaturesDf = withCategoryRankings 54 | .select(features.outputLabels.head, features.outputLabels.tail: _*). 55 | toDF 56 | 57 | // cast integer columns to floats 58 | val floatCastDf = features.integralColumns. 59 | foldLeft(withTargetFeaturesDf)((df, col) => 60 | df.withColumn(col, withTargetFeaturesDf(col).cast(FloatType))) 61 | floatCastDf 62 | } 63 | 64 | def replaceNulls(df: DataFrame): DataFrame = { 65 | val cleanedDf = df.na.replace(features.categoricalRawLabels, Map("" -> "null")) 66 | 67 | val missingReplacer = new CriteoMissingReplacer() 68 | val averages = missingReplacer.getAverageIntegerFeatures( 69 | cleanedDf, features.integerFeatureLabels) 70 | 71 | missingReplacer.replaceIntegerFeatures( 72 | cleanedDf, features.integerFeatureLabels, averages) 73 | } 74 | 75 | def replaceNullsAndTransform(df: DataFrame): DataFrame = { 76 | transform(replaceNulls(df)) 77 | } 78 | 79 | def apply(df: DataFrame): DataFrame = replaceNullsAndTransform(df: DataFrame) 80 | 81 | } 82 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/test/resources/test_train.csv: -------------------------------------------------------------------------------- 1 | 0 1 1 5 0 1382 4 15 2 181 1 2 2 68fd1e64 80e26c9b fb936136 7b4723c4 25c83c98 7e0ccccf de7995b8 1f89b562 a73ee510 a8cd5504 b2cb9c98 37c9c164 2824a5f6 1adce6ef 8ba8b39a 891b62e7 e5ba7672 f54016b9 21ddcdc9 b1252a9d 07b5194c 3a171ecb c5c50484 e8b83407 9727dd16 2 | 0 2 0 44 1 102 8 2 2 4 1 1 4 68fd1e64 f0cf0024 6f67f7e5 41274cd7 25c83c98 fe6b92e5 922afcc0 0b153874 a73ee510 2b53e5fb 4f1b46f3 623049e6 d7020589 b28479f6 e6c5b5cd c92f3b61 07c540c4 b04e4670 21ddcdc9 5840adea 60f6221e 3a171ecb 43f13e8b e8b83407 731c3655 3 | 0 2 0 1 14 767 89 4 2 245 1 3 3 45 287e684f 0a519c5c 02cf9876 c18be181 25c83c98 7e0ccccf c78204a1 0b153874 a73ee510 3b08e48b 5f5e6091 8fe001f4 aa655a2f 07d13a8f 6dc710ed 36103458 8efede7f 3412118d e587c466 ad3062eb 3a171ecb 3b183c5c 4 | 0 893 4392 0 0 0 0 68fd1e64 2c16a946 a9a87e68 2e17d6f6 25c83c98 fe6b92e5 2e8a689b 0b153874 a73ee510 efea433b e51ddf94 a30567ca 3516f6e6 07d13a8f 18231224 52b8680f 1e88c74f 74ef3502 6b3a5ca6 3a171ecb 9117a34a 5 | 0 3 -1 0 2 0 3 0 0 1 1 0 8cf07265 ae46a29d c81688bb f922efad 25c83c98 13718bbd ad9fa255 0b153874 a73ee510 5282c137 e5d8af57 66a76a26 f06c53ac 1adce6ef 8ff4b403 01adbab4 1e88c74f 26b3c7a7 21c9516a 32c7478e b34f3128 6 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/test/scala/com/google/cloud/ml/samples/criteo/CriteoImporterTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.scalatest.{FlatSpec, GivenWhenThen, Matchers} 20 | 21 | class CleanTSVImporterTest extends FlatSpec with SparkSpec with GivenWhenThen with Matchers { 22 | "criteoImport" should "import clean training data from a TSV file" in { 23 | val inputPath = "src/test/resources/test_train.csv" 24 | val trainFeatures = CriteoFeatures() 25 | val importer = new CleanTSVImporter(inputPath, trainFeatures.inputSchema, 1) 26 | 27 | val df = importer.criteoImport 28 | 29 | df.count should equal(5) 30 | 31 | // turn test dataframe to array to avoid serialization 32 | val df_seq = df.collect.map(_.toSeq) 33 | df_seq.foreach(row => { 34 | // verify all nulls are replaced by asserting 35 | // length without nulls is the same 36 | val nonulls = row.filter(_ != null) 37 | row.length should equal(nonulls.length) 38 | }) 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/test/scala/com/google/cloud/ml/samples/criteo/CriteoIndexerTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.scalatest._ 20 | 21 | import org.apache.spark.sql._ 22 | 23 | class TrainingIndexerTest extends FlatSpec with SparkSpec with GivenWhenThen with Matchers { 24 | 25 | trait TestFixture { 26 | val indexer: CriteoIndexer 27 | val trainingDf: DataFrame 28 | val featureValueCountResult: DataFrame 29 | val artifactExporter: EmptyArtifactExporter 30 | } 31 | 32 | private var _fixture: Option[TestFixture] = None 33 | 34 | private def fixture: TestFixture = _fixture match { 35 | case None => 36 | val f = new TestFixture { 37 | val features = CriteoFeatures() 38 | val artifactExporter = new EmptyArtifactExporter() 39 | val indexer = new TrainingIndexer(features) 40 | 41 | val firstCatInput: String = features.categoricalRawLabels.head 42 | 43 | // Creating training data as a Seq of Row objects. 
44 | // First five rows will have "abc" as value of first categorical column and "0" in every 45 | // other column 46 | 47 | val rows1to5 = (1 to 5).map(_ => features.inputLabels.map(_ match { 48 | case `firstCatInput` => "abc" 49 | case _ => "0" 50 | })) 51 | 52 | // The next three rows will have "xyz" as value of first categorical column and "0" in every 53 | // other column 54 | val rows6to8 = (1 to 3).map(_ => features.inputLabels.map({ 55 | case `firstCatInput` => "xyz" 56 | case _ => "0" 57 | })) 58 | 59 | // The final two rows will have empty values in the first categorical column and have "0" in 60 | // every other column 61 | val rows9and10 = (1 to 2).map(_ => features.inputLabels.map({ 62 | case `firstCatInput` => "" 63 | case _ => "0" 64 | })) 65 | 66 | val trainingDataSeq = rows1to5 ++ rows6to8 ++ rows9and10 67 | val trainingData: Seq[Row] = trainingDataSeq map {v => Row.fromSeq(v)} 68 | 69 | val trainingDf = spark.createDataFrame(spark.sparkContext.parallelize(trainingData), 70 | features.inputSchema) 71 | 72 | val featureValueCountResult: DataFrame = indexer. 73 | getCategoricalFeatureValueCounts(trainingDf) 74 | } 75 | 76 | _fixture = Some(f) 77 | 78 | f 79 | 80 | case Some(f) => f 81 | } 82 | 83 | behavior of "TrainingIndexer" 84 | 85 | 86 | it should "correctly create the feature counts" in { 87 | val f = fixture 88 | f.featureValueCountResult.first().length should equal(3) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/test/scala/com/google/cloud/ml/samples/criteo/CriteoMissingReplacerTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import scala.collection.JavaConverters._ 20 | import org.scalatest.{FlatSpec, GivenWhenThen, Matchers} 21 | import org.apache.spark.sql.{DataFrame, Row} 22 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 23 | 24 | import scala.util.Try 25 | 26 | 27 | class CriteoMissingReplacerTest 28 | extends FlatSpec with SparkSpec with GivenWhenThen with Matchers 29 | { 30 | 31 | private def hasColumn(df: DataFrame, path: String): Boolean = Try(df(path)).isSuccess 32 | 33 | 34 | trait TestFixture { 35 | val replacer: CriteoMissingReplacer 36 | } 37 | 38 | private var _fixture: Option[TestFixture] = None 39 | 40 | private def fixture: TestFixture = _fixture match { 41 | case None => 42 | val f = new TestFixture { 43 | val features = CriteoFeatures() 44 | val replacer = new CriteoMissingReplacer() 45 | } 46 | 47 | _fixture = Some(f) 48 | 49 | f 50 | 51 | case Some(f) => f 52 | } 53 | 54 | 55 | behavior of "Missing Replacer" 56 | 57 | it should "replace missing ints with average" in { 58 | val f = fixture 59 | 60 | val dataSeq = Seq( 61 | Seq("1", "5", "a"), 62 | Seq(null, "", "b"), 63 | Seq("10", "13", "c"), 64 | Seq("12", "14", "d") 65 | ) 66 | 67 | val trainingData: Seq[Row] = dataSeq map {v => Row.fromSeq(v)} 68 | val schema = StructType(Seq( 69 | StructField("a", StringType), 70 | StructField("b", StringType), 71 | StructField("c", StringType) 72 | )) 73 | 74 | val df = spark.createDataFrame(trainingData.asJava, schema) 75 | 76 | val integerFeatures = Seq("a", "b") 77 | val averageFeaturesMap = f.replacer.getAverageIntegerFeatures(df, integerFeatures) 78 | 79 | val averagedDf = f.replacer.replaceIntegerFeatures(df, integerFeatures, averageFeaturesMap) 80 | hasColumn(averagedDf, "c") should be(true) 81 | 82 | val df_seq = averagedDf.select("a", "b").collect.map(_.toSeq) 83 | val df_seq_str = df_seq.map(_.map(n => n.asInstanceOf[String])) 84 | val df_seq_doub = df_seq_str.map(_.map(n => n.toDouble)) 85 | 86 | val first_col_expected_avg = ((1 + 10 + 12).toDouble) / 3 87 | val second_col_expected_avg = ((5 + 13 + 14).toDouble) / 3 88 | val Eps = 1e-3 89 | val first_column_avg = df_seq_doub(1)(0) 90 | val second_column_avg = df_seq_doub(1)(1) 91 | 92 | first_column_avg should equal(first_col_expected_avg +- Eps) 93 | second_column_avg should equal(second_col_expected_avg +- Eps) 94 | 95 | df_seq.foreach(row => { 96 | val nonulls = row.filter(_ != null) 97 | row.length should equal(nonulls.length) 98 | 99 | val noblanks = row.filter(_ != "") 100 | row.length should equal(noblanks.length) 101 | }) 102 | } 103 | } 104 | 105 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/test/scala/com/google/cloud/ml/samples/criteo/CriteoTransformerTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.google.cloud.ml.samples.criteo 18 | 19 | import org.scalatest.{FlatSpec, GivenWhenThen, Matchers} 20 | 21 | import org.apache.spark.sql.{DataFrame, Row} 22 | 23 | class CriteoTransformerTest extends FlatSpec with SparkSpec with GivenWhenThen with Matchers{ 24 | 25 | trait TestFixture { 26 | val indexer: TrainingIndexer 27 | val trainingDf: DataFrame 28 | val result: DataFrame 29 | val artifactExporter: EmptyArtifactExporter 30 | val features: CriteoFeatures 31 | val transformer: CriteoTransformer 32 | } 33 | 34 | private var _fixture: Option[TestFixture] = None 35 | 36 | private def fixture: TestFixture = _fixture match { 37 | case None => 38 | val f = new TestFixture { 39 | val features = CriteoFeatures() 40 | 41 | // Creating training data as a Seq of Row objects. 42 | // First five rows will have "abc" as value of first categorical column and "0" in every 43 | // other column 44 | val firstCatInput: String = features.categoricalRawLabels.head 45 | val firstIntInput: String = features.integerFeatureLabels.head 46 | 47 | val rows1to5 = (1 to 5).map(_ => features.inputLabels.map(_ match { 48 | case `firstCatInput` => "abc" 49 | case `firstIntInput` => "3" 50 | case _ => "0" 51 | })) 52 | 53 | 54 | // The next three rows will have "xyz" as value of first categorical column and "0" in every 55 | // other column 56 | val rows6to8 = (1 to 3).map(_ => features.inputLabels.map({ 57 | case `firstCatInput` => "xyz" 58 | case `firstIntInput` => "" 59 | case _ => "0" 60 | })) 61 | 62 | // The final two rows will have empty values in the first categorical column and have "0" in 63 | // every other column 64 | val rows9and10 = (1 to 2).map(_ => features.inputLabels.map({ 65 | case `firstCatInput` => "null" 66 | case `firstIntInput` => "3" 67 | case _ => "0" 68 | })) 69 | 70 | val trainingDataSeq = rows1to5 ++ rows6to8 ++ rows9and10 71 | val trainingData: Seq[Row] = trainingDataSeq map {v => Row.fromSeq(v)} 72 | 73 | val trainingDf = spark.createDataFrame(spark.sparkContext.parallelize(trainingData), 74 | features.inputSchema) 75 | 76 | val artifactExporter = new EmptyArtifactExporter() 77 | val indexer = new TrainingIndexer(features) 78 | 79 | val valueCounts = indexer.getCategoricalFeatureValueCounts(trainingDf) 80 | val vocabularies = indexer.getCategoricalColumnVocabularies(valueCounts) 81 | val vocabularyImporter = new TestVocabularyImporter(vocabularies) 82 | val transformer = new CriteoTransformer("", features, 1, 83 | indexer, "", vocabularyImporter) 84 | 85 | 86 | val result: DataFrame = transformer(trainingDf) 87 | } 88 | 89 | _fixture = Some(f) 90 | 91 | f 92 | 93 | case Some(f) => f 94 | } 95 | 96 | behavior of "CriteoTransformer" 97 | 98 | it should "yield a DataFrame with the same number of rows as its input DataFrame" in { 99 | val f = fixture 100 | assert(f.result.count == f.trainingDf.count) 101 | } 102 | 103 | it should "verify add rank features works" in { 104 | val f = fixture 105 | 106 | val headLabel = f.features.categoricalRawLabels.head 107 | val valueCounts = f.indexer.getCategoricalFeatureValueCounts(f.trainingDf) 108 | val vocabularies = f.indexer.getCategoricalColumnVocabularies(valueCounts) 109 | 110 | val withRank = f.transformer.addRankFeatures(f.trainingDf, vocabularies) 111 | } 112 | 113 | it should "replace missing integer features" in { 114 | val f = fixture 115 | val intFeature = f.features.integerFeatureLabels.head 116 | 
f.result.filter(s"`$intFeature` is null").count() should equal(0) 117 | } 118 | 119 | } 120 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/src/test/scala/com/google/cloud/ml/samples/criteo/SparkSpec.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2017 Google Inc. All Rights Reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.google.cloud.ml.samples.criteo 17 | 18 | import org.scalatest.{BeforeAndAfterAll, Suite} 19 | 20 | import org.apache.spark.{SparkConf, SparkContext} 21 | import org.apache.spark.sql.SparkSession 22 | 23 | trait SparkSpec extends BeforeAndAfterAll { 24 | this: Suite => 25 | 26 | private var _sc: SparkContext = _ 27 | private var _spark: SparkSession = _ 28 | 29 | 30 | override def beforeAll(): Unit = { 31 | super.beforeAll() 32 | 33 | val conf = new SparkConf() 34 | .setMaster("local[*]") 35 | .setAppName(this.getClass.getSimpleName) 36 | 37 | sparkConfig.foreach { case (k, v) => conf.setIfMissing(k, v) } 38 | 39 | _sc = new SparkContext(conf) 40 | 41 | _spark = SparkSession.builder(). 42 | appName("Criteo TFRecord Preprocessor"). 43 | getOrCreate() 44 | sc.setLogLevel("WARN") 45 | } 46 | 47 | def sparkConfig: Map[String, String] = Map.empty 48 | 49 | override def afterAll(): Unit = { 50 | if (_sc != null) { 51 | _sc.stop() 52 | _sc = null 53 | } 54 | super.afterAll() 55 | } 56 | 57 | implicit def sc: SparkContext = _sc 58 | implicit def spark: SparkSession = _spark 59 | 60 | 61 | } 62 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/submit-gcloud.sh: -------------------------------------------------------------------------------- 1 | gcloud dataproc jobs submit spark --cluster $CLUSTER --properties "spark.executor.memory=2g,spark.yarn.executor.memoryOverhead=1g" --jars lib/spark-tensorflow-connector-assembly-1.0.0.jar,lib/scopt_2.11-3.6.0.jar,target/scala-2.11/criteo-prepare_2.11-1.0.jar --class "com.google.cloud.ml.samples.criteo.CriteoPreprocessingApplication" -- -b "$BUCKET" -i $1 -o $2 -m $3 -x $4 "${@:5}" 2 | -------------------------------------------------------------------------------- /spark-tensorflow/prepare/submit-local.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | spark-submit --master local --driver-memory 12g --executor-memory 12g --class com.google.cloud.ml.samples.criteo.CriteoPreprocessingApplication --jars lib/spark-tensorflow-connector-assembly-1.0.0.jar,lib/scopt_2.11-3.6.0.jar target/scala-2.11/criteo-prepare_2.11-1.0.jar --base $BASE --in $1 --out $2 -m $3 -x $4 ${@:5} 3 | -------------------------------------------------------------------------------- /spark-tensorflow/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from setuptools import setup 16 | 17 | REQUIRED_PACKAGES = [] 18 | 19 | setup( 20 | name='spark-tf-example', 21 | version='1.0', 22 | install_requires=REQUIRED_PACKAGES, 23 | include_package_data=True, 24 | description='Spark preprocessing for TensorFlow', 25 | packages=['trainer'] 26 | ) 27 | -------------------------------------------------------------------------------- /spark-tensorflow/test-tf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPARK_TF_PATH=$(dirname "$(readlink -f $0)") 4 | cd "$SPARK_TF_PATH" 5 | python -m trainer.tests $@ 6 | -------------------------------------------------------------------------------- /spark-tensorflow/train-gcloud.sh: -------------------------------------------------------------------------------- 1 | gcloud ml-engine jobs submit training $JOB --stream-logs --runtime-version 1.2 \ 2 | --job-dir "gs://cloudml-spark-tf-connector/ml-engine/$JOB" \ 3 | --module-name trainer.task --package-path trainer --region "us-central1" \ 4 | --config config-standard.yaml -- \ 5 | --train-glob "gs://cloudml-spark-tf-connector/criteo/med-test-data/alpha/train/part-*" \ 6 | --eval-glob "gs://cloudml-spark-tf-connector/criteo/med-test-data/alpha/eval/part-*" \ 7 | --batch-size 1000 --train-steps 1 8 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudDataproc/cloud-dataproc/cb0ced6a30bc88a8ddc58d2870c217273be4958c/spark-tensorflow/trainer/__init__.py -------------------------------------------------------------------------------- /spark-tensorflow/trainer/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Model definition.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import tensorflow as tf 22 | 23 | 24 | MODES = tf.estimator.ModeKeys 25 | 26 | 27 | def generate_estimator( 28 | mode_feature_cols_map, 29 | params, 30 | config): 31 | """Creates a tf.estimator.Estimator for the Criteo classification task. 
32 | 33 | Args: 34 | mode_feature_cols_map: Dictionary mapping modes to lists of 35 | tf.feature_columns describing the features to expect in each mode 36 | params: Hyperparameter object (assumed to be an instance of 37 | tf.contrib.training.HParams 38 | config: An instance of tf.contrib.learn.RunConfig 39 | 40 | Returns: 41 | A tf.estimator.Estimator representing the logistic classifier we will use 42 | """ 43 | model_fn = generate_model_fn(mode_feature_cols_map) 44 | 45 | return tf.estimator.Estimator( 46 | model_fn, 47 | model_dir=config.model_dir, 48 | params=params, 49 | config=config 50 | ) 51 | 52 | 53 | def generate_model_fn(mode_feature_cols_map): 54 | """Creates a model_fn to inject into our custom estimator. 55 | 56 | Args: 57 | mode_feature_cols_map: Dictionary mapping modes to lists of 58 | tf.feature_columns describing the features to expect in each mode 59 | 60 | Returns: 61 | A model_fn for tf.estimator.Estimator. Has the following signature: 62 | Args: 63 | features: A dictionary of strings to tensors describing the model 64 | features 65 | labels: Either None or a tensor representing the labels for a given 66 | batch of training or evaluation data 67 | mode: A member of tf.estimator.ModeKeys -- TRAIN, EVAL, or PREDICT 68 | params: tf.contrib.training.HParams object or None 69 | config: tf.contrib.learn.RunConfig object or None 70 | 71 | Returns: 72 | tf.estimator.EstimatorSpec object 73 | """ 74 | def model_fn(features, labels, mode, params=None, config=None): 75 | if params is None: 76 | params = tf.contrib.training.HParams(learning_rate=0.01) 77 | 78 | # Extract the id tensor from the input features if it exists in the 79 | # feature_columns 80 | id_tensor = None 81 | if 'id' in features: 82 | id_tensor = features.pop('id') 83 | 84 | # Feature columns for given mode 85 | feature_cols = mode_feature_cols_map[mode] 86 | 87 | # Tensor of logits formed from input features 88 | logits = tf.feature_column.linear_model(features, feature_cols) 89 | 90 | # Apply the logistic function to the logits defined above 91 | # This is our classifier 92 | logistic = tf.sigmoid(logits, name='logistic') 93 | 94 | classifier_output = { 95 | 'clicked': logistic 96 | } 97 | 98 | if id_tensor is not None: 99 | classifier_output['id'] = tf.identity(id_tensor) 100 | 101 | loss = None 102 | train_op = None 103 | 104 | if mode in (MODES.TRAIN, MODES.EVAL): 105 | loss = tf.reduce_mean( 106 | tf.nn.sigmoid_cross_entropy_with_logits( 107 | logits=logits, labels=labels, name='loss') 108 | ) 109 | 110 | if mode == MODES.TRAIN: 111 | global_step = tf.train.get_or_create_global_step() 112 | train_op = tf.train.GradientDescentOptimizer( 113 | learning_rate=params.learning_rate 114 | ).minimize(loss, global_step=global_step) 115 | 116 | eval_metric_ops = None 117 | 118 | if mode == MODES.EVAL: 119 | eval_metric_ops = { 120 | 'accuracy': tf.metrics.accuracy(labels, logistic)} 121 | 122 | # Define serving signatures 123 | prediction_output = tf.estimator.export.PredictOutput( 124 | classifier_output) 125 | 126 | export_outputs = { 127 | tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: 128 | prediction_output 129 | } 130 | 131 | return tf.estimator.EstimatorSpec( 132 | mode=mode, 133 | predictions=classifier_output, 134 | loss=loss, 135 | train_op=train_op, 136 | eval_metric_ops=eval_metric_ops, 137 | export_outputs=export_outputs 138 | ) 139 | 140 | return model_fn 141 | -------------------------------------------------------------------------------- 
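For orientation, here is a minimal usage sketch (not a file in this repository) of how the generate_estimator helper defined above in trainer/model.py could be driven under TensorFlow 1.x. The feature columns, model directory, and hyperparameter values below are illustrative assumptions only; in the sample itself the real feature columns and input functions come from trainer/data.py, and the wiring is exercised through task.dispatch, as seen in trainer/tests.py further down this listing.

# Illustrative sketch only -- not part of the repository. Assumes TensorFlow 1.x
# (matching tensorflow==1.15.4 in trainer/requirements.txt); the column names
# and hyperparameters here are made-up placeholders.
import tensorflow as tf

from trainer import model  # the module shown above (trainer/model.py)

MODES = tf.estimator.ModeKeys

# Hypothetical feature columns; the real ones are built from the preprocessing
# artifacts by trainer/data.py (see data.get_feature_columns in trainer/tests.py).
example_cols = [
    tf.feature_column.numeric_column('integer-feature-1'),
    tf.feature_column.categorical_column_with_vocabulary_list(
        'categorical-feature-1', ['null', 'cat-1-word-1', 'cat-1-word-2']),
]

# generate_estimator expects one feature-column list per mode; PREDICT could
# differ (the model_fn above pops an optional 'id' feature when present).
mode_feature_cols_map = {
    MODES.TRAIN: example_cols,
    MODES.EVAL: example_cols,
    MODES.PREDICT: example_cols,
}

params = tf.contrib.training.HParams(learning_rate=0.01)
# tf.estimator.RunConfig is used here for simplicity; it exposes the model_dir
# attribute that generate_estimator reads (the docstring above mentions
# tf.contrib.learn.RunConfig).
config = tf.estimator.RunConfig(model_dir='/tmp/criteo-model')

estimator = model.generate_estimator(mode_feature_cols_map, params, config)
# estimator.train(input_fn=...)  # e.g. an input_fn such as the one built by
#                                # data.generate_labelled_input_fn in trainer/data.py

The per-mode map mirrors how the model_fn above looks up mode_feature_cols_map[mode], which is what allows PREDICT-time inputs to carry the extra 'id' column that the model_fn echoes back in its output.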
/spark-tensorflow/trainer/preprocess_artifacts_gcs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import csv 17 | import tempfile 18 | import StringIO 19 | 20 | 21 | from google.cloud import storage 22 | 23 | 24 | integer_features = ['integer-feature-{}'.format(i) for i in range(1, 14)] 25 | categorical_features = ['categorical-feature-{}'.format(i) 26 | for i in range(1, 27)] 27 | 28 | 29 | def preprocess_integer_dirs(bucket, artifact_dir): 30 | client = storage.Client() 31 | bucket = client.get_bucket(bucket) 32 | blobs = list(bucket.list_blobs()) 33 | 34 | for ifeature in integer_features: 35 | ifeature = artifact_dir + '/' + ifeature 36 | files = filter(lambda b: b.name.startswith(ifeature), blobs) 37 | csv = filter(lambda b: 'csv' in b.name, files)[0] 38 | value = csv.download_as_string() 39 | path = csv.name[:csv.name.rfind('/')] 40 | new_name = path + '/mean.txt' 41 | print('Renaming {} to {}'.format(csv.name, new_name)) 42 | new_blob = bucket.blob(new_name) 43 | new_blob.upload_from_string(value) 44 | 45 | 46 | def preprocess_categorical_dirs(bucket, artifact_dir): 47 | client = storage.Client() 48 | bucket = client.get_bucket(bucket) 49 | blobs = list(bucket.list_blobs()) 50 | 51 | for cfeature in categorical_features: 52 | new_file, filename = tempfile.mkstemp() 53 | cfeature = artifact_dir + '/' + cfeature 54 | files = filter(lambda b: b.name.startswith(cfeature), blobs) 55 | csv_file = filter(lambda b: 'csv' in b.name, files)[0] 56 | csv_file.download_to_filename(filename) 57 | 58 | path = csv_file.name[:csv_file.name.rfind('/')] 59 | 60 | with open(filename, 'r') as part_file: 61 | csvreader = csv.reader(part_file) 62 | features = [row[0] for row in csvreader] 63 | 64 | output = StringIO.StringIO() 65 | for feature in features: 66 | if not feature: 67 | feature = 'null' 68 | output.write('{}\n'.format(feature)) 69 | 70 | index_name = path + '/index.txt' 71 | index_blob = bucket.blob(index_name) 72 | index_blob.upload_from_string(output.getvalue()) 73 | 74 | output = StringIO.StringIO() 75 | output.write('{}\n'.format(len(features))) 76 | count_name = path + '/count.txt' 77 | count_blob = bucket.blob(count_name) 78 | count_blob.upload_from_string(output.getvalue()) 79 | 80 | print('Wrote feature in {}'.format(cfeature)) 81 | 82 | 83 | if __name__ == '__main__': 84 | parser = argparse.ArgumentParser( 85 | description=__doc__, 86 | formatter_class=argparse.RawDescriptionHelpFormatter) 87 | 88 | parser.add_argument('bucket') 89 | parser.add_argument('artifact_dir') 90 | 91 | args = parser.parse_args() 92 | preprocess_integer_dirs(args.bucket, args.artifact_dir) 93 | preprocess_categorical_dirs(args.bucket, args.artifact_dir) 94 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/preprocess_artifacts_local.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 'License'); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an 'AS IS' BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import argparse 16 | import os 17 | import shutil 18 | import csv 19 | 20 | 21 | 22 | # TODO: would be better to figure out how to have Scala output it correctly 23 | 24 | def preprocess_integer_dirs(artifact_dir): 25 | dirs = os.listdir(artifact_dir) 26 | integer_dirs = filter(lambda dir: dir.startswith('integer-feature'), dirs) 27 | assert len(integer_dirs) == 13, 'Expected 13 integer feature directories' 28 | for integer_dir in integer_dirs: 29 | full_dir = os.path.join(artifact_dir, integer_dir) 30 | files = os.listdir(full_dir) 31 | part_files = filter(lambda file: file.startswith('part'), files) 32 | assert len(part_files) == 1, ('Did not find 1 {' 33 | '}'.format(integer_dir)) 34 | part_file = part_files[0] 35 | shutil.copy(os.path.join(full_dir, part_file), 36 | os.path.join(full_dir, 'mean.txt')) 37 | 38 | 39 | def preprocess_categorical_dirs(artifact_dir): 40 | dirs = os.listdir(artifact_dir) 41 | categorical_dirs = filter(lambda dir: dir.startswith('categorical-feature'), 42 | dirs) 43 | assert len(categorical_dirs) == 26, ('Expected 26 categorical feature ' 44 | 'directories') 45 | for categorical_dir in categorical_dirs: 46 | full_dir = os.path.join(artifact_dir, categorical_dir) 47 | files = os.listdir(full_dir) 48 | part_file_name = filter(lambda file: file.startswith('part'), files)[0] 49 | print('Part file is {}'.format(part_file_name)) 50 | with open(os.path.join(full_dir, part_file_name), 'r') as part_file: 51 | csvreader = csv.reader(part_file) 52 | features = [row[0] for row in csvreader] 53 | with open(os.path.join(full_dir, 'index.txt'), 'w') as index_file: 54 | for feature in features: 55 | if not feature: 56 | feature = 'null' 57 | index_file.write('{}\n'.format(feature)) 58 | with open(os.path.join(full_dir, 'count.txt'), 'w') as count_file: 59 | count_file.write('{}\n'.format(len(features))) 60 | 61 | 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser( 64 | description=__doc__, 65 | formatter_class=argparse.RawDescriptionHelpFormatter) 66 | 67 | parser.add_argument('artifact_dir') 68 | args = parser.parse_args() 69 | print('Processing integer dirs.') 70 | preprocess_integer_dirs(args.artifact_dir) 71 | print('Processing categorical dirs.') 72 | preprocess_categorical_dirs(args.artifact_dir) 73 | print('Done processing categories.') 74 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/requirements.txt: -------------------------------------------------------------------------------- 1 | backports.weakref==1.0rc1 2 | funcsigs==1.0.2 3 | html5lib==0.9999999 4 | Markdown==2.2.0 5 | mock==2.0.0 6 | numpy==1.13.0 7 | pbr==3.0.1 8 | protobuf==3.3.0 9 | six==1.10.0 10 | tensorflow==1.15.4 11 | Werkzeug==0.15.3 12 | google-cloud-core==0.27.1 13 | google-cloud-storage==1.4.0
14 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudDataproc/cloud-dataproc/cb0ced6a30bc88a8ddc58d2870c217273be4958c/spark-tensorflow/trainer/test/__init__.py -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-1/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-1/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-1-word-1 3 | cat-1-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-10/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-10/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-10-word-1 3 | cat-10-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-11/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-11/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-11-word-1 3 | cat-11-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-12/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-12/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-12-word-1 3 | cat-12-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-13/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-13/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-13-word-1 3 | cat-13-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-14/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-14/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-14-word-1 3 | cat-14-word-2 4 
| -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-15/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-15/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-15-word-1 3 | cat-15-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-16/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-16/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-16-word-1 3 | cat-16-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-17/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-17/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-17-word-1 3 | cat-17-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-18/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-18/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-18-word-1 3 | cat-18-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-19/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-19/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-19-word-1 3 | cat-19-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-2/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-2/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-2-word-1 3 | cat-2-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-20/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- 
/spark-tensorflow/trainer/test/artifacts/categorical-feature-20/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-20-word-1 3 | cat-20-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-21/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-21/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-21-word-1 3 | cat-21-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-22/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-22/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-22-word-1 3 | cat-22-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-23/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-23/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-23-word-1 3 | cat-23-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-24/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-24/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-24-word-1 3 | cat-24-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-25/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-25/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-25-word-1 3 | cat-25-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-26/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-26/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-26-word-1 3 | cat-26-word-2 4 | -------------------------------------------------------------------------------- 
/spark-tensorflow/trainer/test/artifacts/categorical-feature-3/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-3/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-3-word-1 3 | cat-3-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-4/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-4/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-4-word-1 3 | cat-4-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-5/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-5/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-5-word-1 3 | cat-5-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-6/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-6/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-6-word-1 3 | cat-6-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-7/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-7/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-7-word-1 3 | cat-7-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-8/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-8/index.txt: -------------------------------------------------------------------------------- 1 | null 2 | cat-8-word-1 3 | cat-8-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-9/count.txt: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/categorical-feature-9/index.txt: 
-------------------------------------------------------------------------------- 1 | null 2 | cat-9-word-1 3 | cat-9-word-2 4 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-1/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-10/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-11/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-12/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-13/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-2/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-3/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-4/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-5/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-6/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-7/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-8/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/artifacts/integer-feature-9/mean.txt: -------------------------------------------------------------------------------- 1 | 0 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/data.tfrecords: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/GoogleCloudDataproc/cloud-dataproc/cb0ced6a30bc88a8ddc58d2870c217273be4958c/spark-tensorflow/trainer/test/data.tfrecords -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/test.csv: -------------------------------------------------------------------------------- 1 | 0,,,,,,,,,,,,,,cat-1-word-2,cat-2-word-2,cat-3-word-2,cat-4-word-2,cat-5-word-2,cat-6-word-2,cat-7-word-2,cat-8-word-2,cat-9-word-2,cat-10-word-2,cat-11-word-2,cat-12-word-2,cat-13-word-2,cat-14-word-2,cat-15-word-2,cat-16-word-2,cat-17-word-2,cat-18-word-2,cat-19-word-2,cat-20-word-2,cat-21-word-2,cat-22-word-2,cat-23-word-2,cat-24-word-2,cat-25-word-2,cat-26-word-2 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/test1.expr: -------------------------------------------------------------------------------- 1 | id=['1'];integer-feature-1=[[0]];integer-feature-2=[[0]];integer-feature-3=[[0]];integer-feature-4=[[0]];integer-feature-5=[[0]];integer-feature-6=[[0]];integer-feature-7=[[0]];integer-feature-8=[[0]];integer-feature-9=[[0]];integer-feature-10=[[0]];integer-feature-11=[[0]];integer-feature-12=[[0]];integer-feature-13=[[0]];categorical-feature-1=[['cat-1-word-2']];categorical-feature-2=[['cat-2-word-2']];categorical-feature-3=[['cat-3-word-2']];categorical-feature-4=[['cat-4-word-2']];categorical-feature-5=[['cat-5-word-2']];categorical-feature-6=[['cat-6-word-2']];categorical-feature-7=[['cat-7-word-2']];categorical-feature-8=[['cat-8-word-2']];categorical-feature-9=[['cat-9-word-2']];categorical-feature-10=[['cat-10-word-2']];categorical-feature-11=[['cat-11-word-2']];categorical-feature-12=[['cat-12-word-2']];categorical-feature-13=[['cat-13-word-2']];categorical-feature-14=[['cat-14-word-2']];categorical-feature-15=[['cat-15-word-2']];categorical-feature-16=[['cat-16-word-2']];categorical-feature-17=[['cat-17-word-2']];categorical-feature-18=[['cat-18-word-2']];categorical-feature-19=[['cat-19-word-2']];categorical-feature-20=[['cat-20-word-2']];categorical-feature-21=[['cat-21-word-2']];categorical-feature-22=[['cat-22-word-2']];categorical-feature-23=[['cat-23-word-2']];categorical-feature-24=[['cat-24-word-2']];categorical-feature-25=[['cat-25-word-2']];categorical-feature-26=[['cat-26-word-2']] 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/test2.expr: -------------------------------------------------------------------------------- 1 | 
id=['2'];integer-feature-1=[[0]];integer-feature-2=[[1]];integer-feature-3=[[2]];integer-feature-4=[[3]];integer-feature-5=[[4]];integer-feature-6=[[5]];integer-feature-7=[[6]];integer-feature-8=[[7]];integer-feature-9=[[8]];integer-feature-10=[[9]];integer-feature-11=[[10]];integer-feature-12=[[11]];integer-feature-13=[[12]];categorical-feature-1=[['cat-1-word-1']];categorical-feature-2=[['cat-2-word-1']];categorical-feature-3=[['cat-3-word-1']];categorical-feature-4=[['cat-4-word-1']];categorical-feature-5=[['cat-5-word-1']];categorical-feature-6=[['cat-6-word-1']];categorical-feature-7=[['cat-7-word-1']];categorical-feature-8=[['cat-8-word-1']];categorical-feature-9=[['cat-9-word-1']];categorical-feature-10=[['cat-10-word-1']];categorical-feature-11=[['cat-11-word-1']];categorical-feature-12=[['cat-12-word-1']];categorical-feature-13=[['cat-13-word-1']];categorical-feature-14=[['cat-14-word-1']];categorical-feature-15=[['cat-15-word-1']];categorical-feature-16=[['cat-16-word-1']];categorical-feature-17=[['cat-17-word-1']];categorical-feature-18=[['cat-18-word-1']];categorical-feature-19=[['cat-19-word-1']];categorical-feature-20=[['cat-20-word-1']];categorical-feature-21=[['cat-21-word-1']];categorical-feature-22=[['cat-22-word-1']];categorical-feature-23=[['cat-23-word-1']];categorical-feature-24=[['cat-24-word-1']];categorical-feature-25=[['cat-25-word-1']];categorical-feature-26=[['cat-26-word-1']] 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/test3.expr: -------------------------------------------------------------------------------- 1 | id=['3'];integer-feature-1=[[0]];integer-feature-2=[[0]];integer-feature-3=[[0]];integer-feature-4=[[0]];integer-feature-5=[[0]];integer-feature-6=[[0]];integer-feature-7=[[0]];integer-feature-8=[[0]];integer-feature-9=[[0]];integer-feature-10=[[0]];integer-feature-11=[[0]];integer-feature-12=[[0]];integer-feature-13=[[0]];categorical-feature-1=[['cat-1-word-1']];categorical-feature-2=[['cat-2-word-1']];categorical-feature-3=[['cat-3-word-1']];categorical-feature-4=[['cat-4-word-1']];categorical-feature-5=[['cat-5-word-1']];categorical-feature-6=[['cat-6-word-1']];categorical-feature-7=[['cat-7-word-1']];categorical-feature-8=[['cat-8-word-1']];categorical-feature-9=[['cat-9-word-1']];categorical-feature-10=[['cat-10-word-1']];categorical-feature-11=[['cat-11-word-1']];categorical-feature-12=[['cat-12-word-1']];categorical-feature-13=[['cat-13-word-1']];categorical-feature-14=[['cat-14-word-1']];categorical-feature-15=[['cat-15-word-1']];categorical-feature-16=[['cat-16-word-1']];categorical-feature-17=[['cat-17-word-1']];categorical-feature-18=[['cat-18-word-1']];categorical-feature-19=[['cat-19-word-1']];categorical-feature-20=[['cat-20-word-1']];categorical-feature-21=[['cat-21-word-1']];categorical-feature-22=[['cat-22-word-1']];categorical-feature-23=[['cat-23-word-1']];categorical-feature-24=[['cat-24-word-1']];categorical-feature-25=[['cat-25-word-1']];categorical-feature-26=[['cat-26-word-1']] 2 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/train.csv: -------------------------------------------------------------------------------- 1 | 
1,0,1,2,3,4,5,6,7,8,9,10,11,12,cat-1-word-1,cat-2-word-1,cat-3-word-1,cat-4-word-1,cat-5-word-1,cat-6-word-1,cat-7-word-1,cat-8-word-1,cat-9-word-1,cat-10-word-1,cat-11-word-1,cat-12-word-1,cat-13-word-1,cat-14-word-1,cat-15-word-1,cat-16-word-1,cat-17-word-1,cat-18-word-1,cat-19-word-1,cat-20-word-1,cat-21-word-1,cat-22-word-1,cat-23-word-1,cat-24-word-1,cat-25-word-1,cat-26-word-1 2 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,cat-1-word-2,cat-2-word-2,cat-3-word-2,cat-4-word-2,cat-5-word-2,cat-6-word-2,cat-7-word-2,cat-8-word-2,cat-9-word-2,cat-10-word-2,cat-11-word-2,cat-12-word-2,cat-13-word-2,cat-14-word-2,cat-15-word-2,cat-16-word-2,cat-17-word-2,cat-18-word-2,cat-19-word-2,cat-20-word-2,cat-21-word-2,cat-22-word-2,cat-23-word-2,cat-24-word-2,cat-25-word-2,cat-26-word-2 3 | -------------------------------------------------------------------------------- /spark-tensorflow/trainer/test/train.tsv: -------------------------------------------------------------------------------- 1 | 1 0 1 2 3 4 5 6 7 8 9 10 11 12 cat-1-word-1 cat-2-word-1 cat-3-word-1 cat-4-word-1 cat-5-word-1 cat-6-word-1 cat-7-word-1 cat-8-word-1 cat-9-word-1 cat-10-word-1 cat-11-word-1 cat-12-word-1 cat-13-word-1 cat-14-word-1 cat-15-word-1 cat-16-word-1 cat-17-word-1 cat-18-word-1 cat-19-word-1 cat-20-word-1 cat-21-word-1 cat-22-word-1 cat-23-word-1 cat-24-word-1 cat-25-word-1 cat-26-word-1 2 | 0 0 0 0 0 0 0 0 0 0 0 0 0 0 cat-1-word-2 cat-2-word-2 cat-3-word-2 cat-4-word-2 cat-5-word-2 cat-6-word-2 cat-7-word-2 cat-8-word-2 cat-9-word-2 cat-10-word-2 cat-11-word-2 cat-12-word-2 cat-13-word-2 cat-14-word-2 cat-15-word-2 cat-16-word-2 cat-17-word-2 cat-18-word-2 cat-19-word-2 cat-20-word-2 cat-21-word-2 cat-22-word-2 cat-23-word-2 cat-24-word-2 cat-25-word-2 cat-26-word-2 -------------------------------------------------------------------------------- /spark-tensorflow/trainer/tests.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Tests.""" 16 | 17 | import argparse 18 | import data 19 | import os 20 | import task 21 | import tensorflow as tf 22 | 23 | THIS_DIR = os.path.dirname(__file__) 24 | ARTIFACT_DIR = os.path.join(THIS_DIR, 'test/artifacts/') 25 | DATA_DIR = os.path.join(THIS_DIR, 'test/') 26 | 27 | 28 | class SampleTests(tf.test.TestCase): 29 | """All tests for this sample. 30 | 31 | Test data is present in the 'tests/' directory. 
32 | """ 33 | 34 | def test_get_feature_columns(self): 35 | feature_columns = data.get_feature_columns( 36 | data.TSV, 37 | ARTIFACT_DIR) 38 | self.assertEqual(len(feature_columns), 39 | len(data.INTEGER_FEATURES) + 40 | len(data.CATEGORICAL_FEATURES) 41 | ) 42 | 43 | def test_generate_labelled_input_fn_tsv(self): 44 | tsv_data_file = 'train.tsv' 45 | batch_size = 2 46 | data_glob = '{}{}'.format(DATA_DIR, tsv_data_file) 47 | labelled_input_fn = data.generate_labelled_input_fn( 48 | data.TSV, 49 | 2, 50 | data_glob, 51 | ARTIFACT_DIR) 52 | features, labels = labelled_input_fn() 53 | 54 | with tf.Session() as sess: 55 | coord = tf.train.Coordinator() 56 | threads = tf.train.start_queue_runners(coord=coord) 57 | result = sess.run({'features': features, 58 | 'labels': labels}) 59 | coord.request_stop() 60 | coord.join(threads) 61 | 62 | features_out = result['features'] 63 | for key in features_out: 64 | self.assertEqual(features_out[key].shape, (batch_size, 1)) 65 | 66 | labels_out = result['labels'] 67 | self.assertEqual(labels_out.shape, (batch_size, 1)) 68 | 69 | def test_end_to_end_tsv(self): 70 | job_dir = tf.test.get_temp_dir() 71 | 72 | args = argparse.Namespace( 73 | job_dir=job_dir, 74 | data_format=data.TSV, 75 | train_dir=DATA_DIR, 76 | eval_dir=DATA_DIR, 77 | artifact_dir=ARTIFACT_DIR, 78 | batch_size=2, 79 | train_steps=10, 80 | eval_steps=1, 81 | learning_rate=0.5, 82 | min_eval_frequency=0 83 | ) 84 | 85 | task.dispatch(args) 86 | 87 | 88 | if __name__ == '__main__': 89 | tf.test.main() 90 | -------------------------------------------------------------------------------- /spark-translate/.gitignore: -------------------------------------------------------------------------------- 1 | maven/target/** 2 | maven/dependency-reduced-pom.xml 3 | 4 | sbt/target/** 5 | sbt/project/** 6 | !sbt/project/assembly.sbt 7 | -------------------------------------------------------------------------------- /spark-translate/README.md: -------------------------------------------------------------------------------- 1 | This example is a simple demo Spark application that translates words using 2 | Google's Translation API and runs on Cloud Dataproc. 3 | 4 | ## Prerequisites 5 | 6 | 1. You are familiar with [Scala](http://scala-lang.org/) and have [Maven](https://maven.apache.org/) and/or [SBT](http://www.scala-sbt.org/) installed. We will use Scala to communicate with Spark. 7 | 8 | 2. You have the [Cloud SDK](https://cloud.google.com/sdk/) installed. 9 | 10 | 3. You have a Google Cloud project and have [enabled billing](https://cloud.google.com/billing/docs/how-to/modify-project). 11 | 12 | ## Running the code 13 | 14 | 1. Record the project ID in an environment variable for later use: 15 | ``` 16 | export PROJECT=$(gcloud info --format='value(config.project)') 17 | ``` 18 | 19 | 2. Enable the `translate` and `dataproc` APIs: 20 | ``` 21 | gcloud services enable translate.googleapis.com dataproc.googleapis.com 22 | ``` 23 | 24 | 3. Compile the JAR (this may take a few minutes): 25 | 26 | * Option 1: with Maven 27 | ``` 28 | cd maven 29 | mvn package 30 | ``` 31 | * Option 2: with SBT 32 | ``` 33 | cd sbt 34 | sbt assembly 35 | mv target/scala-2.11/translate-example-assembly-1.0.jar target/translate-example-1.0.jar 36 | ``` 37 | 38 | 4. Create a bucket: 39 | ``` 40 | gsutil mb gs://$PROJECT-bucket 41 | ``` 42 | 43 | 5.
Upload `words.txt` to the bucket: 44 | ``` 45 | gsutil cp ../words.txt gs://$PROJECT-bucket 46 | ``` 47 | The file `words.txt` contains the following: 48 | ``` 49 | cat 50 | dog 51 | fish 52 | ``` 53 | 54 | 6. Create a Cloud Dataproc cluster: 55 | ``` 56 | gcloud dataproc clusters create demo-cluster \ 57 | --zone=us-central1-a \ 58 | --scopes=cloud-platform \ 59 | --image-version=1.3 60 | ``` 61 | 62 | 7. Submit the Spark job to translate the words to French: 63 | ``` 64 | gcloud dataproc jobs submit spark \ 65 | --cluster demo-cluster \ 66 | --jar target/translate-example-1.0.jar \ 67 | -- fr gs://$PROJECT-bucket words.txt translated-fr 68 | ``` 69 | 70 | 8. Verify that the words have been translated: 71 | ``` 72 | gsutil cat gs://$PROJECT-bucket/translated-fr/part-* 73 | ``` 74 | The output is: 75 | ``` 76 | chat 77 | chien 78 | poisson 79 | ``` 80 | -------------------------------------------------------------------------------- /spark-translate/maven/pom.xml: -------------------------------------------------------------------------------- 1 | 16 | 17 | 4.0.0 18 | jar 19 | 1.0 20 | dataproc-java-dependencies-demo 21 | translate-example 22 | 23 | 24 | 1.8 25 | 1.8 26 | 27 | 28 | 29 | 30 | 31 | org.apache.spark 32 | spark-sql_2.11 33 | 2.2.1 34 | provided 35 | 36 | 37 | 38 | com.google.cloud 39 | google-cloud-translate 40 | 1.35.0 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | net.alchim31.maven 50 | scala-maven-plugin 51 | 3.3.2 52 | 53 | 54 | 55 | compile 56 | testCompile 57 | 58 | 59 | 60 | 61 | 2.11.8 62 | 63 | 64 | 65 | 66 | org.apache.maven.plugins 67 | maven-shade-plugin 68 | 3.1.1 69 | 70 | 71 | package 72 | 73 | shade 74 | 75 | 76 | 77 | 78 | demo.TranslateExample 79 | 80 | 81 | 82 | 83 | *:* 84 | 85 | META-INF/maven/** 86 | META-INF/*.SF 87 | META-INF/*.DSA 88 | META-INF/*.RSA 89 | 90 | 91 | 92 | 93 | 94 | com 95 | repackaged.com 96 | 97 | com.google.common.** 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /spark-translate/maven/src/main/scala/demo/TranslateExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright Google Inc. 2018 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | http://www.apache.org/licenses/LICENSE-2.0 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. 
12 | */ 13 | 14 | package demo 15 | 16 | import org.apache.spark.sql.SparkSession 17 | 18 | import com.google.cloud.translate.Translate 19 | import com.google.cloud.translate.Translate.TranslateOption 20 | import com.google.cloud.translate.TranslateOptions 21 | import com.google.cloud.translate.Translation 22 | 23 | object TranslateExample { 24 | 25 | val translateService = TranslateOptions.getDefaultInstance().getService() 26 | 27 | def translate(word: String, language: String): String = { 28 | val translation = translateService.translate( 29 | word, 30 | TranslateOption.sourceLanguage("en"), 31 | TranslateOption.targetLanguage(language)) 32 | return translation.getTranslatedText() 33 | } 34 | 35 | def main(args: Array[String]): Unit = { 36 | if (args.length != 4) { 37 | System.err.println( 38 | """ 39 | | Usage: TranslateExample <language> <bucket> <input> <output> 40 | | 41 | | <language>: Target language code for the translation (e.g. "fr" for French). See the list of supported languages: https://cloud.google.com/translate/docs/languages 42 | | <bucket>: Bucket's URI 43 | | <input>: Name of the input text file 44 | | <output>: Name of the output folder 45 | | 46 | """.stripMargin) 47 | System.exit(1) 48 | } 49 | 50 | val Seq(language, bucket, input, output) = args.toSeq 51 | 52 | val spark = SparkSession.builder.appName("Simple Application").getOrCreate() 53 | 54 | // Import Dataset encoders 55 | import spark.implicits._ 56 | 57 | val words = spark.read.textFile(bucket + "/" + input) 58 | 59 | val translated = words.map(word => translate(word, language)) 60 | 61 | translated.write.mode("overwrite").text(bucket + "/" + output) 62 | } 63 | } -------------------------------------------------------------------------------- /spark-translate/sbt/build.sbt: -------------------------------------------------------------------------------- 1 | lazy val commonSettings = Seq( 2 | organization := "dataproc-java-dependencies-demo", 3 | name := "translate-example", 4 | version := "1.0", 5 | scalaVersion := "2.11.8", 6 | ) 7 | 8 | lazy val shaded = (project in file(".")) 9 | .settings(commonSettings) 10 | 11 | mainClass in (Compile, packageBin) := Some("demo.TranslateExample") 12 | 13 | libraryDependencies ++= Seq( 14 | "org.apache.spark" % "spark-sql_2.11" % "2.2.1" % "provided", 15 | "com.google.cloud" % "google-cloud-translate" % "1.35.0" 16 | ) 17 | 18 | assemblyShadeRules in assembly := Seq( 19 | ShadeRule.rename("com.google.common.**" -> "repackaged.com.google.common.@1").inAll 20 | ) -------------------------------------------------------------------------------- /spark-translate/sbt/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") 2 | -------------------------------------------------------------------------------- /spark-translate/sbt/src/main/scala/demo/TranslateExample.scala: -------------------------------------------------------------------------------- 1 | ../../../../../maven/src/main/scala/demo/TranslateExample.scala -------------------------------------------------------------------------------- /spark-translate/words.txt: -------------------------------------------------------------------------------- 1 | cat 2 | dog 3 | fish 4 | -------------------------------------------------------------------------------- /workshops/social-media/README.md: -------------------------------------------------------------------------------- 1 | # Workshop for Machine Learning and NLP on Social Media Data at Scale 2 | 3 | This workshop will take you through
processing Social Media data at scale from start to finish. We'll use a dataset consisting of all Reddit posts since 2016 stored in BigQuery. 4 | 5 | First, you will load the Reddit data from [BigQuery](https://cloud.google.com/bigquery/) into PySpark using [Cloud Dataproc](https://cloud.google.com/dataproc/): 6 | [PySpark for Preprocessing BigQuery Data](bit.ly/pyspark-bigquery) 7 | 8 | Next, you will use the open source library [spark-nlp](https://nlp.johnsnowlabs.com/) to create a [topic model](https://www.kdnuggets.com/2016/07/text-mining-101-topic-modeling.html) using [Latent Dirichlet Allocation (LDA)](https://medium.com/@lettier/how-does-lda-work-ill-explain-using-emoji-108abf40fa7d) to learn about trends in your data: [PySpark for Natural Language Processing](bit.ly/spark-nlp) 9 | 10 | Lastly, you'll learn to use [Cloud AutoML](https://cloud.google.com/automl/) and [Natural Language APIs](https://cloud.google.com/natural-language/) to derive more insights from your data : [Google Cloud AutoML and Natural Language APIs](bit.ly/social-media-nlp) 11 | -------------------------------------------------------------------------------- /workshops/social-media/slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudDataproc/cloud-dataproc/cb0ced6a30bc88a8ddc58d2870c217273be4958c/workshops/social-media/slides.pdf -------------------------------------------------------------------------------- /workshops/social-media/social-media.txt: -------------------------------------------------------------------------------- 1 | Process BigQuery Data with PySpark 2 | bit.ly/pyspark-bigquery 3 | 4 | Use the spark-nlp library for Natural Language Processing 5 | bit.ly/spark-nlp 6 | 7 | Use Google Cloud AutoML and Natural Language APIs to process Social Media data 8 | bit.ly/social-media-nlp 9 | 10 | Video discussing AutoML 11 | https://www.youtube.com/watch?reload=9&v=XrMtF_inTZ0 12 | 13 | The slides are here also: 14 | https://github.com/bradmiro/cloud-dataproc/tree/master/workshops/social-media/slides.pdf --------------------------------------------------------------------------------