├── .dockerignore
├── CHANGES.txt
├── CONTRIBUTING
├── Dockerfile
├── LICENSE
├── README.md
├── bdutil
├── bdutil_env.sh
├── bigquery_env.sh
├── conf
    ├── hadoop1
    │   ├── bq-mapred-template.xml
    │   ├── core-template.xml
    │   ├── gcs-core-template.xml
    │   ├── hdfs-template.xml
    │   ├── mapred-health-check.sh
    │   └── mapred-template.xml
    └── hadoop2
    │   ├── bigtable-hbase-site-template.xml
    │   ├── bq-mapred-template.xml
    │   ├── capacity-scheduler-template.xml
    │   ├── core-template.xml
    │   ├── gcs-core-template.xml
    │   ├── hdfs-template.xml
    │   ├── mapred-template.xml
    │   └── yarn-template.xml
├── docs
    ├── JOBS.md
    ├── MONITORING.md
    ├── QUICKSTART.md
    └── SHUTDOWN.md
├── extensions
    ├── bigtable
    │   ├── bigtable_env.sh
    │   └── install_hbase_bigtable.sh
    ├── flink
    │   ├── README.md
    │   ├── flink_env.sh
    │   ├── install_flink.sh
    │   └── start_flink.sh
    ├── google
    │   ├── experimental
    │   │   └── resize_env.sh
    │   └── gcs-validate-setup.sh
    ├── hama
    │   ├── README.md
    │   ├── hama_env.sh
    │   ├── install_hama.sh
    │   └── start_hama.sh
    ├── hbase
    │   ├── README.md
    │   ├── hbase_env.sh
    │   ├── install_hbase.sh
    │   └── start_hbase.sh
    ├── querytools
    │   ├── hive-validate-setup.sh
    │   ├── pig-mapred-template.xml
    │   ├── pig-validate-setup.sh
    │   ├── prepare_files.sh
    │   ├── querytools_env.sh
    │   └── setup_profiles.sh
    ├── spark
    │   ├── install_shark.sh
    │   ├── install_spark.sh
    │   ├── spark-validate-setup.sh
    │   ├── spark_configure_startup_processes.sh
    │   ├── spark_env.sh
    │   ├── spark_on_yarn_env.sh
    │   ├── spark_shark_env.sh
    │   ├── start_single_spark_worker.sh
    │   └── start_spark.sh
    ├── storm
    │   ├── README.md
    │   ├── install_storm.sh
    │   ├── install_supervisor.sh
    │   ├── install_zookeeper.sh
    │   ├── jar.xml
    │   ├── start_storm_master.sh
    │   ├── start_storm_worker.sh
    │   └── storm_env.sh
    └── tajo
    │   ├── README.md
    │   ├── configure_tajo.sh
    │   ├── install_tajo.sh
    │   ├── start_tajo.sh
    │   └── tajo_env.sh
├── hadoop-validate-setup.sh
├── hadoop2_env.sh
├── libexec
    ├── bdutil_helpers.sh
    ├── configure_hadoop.sh
    ├── configure_hdfs.sh
    ├── configure_mrv2_mem.py
    ├── configure_startup_processes.sh
    ├── hadoop_helpers.sh
    ├── install_and_configure_bigquery_connector.sh
    ├── install_and_configure_gcs_connector.sh
    ├── install_bdconfig.sh
    ├── install_hadoop.sh
    ├── install_java.sh
    ├── mount_disks.sh
    ├── set_default_fs.sh
    ├── setup_client_nfs.sh
    ├── setup_hadoop_user.sh
    ├── setup_master_nfs.sh
    ├── setup_master_ssh.sh
    ├── setup_worker_ssh.sh
    ├── start_hadoop.sh
    └── start_hadoop2.sh
├── platforms
    ├── cdh
    │   ├── README.md
    │   ├── cdh-core-template.xml
    │   ├── cdh_env.sh
    │   ├── configure_cdh.sh
    │   └── install_cdh.sh
    ├── hdp
    │   ├── README.md
    │   ├── TEST.md
    │   ├── ambari.conf
    │   ├── ambari_env.sh
    │   ├── ambari_functions.sh
    │   ├── ambari_manual_env.sh
    │   ├── ambari_manual_post_deploy_env.sh
    │   ├── configuration.json
    │   ├── create_blueprint.py
    │   ├── install_ambari.sh
    │   ├── install_ambari_components.sh
    │   ├── install_gcs_connector_on_ambari.sh
    │   ├── resources
    │   │   ├── public-hostname-gcloud.sh
    │   │   └── thp-disable.sh
    │   └── update_ambari_config.sh
    ├── mapr
    │   ├── README.md
    │   ├── configure_mapr_instance.sh
    │   ├── mapr_env.sh
    │   ├── mapr_license.txt
    │   ├── node.lst
    │   └── prepare_mapr_image.sh
    └── restart_services.sh
├── sampleapps
    └── querytools
    │   ├── COPYING
    │   ├── README.md
    │   ├── conf
    │       └── hive
    │       │   └── hive-site.xml
    │   ├── examples
    │       └── ngrams
    │       │   ├── hive_query_ngrams.q
    │       │   ├── hive_table_create.sh
    │       │   ├── ngram_hdfs_load.sh
    │       │   ├── ngram_setup.sh
    │       │   └── pig_query_ngrams.pig
    │   ├── project_properties.sh
    │   └── scripts
    │       ├── common_utils.sh
    │       ├── install-packages-on-master__at__host.sh
    │       ├── package_utils.sh
    │       ├── packages-delete-from-gcs__at__host.sh
    │       ├── packages-to-gcs__at__host.sh
    │       ├── setup-hdfs-for-hdtools__at__master.sh
    │       ├── setup-packages__at__master.sh
    │       └── setup-ssh-keys__at__master.sh
├── samples
    ├── bigquery_wordcount.jar
    ├── streaming_word_count.sh
    ├── test-mr-bigquery.sh
    ├── word_count_mapper.py
    └── word_count_reducer.py
├── single_node_env.sh
└── standalone_nfs_cache_env.sh


/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .gitignore
3 | 
4 | *.swp
5 | */*.swp
6 | */*/*.swp
7 | */*/*/*.swp
8 | */*/*/*/*.swp
9 | 


--------------------------------------------------------------------------------
/CONTRIBUTING:
--------------------------------------------------------------------------------
 1 | Want to contribute? Great! First, read this page (including the small print at the end).
 2 | 
 3 | ### Before you contribute
 4 | Before we can use your code, you must sign the
 5 | [Google Individual Contributor License Agreement](https://developers.google.com/open-source/cla/individual?csw=1)
 6 | (CLA), which you can do online. The CLA is necessary mainly because you own the
 7 | copyright to your changes, even after your contribution becomes part of our
 8 | codebase, so we need your permission to use and distribute your code. We also
 9 | need to be sure of various other things—for instance that you'll tell us if you
10 | know that your code infringes on other people's patents. You don't have to sign
11 | the CLA until after you've submitted your code for review and a member has
12 | approved it, but you must do it before we can put your code into our codebase.
13 | Before you start working on a larger contribution, you should get in touch with
14 | us first through the issue tracker with your idea so that we can help out and
15 | possibly guide you. Coordinating up front makes it much easier to avoid
16 | frustration later on.
17 | 
18 | ### Code reviews
19 | All submissions, including submissions by project members, require review. We
20 | use Github pull requests for this purpose.
21 | 
22 | ### The small print
23 | Contributions made by corporations are covered by a different agreement than
24 | the one above, the Software Grant and Corporate Contributor License Agreement.
25 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM google/cloud-sdk
2 | 
3 | ADD . /bdutil/
4 | 
5 | ENTRYPOINT ["/bdutil/bdutil"]
6 | CMD ["--help"]
7 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # This project has been deprecated. Please use [Google Cloud Dataproc](https://cloud.google.com/dataproc) to create managed Apache Hadoop and Apache Spark instances on [Google Compute Engine](https://cloud.google.com/compute).
 2 | 
 3 | # bdutil
 4 | 
 5 | bdutil is a command-line script used to manage Apache Hadoop and Apache Spark instances on [Google Compute Engine](https://cloud.google.com/compute). bdutil manages deployment, configuration, and shutdown of your Hadoop instances.
 6 | 
 7 | ## Requirements
 8 | 
 9 | bdutil depends on the [Google Cloud SDK](https://cloud.google.com/sdk). bdutil is supported in any posix-compliant Bash v3 or greater shell.
10 | 
11 | ## Usage
12 | 
13 | See the [QUICKSTART](/docs/QUICKSTART.md) file in the `docs` directory to learn how to set up your Hadoop instances using bdutil.
14 | 
15 | 1. Install and configure the [Google Cloud SDK](https://cloud.google.com/sdk) if you have already not done so
16 | 1. Clone this repository with `git clone https://github.com/GoogleCloudPlatform/bdutil.git`
17 | 1. Modify the following variables in the bdutil_env.sh file:
18 |   1. `PROJECT` - Set to the project ID for all bdutil commands. The project value will be overridden in the following order (where 1 overrides 2, and 2 overrides 3):
19 |     * -p flag value, or if not specified then
20 |     * PROJECT value in bdutil_env.sh, or if not specified then
21 |     * gcloud default project value
22 |   1. `CONFIGBUCKET` - Set to a Google Compute Storage bucket that your project has read/write access to.
23 | 1. Run `bdutil --help` for a list of commands.
24 | 
25 | The script implements the following commands, which are very similar:
26 | 
27 | * `bdutil create` creates and starts instances, but will not apply most configuration settings. You can call `bdutil run_command_steps` on instances afterward to apply configuration settings to them. Typically you wouldn't use this, but would use `bdutil deploy` instead.
28 | * `bdutil deploy` creates and starts instances with all the configuration options specified in the command line and any included configuration scripts.
29 | 
30 | ## Components installed
31 | 
32 | The latest release of bdutil is `1.3.5`. This bdutil release installs the following versions of open source components:
33 | 
34 | * Apache Hadoop - 1.2.1 (2.7.1 if you use the `-e` argument)
35 | * Apache Spark - 1.5.0
36 | * Apache Pig - 0.12
37 | * Apache Hive - 1.2.1
38 | 
39 | ## Documentation
40 | 
41 | The following documentation is useful for bdutil.
42 | 
43 | * **[Quickstart](/docs/QUICKSTART.md)** - A guide on how to get started with bdutil quickly.
44 | * **[Jobs](/docs/JOBS.md)** - How to submit jobs (work) to a bdutil cluster.
45 | * **[Monitoring](/docs/MONITORING.md)** - How to monitor bdutil cluster.
46 | * **[Shutdown](/docs/SHUTDOWN.md)** - How shutdown a bdutil cluster.
47 | 


--------------------------------------------------------------------------------
/bigquery_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2013 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a bigquery-enabled Hadoop cluster.
17 | # Usage: ./bdutil deploy bigquery_env.sh
18 | 
19 | GCE_SERVICE_ACCOUNT_SCOPES+=('bigquery')
20 | 
21 | # Whether or not to install and configure the BigQuery connector.
22 | INSTALL_BIGQUERY_CONNECTOR=true
23 | 
24 | 


--------------------------------------------------------------------------------
/conf/hadoop1/bq-mapred-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>mapred.bq.project.id</name>
 6 |     <value><envVar name="PROJECT"/></value>
 7 |     <description>
 8 |       Google Cloud Project ID to use for BigQuery operations.
 9 |     </description>
10 |   </property>
11 |   <property>
12 |     <name>mapred.bq.gcs.bucket</name>
13 |     <value><envVar name="CONFIGBUCKET"/></value>
14 |     <description>
15 |       The GCS bucket holding temporary BigQuery data for the input connector.
16 |     </description>
17 |   </property>
18 |   <property>
19 |     <name>mapred.bq.output.buffer.size</name>
20 |     <value>67108864</value>
21 |     <description>
22 |       The size in bytes of the output buffer to use when writing to BigQuery.
23 |     </description>
24 |   </property>
25 | </configuration>
26 | 


--------------------------------------------------------------------------------
/conf/hadoop1/core-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>hadoop.tmp.dir</name>
 6 |     <value><envVar name="HADOOP_TMP_DIR"/></value>
 7 |     <description>A base for other temporary directories.</description>
 8 |   </property>
 9 | </configuration>
10 | 


--------------------------------------------------------------------------------
/conf/hadoop1/gcs-core-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>fs.gs.project.id</name>
 6 |     <value><envVar name="PROJECT"/></value>
 7 |     <description>
 8 |       Google Cloud Project ID with access to configured GCS buckets.
 9 |     </description>
10 |   </property>
11 |   <property>
12 |     <name>fs.gs.system.bucket</name>
13 |     <value><envVar name="CONFIGBUCKET"/></value>
14 |     <description>
15 |       GCS bucket to use as a default bucket if fs.default.name is not a gs: uri.
16 |     </description>
17 |   </property>
18 |   <property>
19 |     <name>fs.gs.working.dir</name>
20 |     <value>/</value>
21 |     <description>
22 |       The directory relative gs: uris resolve in inside of the default bucket.
23 |     </description>
24 |   </property>
25 |   <property>
26 |     <name>fs.gs.impl</name>
27 |     <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
28 |     <description>The FileSystem for gs: (GCS) uris.</description>
29 |   </property>
30 |   <property>
31 |     <name>fs.gs.metadata.cache.enable</name>
32 |     <value>true</value>
33 |     <description>
34 |       If true, a DirectoryListCache will be used to supplement "list" requests
35 |       to GCS to fill in any missing items caused by eventual list consistency,
36 |       intercepting create/delete/copy calls to create cache entries. The
37 |       concrete type is determined with fs.gs.metadata.cache.type.
38 |     </description>
39 |   </property>
40 |   <property>
41 |     <name>fs.gs.metadata.cache.type</name>
42 |     <value><envVar name="GCS_METADATA_CACHE_TYPE"/></value>
43 |     <description>
44 |       Specifies which implementation of DirectoryListCache to use for
45 |       supplementing GCS API "list" requests. Supported implementations:
46 |       IN_MEMORY: Enforces immediate consistency within same Java process.
47 |       FILESYSTEM_BACKED: Enforces consistency across all cooperating processes
48 |       pointed at the same local mirror directory, which may be an NFS directory
49 |       for massively-distributed coordination.
50 |     </description>
51 |   </property>
52 |   <property>
53 |     <name>fs.gs.metadata.cache.directory</name>
54 |     <value><envVar name="GCS_FILE_CACHE_DIRECTORY"/></value>
55 |     <description>
56 |       Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies
57 |       the local path to use as the base path for storing mirrored GCS metadata.
58 |       Must be an absolute path, must be a directory, and must be fully
59 |       readable/writable/executable by any user running processes which use the
60 |       GCS connector.
61 |     </description>
62 |   </property>
63 | </configuration>
64 | 


--------------------------------------------------------------------------------
/conf/hadoop1/hdfs-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>dfs.namenode.rpc-address</name>
 6 |     <value><envVar name="NAMENODE_URI"/></value>
 7 |     <description>
 8 |       RPC address that handles all clients requests. If empty then we'll get
 9 |       thevalue from fs.default.name.The value of this property will take the
10 |       form of hdfs://nn-host1:rpc-port.
11 |     </description>
12 |   </property>
13 |   <property>
14 |     <name>dfs.name.dir</name>
15 |     <value><envVar name="HDFS_NAME_DIR"/></value>
16 |     <description>
17 |       Determines where on the local filesystem the DFS namenode should store the
18 |       name table(fsimage). If this is a comma-delimited list of directories then
19 |       the name table is replicated in all of thedirectories, for redundancy.
20 |     </description>
21 |   </property>
22 |   <property>
23 |     <name>dfs.data.dir</name>
24 |     <value><envVar name="HDFS_DATA_DIRS"/></value>
25 |     <description>
26 |       Determines where on the local filesystem an DFS datanode should store its
27 |       blocks. If this is a comma-delimited list of directories, then data will
28 |       be stored in all named directories, typically on different
29 |       devices.Directories that do not exist are ignored.
30 |     </description>
31 |   </property>
32 |   <property>
33 |     <name>dfs.datanode.data.dir.perm</name>
34 |     <value><envVar name="HDFS_DATA_DIRS_PERM"/></value>
35 |     <description>
36 |       Permissions for the directories on on the local filesystem where the DFS
37 |       data node store its blocks. The permissions can either be octal or
38 |       symbolic.
39 |     </description>
40 |   </property>
41 |   <property>
42 |     <name>dfs.permissions</name>
43 |     <value><envVar name="ENABLE_HDFS_PERMISSIONS"/></value>
44 |     <description>
45 |       If "true", enable permission checking in HDFS. If "false", permission
46 |       checking is turned off, but all other behavior is unchanged. Switching
47 |       from one parameter value to the other does not change the mode, owner or
48 |       group of files or directories.
49 |     </description>
50 |   </property>
51 |   <property>
52 |     <name>dfs.replication</name>
53 |     <value>2</value>
54 |     <description>
55 |       Default block replication. The actual number of replications can be
56 |       specified when the file is created. The default is used if replication
57 |       is not specified in create time.
58 |     </description>
59 |   </property>
60 | </configuration>
61 | 


--------------------------------------------------------------------------------
/conf/hadoop1/mapred-health-check.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Licensed to the Apache Software Foundation (ASF) under one or more
 4 | # contributor license agreements.  See the NOTICE file distributed with
 5 | # this work for additional information regarding copyright ownership.
 6 | # The ASF licenses this file to You under the Apache License, Version 2.0
 7 | # (the "License"); you may not use this file except in compliance with
 8 | # the License.  You may obtain a copy of the License at
 9 | #
10 | #     http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | 
18 | 
19 | # Check to see if the TaskTracker is healthy by checking it's http address.
20 | # Necessary to avoid [MAPREDUCE-4668].
21 | 
22 | # Redirect stderr to stdout.
23 | # Necessary to see problems with health check script in log.
24 | # Will only show stdout if ERROR is present at the beginning of a line.
25 | exec 2>&1
26 | 
27 | BIN=$(dirname "$0")
28 | BIN=$(cd "${BIN}"; pwd)
29 | HADOOP_CMD="${BIN}/hadoop"
30 | 
31 | TASK_TRACKER_HTTP_ADDRESS=$(${HADOOP_CMD} jobtracker -dumpConfiguration 2>/dev/null \
32 |     | sed -n 's/.*task\.tracker\.http\.address","value":"\([.:0-9]*\)".*/\1/p')
33 | 
34 | if [[ -n "${TASK_TRACKER_HTTP_ADDRESS}" ]]; then
35 |   curl -sm 10 -o /dev/null ${TASK_TRACKER_HTTP_ADDRESS}
36 |   ERROR_CODE=$?
37 |   if (( ${ERROR_CODE} == 28 )); then
38 |     echo "ERROR curl timed out trying to reach the TaskTracker web server." \
39 |         "Assuming the TaskTracker is unhealthy."
40 |   elif (( ${ERROR_CODE} )); then
41 |     echo "WARN curl failed to reach the TaskTracker, but did not time out."
42 |   else
43 |     echo "DEBUG Successfully curled TaskTracker."
44 |   fi
45 | else
46 |   echo "WARN Failed to determine TaskTracker http address." \
47 |       "Not checking health."
48 | fi
49 | 
50 | # TaskTracker disregards ERRORs with non-zero exit code.
51 | exit 0
52 | 


--------------------------------------------------------------------------------
/conf/hadoop1/mapred-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>mapred.job.tracker</name>
 6 |     <value><envVar name="JOB_TRACKER_URI"/></value>
 7 |     <description>
 8 |       The host and port that the MapReduce job tracker runsat.  If "local",
 9 |       then jobs are run in-process as a single mapand reduce task.
10 |     </description>
11 |   </property>
12 |   <property>
13 |     <name>mapred.map.tasks</name>
14 |     <value><envVar name="DEFAULT_NUM_MAPS"/></value>
15 |     <description>
16 |       The default number of map tasks per job.Ignored when mapred.job.tracker is
17 |       "local".
18 |     </description>
19 |   </property>
20 |   <property>
21 |     <name>mapred.reduce.tasks</name>
22 |     <value><envVar name="DEFAULT_NUM_REDUCES"/></value>
23 |     <description>
24 |       The default number of reduce tasks per job. Typically set to 99%of the
25 |       cluster's reduce capacity, so that if a node fails the reduces canstill be
26 |       executed in a single wave.Ignored when mapred.job.tracker is
27 |       "local".
28 |     </description>
29 |   </property>
30 |   <property>
31 |     <name>mapred.tasktracker.map.tasks.maximum</name>
32 |     <value><envVar name="MAP_SLOTS"/></value>
33 |     <description>
34 |       The maximum number of map tasks that will be runsimultaneously by a task
35 |       tracker.
36 |     </description>
37 |   </property>
38 |   <property>
39 |     <name>mapred.tasktracker.reduce.tasks.maximum</name>
40 |     <value><envVar name="REDUCE_SLOTS"/></value>
41 |     <description>
42 |       The maximum number of reduce tasks that will be runsimultaneously by a
43 |       task tracker.
44 |     </description>
45 |   </property>
46 |   <property>
47 |     <name>mapred.child.java.opts</name>
48 |     <value><envVar name="JAVAOPTS"/></value>
49 |     <description>
50 |       Java opts for the task tracker child processes.The following symbol, if
51 |       present, will be interpolated: @taskid@ is replacedby current TaskID. Any
52 |       other occurrences of '@' will go unchanged.For example, to enable verbose
53 |       gc logging to a file named for the taskid in/tmp and to set the heap
54 |       maximum to be a gigabyte, pass a 'value' of:-Xmx1024m -verbose:gc
55 |       -Xloggc:/tmp/@taskid@.gcThe configuration variable mapred.child.ulimit can
56 |       be used to control themaximum virtual memory of the child processes.
57 |     </description>
58 |   </property>
59 |   <property>
60 |     <name>mapred.jobtracker.restart.recover</name>
61 |     <value>true</value>
62 |     <description>
63 |       Whether or not to enable (job) recovery upon restart.
64 |     </description>
65 |   </property>
66 |   <property>
67 |     <name>mapreduce.jobtracker.expire.trackers.interval</name>
68 |     <value>60000</value>
69 |     <description>
70 |       The time-interval, in milliseconds, after which a tasktracker is
71 |       declared 'lost' if it doesn't send heartbeats.  The Hadoop
72 |       distribution default is 600000 (10 minutes), we set this to
73 |       60000 (1 minute) to quickly reassign work.
74 |     </description>
75 |   </property>
76 |   <property>
77 |     <name>mapred.local.dir</name>
78 |     <value><envVar name="MAPRED_LOCAL_DIRS"/></value>
79 |     <description>
80 |         Directories on the local machine in which to store mapreduce temp files.
81 |     </description>
82 |   </property>
83 | </configuration>
84 | 


--------------------------------------------------------------------------------
/conf/hadoop2/bigtable-hbase-site-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | 
 4 | <configuration>
 5 |     <property>
 6 |         <name>hbase.client.connection.impl</name>
 7 |         <value><envVar name="BIGTABLE_CONNECTION"/></value>
 8 |     </property>
 9 |     <property>
10 |         <name>google.bigtable.endpoint.host</name>
11 |         <value><envVar name="BIGTABLE_ENDPOINT"/></value>
12 |     </property>
13 |     <property>
14 |         <name>google.bigtable.admin.endpoint.host</name>
15 |         <value><envVar name="BIGTABLE_ADMIN_ENDPOINT"/></value>
16 |     </property>
17 |     <property>
18 |         <name>google.bigtable.project.id</name>
19 |         <value><envVar name="PROJECT"/></value>
20 |     </property>
21 |     <property>
22 |         <name>google.bigtable.zone.name</name>
23 |         <value><envVar name="BIGTABLE_ZONE"/></value>
24 |     </property>
25 |     <property>
26 |         <name>google.bigtable.cluster.name</name>
27 |         <value><envVar name="BIGTABLE_CLUSTER"/></value>
28 |     </property>
29 |     <property>
30 |         <name>yarn.app.mapreduce.am.command-opts</name>
31 |         <value><envVar name="BIGTABLE_BOOT_OPTS"/></value>
32 |     </property>
33 |     <property>
34 |         <name>mapreduce.map.java.opts</name>
35 |         <value><envVar name="BIGTABLE_BOOT_OPTS"/></value>
36 |     </property>
37 |     <property>
38 |         <name>mapreduce.reduce.java.opts</name>
39 |         <value><envVar name="BIGTABLE_BOOT_OPTS"/></value>
40 |     </property>
41 | </configuration>
42 | 


--------------------------------------------------------------------------------
/conf/hadoop2/bq-mapred-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>mapred.bq.project.id</name>
 6 |     <value><envVar name="PROJECT"/></value>
 7 |     <description>
 8 |       Google Cloud Project ID to use for BigQuery operations.
 9 |     </description>
10 |   </property>
11 |   <property>
12 |     <name>mapred.bq.gcs.bucket</name>
13 |     <value><envVar name="CONFIGBUCKET"/></value>
14 |     <description>
15 |       The GCS bucket holding temporary BigQuery data for the input connector.
16 |     </description>
17 |   </property>
18 |   <property>
19 |     <name>mapred.bq.output.buffer.size</name>
20 |     <value>67108864</value>
21 |     <description>
22 |       The size in bytes of the output buffer to use when writing to BigQuery.
23 |     </description>
24 |   </property>
25 | </configuration>
26 | 


--------------------------------------------------------------------------------
/conf/hadoop2/core-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>hadoop.tmp.dir</name>
 6 |     <value><envVar name="HADOOP_TMP_DIR"/></value>
 7 |     <description>A base for other temporary directories.</description>
 8 |   </property>
 9 |   <property>
10 |     <name>fs.defaultFS</name>
11 |     <value>file:///</value>
12 |     <description>
13 |       The name of the default file system. A URI whose scheme and authority
14 |       determine the FileSystem implementation. The uri's scheme determines
15 |       the config property (fs.SCHEME.impl) naming the FileSystem
16 |       implementation class. The uri's authority is used to determine the
17 |       host, port, etc. for a filesystem.
18 |     </description>
19 |   </property>
20 | </configuration>
21 | 


--------------------------------------------------------------------------------
/conf/hadoop2/gcs-core-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>fs.gs.project.id</name>
 6 |     <value><envVar name="PROJECT"/></value>
 7 |     <description>
 8 |       Google Cloud Project ID with access to configured GCS buckets.
 9 |     </description>
10 |   </property>
11 |   <property>
12 |     <name>fs.gs.system.bucket</name>
13 |     <value><envVar name="CONFIGBUCKET"/></value>
14 |     <description>
15 |       GCS bucket to use as a default bucket if fs.default.name is not a gs: uri.
16 |     </description>
17 |   </property>
18 |   <property>
19 |     <name>fs.gs.working.dir</name>
20 |     <value>/</value>
21 |     <description>
22 |       The directory relative gs: uris resolve in inside of the default bucket.
23 |     </description>
24 |   </property>
25 |   <property>
26 |     <name>fs.gs.impl</name>
27 |     <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem</value>
28 |     <description>The FileSystem for gs: (GCS) uris.</description>
29 |   </property>
30 |   <property>
31 |     <name>fs.AbstractFileSystem.gs.impl</name>
32 |     <value>com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS</value>
33 |     <description>The AbstractFileSystem for gs: (GCS) uris.</description>
34 |   </property>
35 |   <property>
36 |     <name>fs.gs.metadata.cache.enable</name>
37 |     <value>true</value>
38 |     <description>
39 |       If true, a DirectoryListCache will be used to supplement "list" requests
40 |       to GCS to fill in any missing items caused by eventual list consistency,
41 |       intercepting create/delete/copy calls to create cache entries. The
42 |       concrete type is determined with fs.gs.metadata.cache.type.
43 |     </description>
44 |   </property>
45 |   <property>
46 |     <name>fs.gs.metadata.cache.type</name>
47 |     <value><envVar name="GCS_METADATA_CACHE_TYPE"/></value>
48 |     <description>
49 |       Specifies which implementation of DirectoryListCache to use for
50 |       supplementing GCS API "list" requests. Supported implementations:
51 |       IN_MEMORY: Enforces immediate consistency within same Java process.
52 |       FILESYSTEM_BACKED: Enforces consistency across all cooperating processes
53 |       pointed at the same local mirror directory, which may be an NFS directory
54 |       for massively-distributed coordination.
55 |     </description>
56 |   </property>
57 |   <property>
58 |     <name>fs.gs.metadata.cache.directory</name>
59 |     <value><envVar name="GCS_FILE_CACHE_DIRECTORY"/></value>
60 |     <description>
61 |       Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies
62 |       the local path to use as the base path for storing mirrored GCS metadata.
63 |       Must be an absolute path, must be a directory, and must be fully
64 |       readable/writable/executable by any user running processes which use the
65 |       GCS connector.
66 |     </description>
67 |   </property>
68 | </configuration>
69 | 


--------------------------------------------------------------------------------
/conf/hadoop2/hdfs-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>dfs.namenode.secondary.http-address</name>
 6 |     <value><envVar name="MASTER_HOSTNAME"/>:50090</value>
 7 |     <description>
 8 |       The secondary namenode http server address and port.
 9 |     </description>
10 |   </property>
11 |   <property>
12 |     <name>dfs.namenode.rpc-address</name>
13 |     <value><envVar name="MASTER_HOSTNAME"/>:8020</value>
14 |     <description>
15 |       RPC address that handles all clients requests. If empty then we'll get
16 |       thevalue from fs.default.name.The value of this property will take the
17 |       form of hdfs://nn-host1:rpc-port.
18 |     </description>
19 |   </property>
20 |   <property>
21 |     <name>dfs.namenode.name.dir</name>
22 |     <value><envVar name="HDFS_NAME_DIR"/></value>
23 |     <description>
24 |       Determines where on the local filesystem the DFS namenode should store the
25 |       name table(fsimage). If this is a comma-delimited list of directories then
26 |       the name table is replicated in all of thedirectories, for redundancy.
27 |     </description>
28 |   </property>
29 |   <property>
30 |     <name>dfs.datanode.data.dir</name>
31 |     <value><envVar name="HDFS_DATA_DIRS"/></value>
32 |     <description>
33 |       Determines where on the local filesystem an DFS datanode should store its
34 |       blocks. If this is a comma-delimited list of directories, then data will
35 |       be stored in all named directories, typically on different
36 |       devices.Directories that do not exist are ignored.
37 |     </description>
38 |   </property>
39 |   <property>
40 |     <name>dfs.datanode.data.dir.perm</name>
41 |     <value><envVar name="HDFS_DATA_DIRS_PERM"/></value>
42 |     <description>
43 |       Permissions for the directories on on the local filesystem where the DFS
44 |       data node store its blocks. The permissions can either be octal or
45 |       symbolic.
46 |     </description>
47 |   </property>
48 |   <property>
49 |     <name>dfs.permissions.enabled</name>
50 |     <value><envVar name="ENABLE_HDFS_PERMISSIONS"/></value>
51 |     <description>
52 |       If "true", enable permission checking in HDFS. If "false", permission
53 |       checking is turned off, but all other behavior is unchanged. Switching
54 |       from one parameter value to the other does not change the mode, owner or
55 |       group of files or directories.
56 |     </description>
57 |   </property>
58 |   <property>
59 |     <name>dfs.permissions.supergroup</name>
60 |     <value>hadoop</value>
61 |     <description>
62 |       The name of the group of super-users.
63 |     </description>
64 |   </property>
65 |   <property>
66 |     <name>dfs.replication</name>
67 |     <value>2</value>
68 |     <description>
69 |       Default block replication. The actual number of replications can be
70 |       specified when the file is created. The default is used if replication
71 |       is not specified in create time.
72 |     </description>
73 |   </property>
74 | </configuration>
75 | 


--------------------------------------------------------------------------------
/conf/hadoop2/yarn-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>yarn.resourcemanager.hostname</name>
 6 |     <value><envVar name="MASTER_HOSTNAME"/></value>
 7 |   </property>
 8 |   <property>
 9 |     <name>yarn.nodemanager.aux-services</name>
10 |     <value>mapreduce_shuffle</value>
11 |   </property>
12 |   <property>
13 |     <name>yarn.nodemanager.resource.memory-mb</name>
14 |     <value><envVar name="NODEMANAGER_MEM_MB"/></value>
15 |     <description>
16 |       Amount of physical memory, in MB, that can be allocated for containers.
17 |     </description>
18 |   </property>
19 |   <property>
20 |     <name>yarn.scheduler.maximum-allocation-mb</name>
21 |     <value><envVar name="YARN_MAX_MEM_MB"/></value>
22 |     <description>
23 |       The maximum allocation for every container request at the RM, in MBs.
24 |       Memory requests higher than this won't take effect, and will get capped
25 |       to this value.
26 |     </description>
27 |   </property>
28 |   <property>
29 |     <name>yarn.scheduler.minimum-allocation-mb</name>
30 |     <value><envVar name="YARN_MIN_MEM_MB"/></value>
31 |     <description>
32 |       The minimum allocation for every container request at the RM, in MBs.
33 |       Memory requests lower than this won't take effect, and the specified
34 |       value will get allocated at minimum.
35 |     </description>
36 |   </property>
37 |   <property>
38 |     <name>yarn.nodemanager.resource.cpu-vcores</name>
39 |     <value><envVar name="NUM_CORES"/></value>
40 |     <description>
41 |       Number of vcores that can be allocated for containers. This is used by
42 |       the RM scheduler when allocating resources for containers. This is not
43 |       used to limit the number of physical cores used by YARN containers.
44 |     </description>
45 |   </property>
46 |   <property>
47 |     <name>yarn.log-aggregation-enable</name>
48 |     <value>false</value>
49 |     <description>
50 |       Enable remote logs aggregation to the default FS.
51 |     </description>
52 |   </property>
53 |   <property>
54 |     <name>yarn.nodemanager.remote-app-log-dir</name>
55 |     <value>/yarn-logs/</value>
56 |     <description>
57 |       The remote path, on the default FS, to store logs.
58 |     </description>
59 |   </property>
60 |   <property>
61 |     <name>yarn.resourcemanager.recovery.enabled</name>
62 |     <value>true</value>
63 |     <description>
64 |       Enable RM to recover state after starting.
65 |     </description>
66 |   </property>
67 |   <property>
68 |     <name>yarn.resourcemanager.fs.state-store.uri</name>
69 |     <value>file:///hadoop/yarn/system/rmstore</value>
70 |     <description>
71 |       URI pointing to the location of the FileSystem path where RM state will
72 |       be stored. This is set on the local file system to avoid collisions in
73 |       GCS.
74 |     </description>
75 |   </property>
76 |   <property>
77 |     <name>yarn.nodemanager.local-dirs</name>
78 |     <value><envVar name="NODEMANAGER_LOCAL_DIRS"/></value>
79 |     <description>
80 |         Directories on the local machine in which to application temp files.
81 |     </description>
82 |   </property>
83 | </configuration>
84 | 


--------------------------------------------------------------------------------
/docs/JOBS.md:
--------------------------------------------------------------------------------
 1 | # Jobs
 2 | 
 3 | Once you have [created a cluster](QUICKSTART.md) you can submit "jobs" (work) to it. These can be entirely new jobs, or jobs you port from an existing environment.
 4 | 
 5 | ## Writing Jobs
 6 | 
 7 | To learn about how to write Hadoop jobs from the ground up, see the [Apache Hadoop tutorials](https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html).
 8 | 
 9 | Google Cloud Platform offers input/output data connectors for your Hadoop and Spark jobs:
10 | 
11 | * [Google BigQuery Connector for Hadoop](https://github.com/GoogleCloudPlatform/bigdata-interop)
12 | * [Google Cloud Storage Connector for Hadoop](https://github.com/GoogleCloudPlatform/bigdata-interop)
13 | 
14 | ## Porting existing jobs
15 | 
16 | When porting a job from HDFS using the Cloud Storage connector for Hadoop, be sure to use the correct file path syntax (`gs://`).
17 | Also note that `FileSystem.append` is unsupported. If you choose Cloud Storage as your default file system, update your MapReduce, if necessary, to avoid using the append method.
18 | 
19 | ## Running jobs
20 | 
21 | Once you've set up a Hadoop cluster and have written or ported a job, you can run the job using the following steps.
22 | 
23 | ### Validating your setup and data
24 | 
25 | First, validate that your cluster is set up, and that you can access your data. Navigate to the command line to execute the following commands.
26 | 
27 | Type `./bdutil shell` to SSH into the master node of the Hadoop cluster.
28 | Type `hadoop fs -ls /` to check the cluster status. If data outputs, the cluster is set up correctly.
29 | 
30 | ### Running the job
31 | 
32 | Next, run the job from the command line, while you are still connected to the cluster via SSH. Always run jobs as the `hadoop` user to avoid having to type full Hadoop paths in commands.
33 | 
34 | The following example runs a sample job called WordCount. Hadoop installations include this sample in the `/home/hadoop/hadoop-install/hadoop-examples-*.jar file.`
35 | 
36 | To run the WordCount job:
37 | 
38 | 1. Navigate to the command line.
39 | 1. Type `./bdutil shell` to SSH into the master node of the Hadoop cluster.
40 | 1. Type `hadoop fs -mkdir input` to create the `input` directory.
41 | Note that when using Google Cloud Storage as your [default file system](QUICKSTART.md), input automatically resolves to `gs://$<CONFIGBUCKET>/input`.
42 | 1. Copy any file from the web, such as the following example text from Apache, by typing the following command: `curl http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/ClusterSetup.html > setup.html`.
43 | 1. Copy one or more text files into the `input` directory. Using the same Apache text in the previous step, type the following command: `hadoop fs -copyFromLocal setup.html input`.
44 | 1. Type `cd /home/hadoop/hadoop-install/` to navigate to the Hadoop install directory.
45 | 1. Type `hadoop jar hadoop-examples-*.jar wordcount input output` to run the job on data in the input directory, and place results in the output directory.
46 | 
47 | ### Checking job status
48 | 
49 | To check the status of of the Hadoop job, visit the [JobTracker page](http://wiki.apache.org/hadoop/JobTracker). See the [monitoring jobs](MONITORING.md) page for instructions on how to access the JobTracker.
50 | 
51 | ### Cleanup
52 | 
53 | After completing the job, make sure to [shut down the Hadoop cluster](SHUTDOWN.md) for the most cost effective solution.
54 | 


--------------------------------------------------------------------------------
/docs/SHUTDOWN.md:
--------------------------------------------------------------------------------
 1 | # Shutting Down a Hadoop Cluster
 2 | 
 3 | Because [Google Compute Engine](https://cloud.google.com/compute/) charges on a [per-minute basis](https://cloud.google.com/compute/pricing), it can be cost effective to shut down your Hadoop cluster once a workload completes. Once the Hadoop cluster is shut down, your data's accessibility depends on the [default file system](QUICKSTART.md) you've chosen:
 4 | 
 5 | * When using HDFS, data is inaccessible.
 6 | * When using [Google Cloud Storage](https://cloud.google.com/storage/), data is accessible with [gsutil](https://cloud.google.com/storage/docs/gsutil) or the [Google Cloud Platform Console](https://console.cloud.google.com/?_ga=1.81149463.169096153.1475769191).
 7 | 
 8 | **When you delete (shutdown) a cluster, the operation is irreversible.**
 9 | 
10 | ## Issuing the delete command
11 | 
12 | To shut down the Hadoop cluster, use the bdutil file included as part of the setup script. Type `./bdutil delete` in the `bdutil-<version>` directory on the command line to shut down the cluster.
13 | 
14 | Here is an example of the command being run.
15 | 
16 |     ~/bdutil-0.35.1$ ./bdutil delete
17 |     Wed Aug 13 16:03:15 PDT 2014: Using local tmp dir for staging files: /tmp/bdutil-20140813-160315
18 |     Wed Aug 13 16:03:15 PDT 2014: Using custom environment-variable file(s): ./bdutil_env.sh
19 |     Wed Aug 13 16:03:15 PDT 2014: Reading environment-variable file: ./bdutil_env.sh
20 |     Delete cluster with following settings?
21 |           CONFIGBUCKET='<CONFIGBUCKET>'
22 |           PROJECT='<PROJECT>'
23 |           GCE_IMAGE='backports-debian-7'
24 |           GCE_ZONE='us-central1-b'
25 |           GCE_NETWORK='default'
26 |           PREFIX='hadoop'
27 |           NUM_WORKERS=2
28 |           MASTER_HOSTNAME='hadoop-m'
29 |           WORKERS='hadoop-w-0 hadoop-w-1'
30 |           BDUTIL_GCS_STAGING_DIR='gs://<CONFIGBUCKET>/bdutil-staging/hadoop-m'
31 |           (y/n) y
32 |     Wed Aug 13 16:03:16 PDT 2014: Deleting hadoop cluster...
33 |     ...Wed Aug 13 16:03:17 PDT 2014: Waiting on async 'deleteinstance' jobs to finish. Might take a while...
34 |     ...
35 |     Wed Aug 13 16:04:11 PDT 2014: Done deleting VMs!
36 |     Wed Aug 13 16:04:11 PDT 2014: Execution complete. Cleaning up temporary files...
37 |     Wed Aug 13 16:04:11 PDT 2014: Cleanup complete.
38 | 
39 | ## Verifying all resources have been removed
40 | 
41 | You **must** use the same bdutil configuration arguments for cluster creation and deletion. Altering the arguments might result in errors when shutting down the cluster. After the script executes, you can type `gcloud compute instances list --project=<PROJECT> | grep <PREFIX>` and verify that no instances are still running. Similarly, you can type `gcloud compute disks list --project=<PROJECT> | grep <PREFIX>` and verify that no created disks accidentally survived.
42 | 


--------------------------------------------------------------------------------
/extensions/bigtable/bigtable_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with HBase installed
17 | # and configured to use Cloud Bigtable.
18 | # Usage: ./bdutil deploy -e extensions/bigtable/bigtable_env.sh.
19 | 
20 | # Directory on each VM in which to install hbase.
21 | HBASE_INSTALL_DIR=/home/hadoop/hbase-install
22 | HBASE_CONF_DIR=${HBASE_INSTALL_DIR}/conf/
23 | BIGTABLE_ENDPOINT=bigtable.googleapis.com
24 | BIGTABLE_ADMIN_ENDPOINT=bigtabletableadmin.googleapis.com
25 | 
26 | BIGTABLE_ZONE=us-central1-b
27 | BIGTABLE_CLUSTER=cluster
28 | 
29 | COMMAND_GROUPS+=(
30 |   "install_bigtable:
31 |      extensions/bigtable/install_hbase_bigtable.sh
32 |   "
33 | )
34 | 
35 | # Installation of bigtable on master and workers
36 | COMMAND_STEPS+=(
37 |   'install_bigtable,install_bigtable'
38 | )
39 | 
40 | ALPN_VERSION=7.1.3.v20150130
41 | ALPN_REMOTE_JAR=http://central.maven.org/maven2/org/mortbay/jetty/alpn/alpn-boot/${ALPN_VERSION}/alpn-boot-${ALPN_VERSION}.jar
42 | BIGTABLE_HBASE_JAR=https://storage.googleapis.com/cloud-bigtable/jars/bigtable-hbase/bigtable-hbase-mapreduce-0.2.2-shaded.jar
43 | BIGTABLE_CONNECTION=com.google.cloud.bigtable.hbase1_1.BigtableConnection
44 | 
45 | # Copied from http://www.us.apache.org/dist/hbase/stable/
46 | # We don't want to overload the apache servers.
47 | HBASE_TARBALL_URI=https://storage.googleapis.com/cloud-bigtable/hbase-dist/hbase-1.1.2/hbase-1.1.2-bin.tar.gz
48 | 
49 | BIGTABLE_LIB_DIR=${HBASE_INSTALL_DIR}/lib/bigtable
50 | ALPN_CLASSPATH=${BIGTABLE_LIB_DIR}/alpn-boot-${ALPN_VERSION}.jar
51 | BIGTABLE_BOOT_OPTS="-Xms1024m -Xmx2048m -Xbootclasspath/p:${ALPN_CLASSPATH}"
52 | 
53 | # TODO: JAVAOPTS gets used in mapred-template.xml.  There should probably be a better way to do this.
54 | JAVAOPTS="$JAVAOPTS -Xbootclasspath/p:$BIGTABLE_BOOT_OPTS"
55 | 
56 | GCE_SERVICE_ACCOUNT_SCOPES+=(
57 |   'https://www.googleapis.com/auth/cloud-bigtable.admin'
58 |   'https://www.googleapis.com/auth/cloud-bigtable.data'
59 |   'https://www.googleapis.com/auth/cloud-bigtable.data.readonly'
60 | )
61 | 


--------------------------------------------------------------------------------
/extensions/flink/README.md:
--------------------------------------------------------------------------------
 1 | Deploying Flink on Google Compute Engine
 2 | ========================================
 3 | 
 4 | Set up a bucket
 5 | ----------------
 6 | 
 7 | If you have not done so, create a bucket for the bdutil config and
 8 | staging files. A new bucket can be created with the gsutil:
 9 | 
10 |     gsutil mb gs://<bucket_name>
11 | 
12 | 
13 | Adapt the bdutil config
14 | -----------------------
15 | 
16 | To deploy Flink with bdutil, adapt at least the following variables in
17 | bdutil_env.sh.
18 | 
19 |     CONFIGBUCKET="<bucket_name>"
20 |     PROJECT="<compute_engine_project_name>"
21 |     NUM_WORKERS=<number_of_workers>
22 | 
23 | 
24 | Bring up a cluster with Flink
25 | -----------------------------
26 | 
27 | To bring up the Flink cluster on Google Compute Engine, execute:
28 | 
29 |     ./bdutil -e extensions/flink/flink_env.sh deploy
30 | 
31 | To run a Flink example job:
32 | 
33 |     ./bdutil shell
34 |     curl http://www.gutenberg.org/cache/epub/2265/pg2265.txt > text
35 |     gsutil cp text gs://<bucket_name>/text
36 |     cd /home/hadoop/flink-install/bin
37 |     ./flink run ../examples/flink-java-examples-*-WordCount.jar gs://<bucket_name>/text gs://<bucket_name>/output


--------------------------------------------------------------------------------
/extensions/flink/flink_env.sh:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #      http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS-IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | # This file contains environment-variable overrides to be used in conjunction
14 | # with bdutil_env.sh in order to deploy a Hadoop + Flink cluster.
15 | # Usage: ./bdutil deploy -e extensions/flink/flink_env.sh
16 | 
17 | 
18 | # In standalone mode, Flink runs the job manager and the task managers (workers)
19 | # on the cluster without using YARN containers. Flink also supports YARN
20 | # deployment which will be implemented in future version of the Flink bdutil plugin.
21 | FLINK_MODE="standalone"
22 | 
23 | # URIs of tarballs for installation.
24 | FLINK_HADOOP1_TARBALL_URI='gs://flink-dist/flink-0.10.1-bin-hadoop1-scala_2.10.tgz'
25 | # Hadoop v2.7 build
26 | FLINK_HADOOP2_TARBALL_URI='gs://flink-dist/flink-0.10.1-bin-hadoop27-scala_2.10.tgz'
27 | 
28 | # Directory on each VM in which to install each package.
29 | FLINK_INSTALL_DIR='/home/hadoop/flink-install'
30 | 
31 | # Optional JVM arguments to pass
32 | # Flink config entry: env.java.opts:
33 | FLINK_JAVA_OPTS="-DsomeOption=value"
34 | 
35 | # Heap memory used by the job manager (master) determined by the physical (free) memory of the server
36 | # Flink config entry: jobmanager.heap.mb
37 | FLINK_JOBMANAGER_MEMORY_FRACTION='0.8'
38 | 
39 | # Heap memory used by the task managers (slaves) determined by the physical (free) memory of the servers
40 | # Flink config entry: taskmanager.heap.mb
41 | FLINK_TASKMANAGER_MEMORY_FRACTION='0.8'
42 | 
43 | # Number of task slots per task manager (worker)
44 | # ideally set to the number of physical cpus
45 | # if set to 'auto', the number of slots will be determined automatically
46 | # Flink config entry: taskmanager.numberOfTaskSlots
47 | FLINK_TASKMANAGER_SLOTS='auto'
48 | 
49 | # Default parallelism (number of concurrent actions per task)
50 | # If set to 'auto', this will be determined automatically
51 | # Flink config entry: parallelism.default
52 | FLINK_PARALLELISM='auto'
53 | 
54 | # The number of buffers for the network stack.
55 | # Flink config entry: taskmanager.network.numberOfBuffers
56 | FLINK_NETWORK_NUM_BUFFERS=2048
57 | 
58 | 
59 | COMMAND_GROUPS+=(
60 |   "install_flink:
61 |      extensions/flink/install_flink.sh
62 |   "
63 |   "start_flink:
64 |      extensions/flink/start_flink.sh
65 |   "
66 | )
67 | 
68 | # Installation of flink on master and workers; then start_flink only on master.
69 | COMMAND_STEPS+=(
70 |   'install_flink,install_flink'
71 |   'start_flink,*'
72 | )
73 | 


--------------------------------------------------------------------------------
/extensions/flink/install_flink.sh:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #      http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS-IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | 
14 | # fail if undeclared variables are used
15 | set -o nounset
16 | # exit on error
17 | set -o errexit
18 | 
19 | 
20 | # Figure out which tarball to use based on which Hadoop version is being used.
21 | set +o nounset
22 | HADOOP_BIN="sudo -u hadoop ${HADOOP_INSTALL_DIR}/bin/hadoop"
23 | HADOOP_VERSION=$(${HADOOP_BIN} version | tr -cd [:digit:] | head -c1)
24 | set -o nounset
25 | if [[ "${HADOOP_VERSION}" == '2' ]]; then
26 |   FLINK_TARBALL_URI=${FLINK_HADOOP2_TARBALL_URI}
27 | else
28 |   FLINK_TARBALL_URI=${FLINK_HADOOP1_TARBALL_URI}
29 | fi
30 | 
31 | # Install Flink via this fancy pipe
32 | gsutil cat "${FLINK_TARBALL_URI}" | tar -C /home/hadoop/ -xzv
33 | mv /home/hadoop/flink* "${FLINK_INSTALL_DIR}"
34 | 
35 | # List all task managers (workers) in the slaves file
36 | # The task managers will be brought up by the job manager (master)
37 | echo ${WORKERS[@]} | tr ' ' '\n' > ${FLINK_INSTALL_DIR}/conf/slaves
38 | 
39 | # Create temp file in hadoop directory which might be mounted to other storage than os
40 | FLINK_TASKMANAGER_TEMP_DIR="/hadoop/flink/tmp"
41 | mkdir -p ${FLINK_TASKMANAGER_TEMP_DIR}
42 | chgrp hadoop -R /hadoop/flink
43 | chmod 777 -R /hadoop/flink
44 | 
45 | # Calculate the memory allocations, MB, using 'free -m'. Floor to nearest MB.
46 | TOTAL_MEM=$(free -m | awk '/^Mem:/{print $2}')
47 | FLINK_JOBMANAGER_MEMORY=$(python -c \
48 |     "print int(${TOTAL_MEM} * ${FLINK_JOBMANAGER_MEMORY_FRACTION})")
49 | FLINK_TASKMANAGER_MEMORY=$(python -c \
50 |     "print int(${TOTAL_MEM} * ${FLINK_TASKMANAGER_MEMORY_FRACTION})")
51 | 
52 | # Determine the number of task slots
53 | if [[ "${FLINK_TASKMANAGER_SLOTS}" == "auto" ]] ; then
54 |     FLINK_TASKMANAGER_SLOTS=`grep -c processor /proc/cpuinfo`
55 | fi
56 | 
57 | # Determine the default parallelism
58 | if [[ "${FLINK_PARALLELISM}" == "auto" ]] ; then
59 |     FLINK_PARALLELISM=$(python -c \
60 |     "print ${NUM_WORKERS} * ${FLINK_TASKMANAGER_SLOTS}")
61 | fi
62 | 
63 | # Apply Flink settings by appending them to the default config
64 | cat << EOF >> ${FLINK_INSTALL_DIR}/conf/flink-conf.yaml
65 | jobmanager.rpc.address: ${MASTER_HOSTNAME}
66 | jobmanager.heap.mb: ${FLINK_JOBMANAGER_MEMORY}
67 | taskmanager.heap.mb: ${FLINK_TASKMANAGER_MEMORY}
68 | taskmanager.numberOfTaskSlots: ${FLINK_TASKMANAGER_SLOTS}
69 | parallelism.default: ${FLINK_PARALLELISM}
70 | taskmanager.network.numberOfBuffers: ${FLINK_NETWORK_NUM_BUFFERS}
71 | env.java.opts: ${FLINK_JAVA_OPTS}
72 | taskmanager.tmp.dirs: ${FLINK_TASKMANAGER_TEMP_DIR}
73 | fs.hdfs.hadoopconf: ${HADOOP_CONF_DIR}
74 | EOF
75 | 
76 | # Find the Hadoop lib dir so and add its gcs-connector to the Flink lib dir
77 | set +o nounset
78 | if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then
79 |   . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh"
80 | fi
81 | if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \
82 |     [[ -n "${HADOOP_PREFIX}" ]]; then
83 |   LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}"
84 | else
85 |   LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib"
86 | fi
87 | set -o nounset
88 | # Get jar name and path
89 | GCS_JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR})
90 | LOCAL_GCS_JAR="${LIB_JARS_DIR}/${GCS_JARNAME}"
91 | # create link in Flink lib dir
92 | ln -s "${LOCAL_GCS_JAR}" "${FLINK_INSTALL_DIR}/lib/"
93 | 
94 | 
95 | # Assign ownership of everything to the 'hadoop' user.
96 | chown -R hadoop:hadoop /home/hadoop/
97 | # Make the Flink log directory writable
98 | chmod 777 ${FLINK_INSTALL_DIR}/log
99 | 


--------------------------------------------------------------------------------
/extensions/flink/start_flink.sh:
--------------------------------------------------------------------------------
 1 | # Licensed under the Apache License, Version 2.0 (the "License");
 2 | # you may not use this file except in compliance with the License.
 3 | # You may obtain a copy of the License at
 4 | #
 5 | #      http://www.apache.org/licenses/LICENSE-2.0
 6 | #
 7 | # Unless required by applicable law or agreed to in writing, software
 8 | # distributed under the License is distributed on an "AS-IS" BASIS,
 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 | 
13 | set -o nounset
14 | set -o errexit
15 | 
16 | if [[ ${FLINK_MODE} == 'standalone' ]]; then
17 |   sudo -u hadoop ${FLINK_INSTALL_DIR}/bin/start-cluster.sh
18 | fi


--------------------------------------------------------------------------------
/extensions/google/experimental/resize_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Plugin which allows manually resizing bdutil-deployed clusters. To resize
16 | # upwards, set NEW_NUM_WORKERS to the new, larger value, keeping the old
17 | # NUM_WORKERS (or -n flag) at the existing cluster size. Then:
18 | #
19 | # Deploy only the new workers, e.g. {hadoop-w-2, hadoop-w-3, hadoop-w-4}:
20 | # ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh deploy
21 | #
22 | # Explicitly start the Hadoop daemons on just the new workers:
23 | # ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh run_command -t workers -- "service hadoop-hdfs-datanode start && service hadoop-mapreduce-tasktracker start"
24 | #
25 | # If using Spark as well, explicitly start the Spark daemons on the new workers:
26 | # ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh run_command -t workers -u extensions/spark/start_single_spark_worker.sh -- "./start_single_spark_worker.sh"
27 | #
28 | # Edit your base config to reflect your new cluster size:
29 | # echo NUM_WORKERS=5 >> my_base_env.sh
30 | #
31 | # When resizing down, simply set the base NUM_WORKERS to the desired smaller
32 | # size, and set NEW_NUM_WORKERS equal to the current cluster size; this can
33 | # be thought of as "undo-ing" a "resize upwards" command:
34 | # ./bdutil -e my_base_env.sh -n 2 -e extensions/google/experimental/resize_env.sh delete
35 | # echo NUM_WORKERS=2 >> my_base_env.sh
36 | #
37 | # TODO(user): Merge into bdutil as a core command.
38 | NEW_NUM_WORKERS=5
39 | 
40 | # During resizes, make sure to avoid touching the master node.
41 | SKIP_MASTER=true
42 | 
43 | # Save away the base evaluate_late_variable_bindings function so we can
44 | # override it and replace the WORKERS array.
45 | copy_func evaluate_late_variable_bindings old_evaluate_late_variable_bindings
46 | 
47 | function evaluate_late_variable_bindings() {
48 |   old_evaluate_late_variable_bindings
49 | 
50 |   WORKERS=()
51 |   WORKER_ATTACHED_PDS=()
52 | 
53 |   local worker_suffix='w'
54 |   local master_suffix='m'
55 |   if (( ${OLD_HOSTNAME_SUFFIXES} )); then
56 |     echo 'WARNING: Using deprecated -nn and -dn naming convention'
57 |     worker_suffix='dn'
58 |     master_suffix='nn'
59 |   fi
60 |   for ((i = ${NUM_WORKERS}; i < ${NEW_NUM_WORKERS}; i++)); do
61 |     local shift_i=$((${i} - ${NUM_WORKERS}))
62 |     WORKERS[${shift_i}]="${PREFIX}-${worker_suffix}-${i}"
63 |   done
64 |   for ((i = ${NUM_WORKERS}; i < ${NEW_NUM_WORKERS}; i++)); do
65 |     local shift_i=$((${i} - ${NUM_WORKERS}))
66 |     WORKER_ATTACHED_PDS[${shift_i}]="${WORKERS[${shift_i}]}-pd"
67 |   done
68 | 
69 |   local num_workers_to_add=$((${NEW_NUM_WORKERS} - ${NUM_WORKERS}))
70 |   NUM_WORKERS=${num_workers_to_add}
71 | }
72 | 


--------------------------------------------------------------------------------
/extensions/hama/README.md:
--------------------------------------------------------------------------------
 1 | Deploying Hama on Google Compute Engine
 2 | ===============================================
 3 | 
 4 | Apache Hama
 5 | -----------
 6 | Apache Hama is a framework for Big Data analytics which uses the Bulk Synchronous Parallel (BSP) computing model, which was established in 2012 as a Top-Level Project of The Apache Software Foundation. 
 7 | 
 8 | It provides not only pure BSP programming model but also vertex and neuron centric programming models, inspired by Google's Pregel and DistBelief.
 9 | 
10 | Basic Usage
11 | -----------
12 | 
13 | Basic installation of [Apache Hama](http://hama.apache.org/) alongside Hadoop on Google Cloud Platform.
14 | 
15 |     ./bdutil -e extensions/hama/hama_env.sh deploy
16 | 
17 | Or alternatively, using shorthand syntax:
18 | 
19 |     ./bdutil -e hama deploy
20 | 
21 | Status
22 | ------
23 | 
24 | This plugin is currently considered experimental and not officially supported.
25 | Contributions are welcome.
26 | 


--------------------------------------------------------------------------------
/extensions/hama/hama_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with Hama installed
17 | # and configured.
18 | # Usage: ./bdutil deploy extensions/hama/hama_env.sh.
19 | 
20 | # URIs of tarball to install.
21 | HAMA_TARBALL_URI='gs://hama-dist/hama-dist-0.7.0.tar.gz'
22 | 
23 | # Default Hama dist tarball requires Hadoop 2.
24 | import_env hadoop2_env.sh
25 | 
26 | # Directory on each VM in which to install hama.
27 | HAMA_INSTALL_DIR='/home/hadoop/hama-install'
28 | 
29 | COMMAND_GROUPS+=(
30 |   "install_hama:
31 |      extensions/hama/install_hama.sh
32 |   "
33 |   "start_hama:
34 |      extensions/hama/start_hama.sh
35 |   "
36 | )
37 | 
38 | # Installation of hama on master and workers; then start_hama only on master.
39 | COMMAND_STEPS+=(
40 |   'install_hama,install_hama'
41 |   'start_hama,*'
42 | )
43 | 


--------------------------------------------------------------------------------
/extensions/hama/install_hama.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | set -o nounset
16 | set -o errexit
17 | 
18 | # Get the filename out of the full URI.
19 | HAMA_TARBALL=${HAMA_TARBALL_URI##*/}
20 | 
21 | # Get the tarball, untar it.
22 | gsutil cp ${HAMA_TARBALL_URI} /home/hadoop/${HAMA_TARBALL}
23 | tar -C /home/hadoop -xzvf /home/hadoop/${HAMA_TARBALL}
24 | mv /home/hadoop/hama*/ ${HAMA_INSTALL_DIR}
25 | 
26 | # Set up hama-site.xml to make sure it can access HDFS.
27 | cat << EOF > ${HAMA_INSTALL_DIR}/conf/hama-site.xml
28 | <?xml version="1.0"?>
29 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
30 | <configuration>
31 |   <property>
32 |     <name>bsp.master.address</name>
33 |     <value>${MASTER_HOSTNAME}:40000</value>
34 |   </property>
35 |   <property>
36 |     <name>hama.zookeeper.quorum</name>
37 |     <value>${MASTER_HOSTNAME}</value>
38 |   </property>
39 |   <property>
40 |     <name>fs.defaultFS</name>
41 |     <value>hdfs://${MASTER_HOSTNAME}:8020/</value>
42 |   </property>
43 | </configuration>
44 | EOF
45 | 
46 | # Set up all workers to be groomservers.
47 | echo ${WORKERS[@]} | tr ' ' '\n' > ${HAMA_INSTALL_DIR}/conf/groomservers
48 | 
49 | # Symlink the Hadoop hdfs-site.xml to hama's "copy" of it.
50 | ln -s ${HADOOP_CONF_DIR}/hdfs-site.xml ${HAMA_INSTALL_DIR}/conf/hdfs-site.xml
51 | 
52 | # Explicitly set up JAVA_HOME for hama.
53 | JAVA_HOME=$(readlink -f $(which java) | sed 's|/bin/java$||')
54 | cat << EOF >> ${HAMA_INSTALL_DIR}/conf/hama-env.sh
55 | export JAVA_HOME=${JAVA_HOME}
56 | EOF
57 | 
58 | # Add the hama 'bin' path to the .bashrc so that it's easy to call 'hama'
59 | # during interactive ssh session.
60 | add_to_path_at_login "${HAMA_INSTALL_DIR}/bin"
61 | 
62 | # Assign ownership of everything to the 'hadoop' user.
63 | chown -R hadoop:hadoop /home/hadoop/ ${HAMA_INSTALL_DIR}
64 | 


--------------------------------------------------------------------------------
/extensions/hama/start_hama.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.  #
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #      http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS-IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | set -o nounset
15 | set -o errexit
16 | 
17 | sudo -u hadoop ${HAMA_INSTALL_DIR}/bin/start-bspd.sh
18 | 


--------------------------------------------------------------------------------
/extensions/hbase/README.md:
--------------------------------------------------------------------------------
 1 | Deploying Apache HBase on Google Compute Engine
 2 | ===============================================
 3 | 
 4 | Basic Usage
 5 | -----------
 6 | 
 7 | Basic installation of [Apache HBase](http://hbase.apache.org/) alongside Hadoop on Google Cloud Platform.
 8 | 
 9 |     ./bdutil -e extensions/hbase/hbase_env.sh deploy
10 | 
11 | Or alternatively, using shorthand syntax:
12 | 
13 |     ./bdutil -e hbase deploy
14 | 
15 | Status
16 | ------
17 | 
18 | This plugin is currently considered experimental and not officially supported.
19 | Contributions are welcome.
20 | 


--------------------------------------------------------------------------------
/extensions/hbase/hbase_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with HBase installed
17 | # and configured.
18 | # Usage: ./bdutil deploy extensions/hbase/hbase_env.sh.
19 | 
20 | # URIs of tarball to install.
21 | HBASE_TARBALL_URI='gs://hbase-dist/hbase-0.94.19.tar.gz'
22 | 
23 | # Directory on each VM in which to install hbase.
24 | HBASE_INSTALL_DIR='/home/hadoop/hbase-install'
25 | 
26 | COMMAND_GROUPS+=(
27 |   "install_hbase:
28 |      extensions/hbase/install_hbase.sh
29 |   "
30 |   "start_hbase:
31 |      extensions/hbase/start_hbase.sh
32 |   "
33 | )
34 | 
35 | # Installation of hbase on master and workers; then start_hbase only on master.
36 | COMMAND_STEPS+=(
37 |   'install_hbase,install_hbase'
38 |   'start_hbase,*'
39 | )
40 | 


--------------------------------------------------------------------------------
/extensions/hbase/install_hbase.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | set -o nounset
16 | set -o errexit
17 | 
18 | # Get the filename out of the full URI.
19 | HBASE_TARBALL=${HBASE_TARBALL_URI##*/}
20 | 
21 | # Get the tarball, untar it.
22 | gsutil cp ${HBASE_TARBALL_URI} /home/hadoop/${HBASE_TARBALL}
23 | tar -C /home/hadoop -xzvf /home/hadoop/${HBASE_TARBALL}
24 | mv /home/hadoop/hbase*/ ${HBASE_INSTALL_DIR}
25 | 
26 | # Set up hbase-site.xml to make sure it can access HDFS.
27 | cat << EOF > ${HBASE_INSTALL_DIR}/conf/hbase-site.xml
28 | <?xml version="1.0"?>
29 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
30 | <configuration>
31 |   <property>
32 |     <name>hbase.rootdir</name>
33 |     <value>hdfs://${MASTER_HOSTNAME}:8020/hbase</value>
34 |   </property>
35 |   <property>
36 |     <name>hbase.zookeeper.quorum</name>
37 |     <value>${MASTER_HOSTNAME}</value>
38 |   </property>
39 |   <property>
40 |     <name>hbase.cluster.distributed</name>
41 |     <value>true</value>
42 |   </property>
43 | </configuration>
44 | EOF
45 | 
46 | # Set up all workers to be regionservers.
47 | echo ${WORKERS[@]} | tr ' ' '\n' > ${HBASE_INSTALL_DIR}/conf/regionservers
48 | 
49 | # Symlink the Hadoop hdfs-site.xml to hbase's "copy" of it.
50 | ln -s ${HADOOP_CONF_DIR}/hdfs-site.xml ${HBASE_INSTALL_DIR}/conf/hdfs-site.xml
51 | 
52 | # Explicitly set up JAVA_HOME for hbase.
53 | JAVA_HOME=$(readlink -f $(which java) | sed 's|/bin/java$||')
54 | cat << EOF >> ${HBASE_INSTALL_DIR}/conf/hbase-env.sh
55 | export JAVA_HOME=${JAVA_HOME}
56 | EOF
57 | 
58 | # Add the hbase 'bin' path to the .bashrc so that it's easy to call 'hbase'
59 | # during interactive ssh session.
60 | add_to_path_at_login "${HBASE_INSTALL_DIR}/bin"
61 | 
62 | # Assign ownership of everything to the 'hadoop' user.
63 | chown -R hadoop:hadoop /home/hadoop/ ${HBASE_INSTALL_DIR}
64 | 


--------------------------------------------------------------------------------
/extensions/hbase/start_hbase.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.  #
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #      http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS-IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | set -o nounset
15 | set -o errexit
16 | 
17 | sudo -u hadoop ${HBASE_INSTALL_DIR}/bin/start-hbase.sh
18 | 


--------------------------------------------------------------------------------
/extensions/querytools/hive-validate-setup.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | #
  3 | # Copyright 2014 Google Inc. All Rights Reserved.
  4 | #
  5 | # Licensed under the Apache License, Version 2.0 (the "License");
  6 | # you may not use this file except in compliance with the License.
  7 | # You may obtain a copy of the License at
  8 | #
  9 | #      http://www.apache.org/licenses/LICENSE-2.0
 10 | #
 11 | # Unless required by applicable law or agreed to in writing, software
 12 | # distributed under the License is distributed on an "AS-IS" BASIS,
 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 | # See the License for the specific language governing permissions and
 15 | # limitations under the License.
 16 | 
 17 | # Runs a basic Hive script.
 18 | # Usage: ./bdutil shell < extensions/querytools/hive-validate-setup.sh
 19 | 
 20 | # File hadoop-confg.sh
 21 | HADOOP_CONFIGURE_CMD=''
 22 | HADOOP_CONFIGURE_CMD=$(find ${HADOOP_LIBEXEC_DIR} ${HADOOP_PREFIX} \
 23 |     /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -name hadoop-config.sh | head -n 1)
 24 | 
 25 | # If hadoop-config.sh has been found source it
 26 | if [[ -n "${HADOOP_CONFIGURE_CMD}" ]]; then
 27 |   echo "Sourcing '${HADOOP_CONFIGURE_CMD}'"
 28 |   . ${HADOOP_CONFIGURE_CMD}
 29 | fi
 30 | 
 31 | HADOOP_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -wholename '*/bin/hadoop' | head -n 1)
 32 | HIVE_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/hive* /usr/*/current/hive* -wholename '*/bin/hive' | head -n 1)
 33 | 
 34 | #if it is still empty then dont run the tests
 35 | if [[ "${HADOOP_CMD}" == '' ]]; then
 36 |   echo "Did not find hadoop'"
 37 |   exit 1
 38 | fi
 39 | 
 40 | #if it is still empty then dont run the tests
 41 | if [[ "${HIVE_CMD}" == '' ]]; then
 42 |   echo "Did not find hive'"
 43 |   exit 1
 44 | fi
 45 | 
 46 | # Upload sample data.
 47 | PARENT_DIR="/tmp/validate_hive_$(date +%s)"
 48 | ${HADOOP_CMD} fs -mkdir ${PARENT_DIR}
 49 | ${HADOOP_CMD} fs -put /etc/passwd ${PARENT_DIR}
 50 | 
 51 | # Create a basic Hive script.
 52 | echo "Creating hivetest.hive..."
 53 | cat << EOF > hivetest.hive
 54 | DROP TABLE bdutil_validate_hive_tbl;
 55 | 
 56 | CREATE TABLE bdutil_validate_hive_tbl (
 57 |   user STRING,
 58 |   dummy STRING,
 59 |   uid INT,
 60 |   gid INT,
 61 |   name STRING,
 62 |   home STRING,
 63 |   shell STRING
 64 | )
 65 | ROW FORMAT DELIMITED
 66 |     FIELDS TERMINATED BY ':'
 67 | STORED AS TEXTFILE;
 68 | 
 69 | LOAD DATA INPATH '${PARENT_DIR}/passwd'
 70 | OVERWRITE INTO TABLE bdutil_validate_hive_tbl;
 71 | 
 72 | SELECT shell, COUNT(*) shell_count
 73 | FROM bdutil_validate_hive_tbl
 74 | GROUP BY shell
 75 | ORDER BY shell_count DESC, shell DESC;
 76 | EOF
 77 | cat hivetest.hive
 78 | 
 79 | # Run the script.
 80 | ${HIVE_CMD} -f hivetest.hive > /tmp/hiveoutput.txt
 81 | 
 82 | echo "Hive output:"
 83 | cat /tmp/hiveoutput.txt
 84 | 
 85 | # Run an equivalent pipeline of command-line invocations which pull out the
 86 | # 'shell' field, sort/uniq to get the counts of each occurence, then finally
 87 | # format to match Hive by printing tab-separated fields:
 88 | # shell_count\tshell
 89 | cat /etc/passwd | awk -F: '{print $7}' | sort | uniq -c | sort -nr | \
 90 |     awk '{print $2, $1}' | sed "s/ /\t/" > /tmp/goldenoutput.txt
 91 | 
 92 | echo "Expected output:"
 93 | cat /tmp/goldenoutput.txt
 94 | 
 95 | EXIT_CODE=0
 96 | if diff /tmp/hiveoutput.txt /tmp/goldenoutput.txt; then
 97 |   echo "Verified correct output."
 98 | else
 99 |   echo "Hive output doesn't match expected output!"
100 |   EXIT_CODE=1
101 | fi
102 | 
103 | # Cleanup.
104 | echo "Cleaning up test data: ${PARENT_DIR}"
105 | ${HADOOP_CMD} fs -rmr -skipTrash ${PARENT_DIR}
106 | 
107 | exit ${EXIT_CODE}
108 | 


--------------------------------------------------------------------------------
/extensions/querytools/pig-mapred-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>mapred.working.dir</name>
 6 |     <value>/user/<envVar name="HDP_USER"/></value>
 7 |     <description>
 8 |       The FileSystem working directory to use for relative paths.
 9 |     </description>
10 |   </property>
11 | </configuration>
12 | 


--------------------------------------------------------------------------------
/extensions/querytools/pig-validate-setup.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Copyright 2014 Google Inc. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS-IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | # Runs a basic Pig script.
18 | # Usage: ./bdutil shell < extensions/querytools/pig-validate-setup.sh
19 | 
20 | # File hadoop-confg.sh
21 | HADOOP_CONFIGURE_CMD=''
22 | HADOOP_CONFIGURE_CMD=$(find ${HADOOP_LIBEXEC_DIR} ${HADOOP_PREFIX} \
23 |     /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -name hadoop-config.sh | head -n 1)
24 | 
25 | # If hadoop-config.sh has been found source it
26 | if [[ -n "${HADOOP_CONFIGURE_CMD}" ]]; then
27 |   echo "Sourcing '${HADOOP_CONFIGURE_CMD}'"
28 |   . ${HADOOP_CONFIGURE_CMD}
29 | fi
30 | 
31 | HADOOP_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -wholename '*/bin/hadoop' | head -n 1)
32 | PIG_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/pig* /usr/*/current/pig* -wholename '*/bin/pig' | head -n 1)
33 | 
34 | #if it is still empty then dont run the tests
35 | if [[ "${HADOOP_CMD}" == '' ]]; then
36 |   echo "Did not find hadoop'"
37 |   exit 1
38 | fi
39 | 
40 | #if it is still empty then dont run the tests
41 | if [[ "${PIG_CMD}" == '' ]]; then
42 |   echo "Did not find pig'"
43 |   exit 1
44 | fi
45 | 
46 | # Upload sample data.
47 | PARENT_DIR="/tmp/validate_pig_$(date +%s)"
48 | ${HADOOP_CMD} fs -mkdir ${PARENT_DIR}
49 | ${HADOOP_CMD} fs -put /etc/passwd ${PARENT_DIR}
50 | 
51 | # Create a basic Pig script.
52 | echo "Creating pigtest.pig..."
53 | cat << EOF > pigtest.pig
54 | SET job.name 'PigTest';
55 | data = LOAD '${PARENT_DIR}/passwd'
56 |       USING PigStorage(':')
57 |       AS (user:CHARARRAY, dummy:CHARARRAY, uid:INT, gid:INT,
58 |           name:CHARARRAY, home:CHARARRAY, shell:CHARARRAY);
59 | grp = GROUP data BY (shell);
60 | counts = FOREACH grp GENERATE
61 |         FLATTEN(group) AS shell:CHARARRAY, COUNT(data) AS shell_count:LONG;
62 | res = ORDER counts BY shell_count DESC, shell DESC;
63 | DUMP res;
64 | EOF
65 | cat pigtest.pig
66 | 
67 | # Run the script.
68 | ${PIG_CMD} pigtest.pig > /tmp/pigoutput.txt
69 | 
70 | echo "Pig output:"
71 | cat /tmp/pigoutput.txt
72 | 
73 | # Run an equivalent pipeline of command-line invocations which pull out the
74 | # 'shell' field, sort/uniq to get the counts of each occurence, then finally
75 | # format to match Pig by printing comma-separated fields in parens:
76 | # (shell_count,shell)
77 | cat /etc/passwd | awk -F: '{print $7}' | sort | uniq -c | sort -nr | \
78 |     awk '{print $2, $1}' | sed "s/\(.*\) \(.*\)/(\1,\2)/" > /tmp/goldenoutput.txt
79 | 
80 | echo "Expected output:"
81 | cat /tmp/goldenoutput.txt
82 | 
83 | EXIT_CODE=0
84 | if diff /tmp/pigoutput.txt /tmp/goldenoutput.txt; then
85 |   echo "Verified correct output."
86 | else
87 |   echo "Pig output doesn't match expected output!"
88 |   EXIT_CODE=1
89 | fi
90 | 
91 | # Cleanup.
92 | echo "Cleaning up test data: ${PARENT_DIR}"
93 | ${HADOOP_CMD} fs -rmr -skipTrash ${PARENT_DIR}
94 | 
95 | exit ${EXIT_CODE}
96 | 


--------------------------------------------------------------------------------
/extensions/querytools/prepare_files.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2013 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Places files into expected files; generates a project_properties.sh file
16 | # which other scripts are designed to use.
17 | 
18 | set -o nounset
19 | set -o errexit
20 | 
21 | mkdir -p ${MASTER_PACKAGE_DIR}/conf/hive
22 | mv hive-site.xml ${MASTER_PACKAGE_DIR}/conf/hive/
23 | 
24 | # Dynamically generated a project_properties.sh file which only contains the
25 | # environment variables which must be derived from existing hadoop deployment
26 | # variables.
27 | cat << EOF >> project_properties.sh
28 | SUPPORTED_HDPTOOLS='hive pig'
29 | ZONE=${GCE_ZONE}
30 | MASTER=${MASTER_HOSTNAME}
31 | HADOOP_HOME=${HADOOP_INSTALL_DIR}
32 | EOF
33 | 
34 | # Explicitly set a schemeless working directory, otherwise as of Pig 0.12.0
35 | # PigInputFormat fails to use input paths which are not from the "default"
36 | # FileSystem. No need to clobber existing working-directory settings.
37 | bdconfig merge_configurations \
38 |     --configuration_file ${HADOOP_CONF_DIR}/mapred-site.xml \
39 |     --source_configuration_file pig-mapred-template.xml \
40 |     --resolve_environment_variables \
41 |     --create_if_absent \
42 |     --noclobber
43 | 


--------------------------------------------------------------------------------
/extensions/querytools/querytools_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2013 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with Pig and Hive
17 | # installed, using the Cloud Solutions sampleapp.
18 | # Usage: ./bdutil deploy extensions/querytools/querytools_env.sh
19 | 
20 | # Set the default filesystem to be 'hdfs' since Pig and Hive will tend to rely
21 | # on multi-stage pipelines more heavily then plain Hadoop MapReduce, and thus
22 | # be vulnerable to eventual list consistency. Okay to read initially from GCS
23 | # using explicit gs:// URIs and likewise to write the final output to GCS,
24 | # letting any intermediate cross-stage items get stored in HDFS temporarily.
25 | DEFAULT_FS='hdfs'
26 | 
27 | # URIs of tarballs to install.
28 | PIG_TARBALL_URI='gs://querytools-dist/pig-0.12.0.tar.gz'
29 | HIVE_TARBALL_URI='gs://querytools-dist/hive-0.12.0-bin.tar.gz'
30 | 
31 | # Constants normally in project_properties.sh from the sampleapp, but which we
32 | # can propagate out here as shared environment variables instead.
33 | HADOOP_MAJOR_VERSION='1'
34 | HADOOP_USER='hadoop'
35 | HADOOP_GROUP='hadoop'
36 | HDP_USER='hadoop'
37 | HDP_USER_HOME='/home/hadoop'
38 | MASTER_INSTALL_DIR='/home/hadoop'
39 | PACKAGES_DIR='packages'
40 | SCRIPTS_DIR='scripts'
41 | MASTER_PACKAGE_DIR='/tmp/hdp_tools'
42 | HDFS_TMP_DIR='/tmp'
43 | HADOOP_TMP_DIR='/hadoop/tmp'
44 | 
45 | # File dependencies to be used by the scripts.
46 | if [[ -n "${BDUTIL_DIR}" ]]; then
47 |   UPLOAD_FILES+=(
48 |     "${BDUTIL_DIR}/extensions/querytools/pig-mapred-template.xml"
49 |     "${BDUTIL_DIR}/sampleapps/querytools/conf/hive/hive-site.xml"
50 |     "${BDUTIL_DIR}/sampleapps/querytools/scripts/common_utils.sh"
51 |     "${BDUTIL_DIR}/sampleapps/querytools/scripts/package_utils.sh"
52 |   )
53 | fi
54 | COMMAND_GROUPS+=(
55 |   "install_querytools:
56 |      extensions/querytools/prepare_files.sh
57 |      sampleapps/querytools/scripts/setup-packages__at__master.sh
58 |      sampleapps/querytools/scripts/setup-hdfs-for-hdtools__at__master.sh
59 |      extensions/querytools/setup_profiles.sh
60 |   "
61 | )
62 | 
63 | # Querytools installation only needs to run on master.
64 | COMMAND_STEPS+=(
65 |   'install_querytools,*'
66 | )
67 | 


--------------------------------------------------------------------------------
/extensions/querytools/setup_profiles.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Sets up login shells to  have the "hive" and "pig" binaries in the system PATH
16 | # environment variable.
17 | 
18 | add_to_path_at_login "${MASTER_INSTALL_DIR}/pig/bin"
19 | add_to_path_at_login "${MASTER_INSTALL_DIR}/hive/bin"
20 | 


--------------------------------------------------------------------------------
/extensions/spark/install_shark.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | set -o errexit
16 | 
17 | # Figure out which tarball to use based on which Hadoop version is being used.
18 | set +o nounset
19 | HADOOP_BIN="sudo -u hadoop ${HADOOP_INSTALL_DIR}/bin/hadoop"
20 | HADOOP_VERSION=$(${HADOOP_BIN} version | tr -cd [:digit:] | head -c1)
21 | set -o nounset
22 | if [[ "${HADOOP_VERSION}" == '2' ]]; then
23 |   SHARK_TARBALL_URI=${SHARK_HADOOP2_TARBALL_URI}
24 | else
25 |   SHARK_TARBALL_URI=${SHARK_HADOOP1_TARBALL_URI}
26 | fi
27 | 
28 | SHARK_TARBALL=${SHARK_TARBALL_URI##*/}
29 | gsutil cp ${SHARK_TARBALL_URI} /home/hadoop/${SHARK_TARBALL}
30 | tar -C /home/hadoop -xzvf /home/hadoop/${SHARK_TARBALL}
31 | mv /home/hadoop/shark*/ ${SHARK_INSTALL_DIR}
32 | 
33 | # Find the Hadoop lib dir so that we can link its gcs-connector into the
34 | # Shark library path.
35 | set +o nounset
36 | if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then
37 |   . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh"
38 | fi
39 | if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \
40 |     [[ -n "${HADOOP_PREFIX}" ]]; then
41 |   LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}"
42 | else
43 |   LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib"
44 | fi
45 | set -o nounset
46 | 
47 | GCS_JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR})
48 | LOCAL_GCS_JAR="${LIB_JARS_DIR}/${GCS_JARNAME}"
49 | ln -s ${LOCAL_GCS_JAR} ${SHARK_INSTALL_DIR}/lib/
50 | 
51 | # Calculate the memory allocations, MB, using 'free -m'. Floor to nearest MB.
52 | TOTAL_MEM=$(free -m | awk '/^Mem:/{print $2}')
53 | SHARK_MEM=$(python -c \
54 |     "print int(${TOTAL_MEM} * ${SHARK_MEM_FRACTION})")
55 | 
56 | 
57 | # Point shark at scala, hadoop, hive, spark, and the spark master.
58 | cat << EOF >> ${SHARK_INSTALL_DIR}/conf/shark-env.sh
59 | export HADOOP_HOME=${HADOOP_INSTALL_DIR}
60 | export SCALA_HOME=${SCALA_INSTALL_DIR}
61 | export SPARK_HOME=${SPARK_INSTALL_DIR}
62 | export SPARK_MEM=${SHARK_MEM}m
63 | 
64 | # Set spark master by copying from spark-env.sh
65 | $(grep 'MASTER=' ${SPARK_INSTALL_DIR}/conf/spark-env.sh)
66 | EOF
67 | 
68 | # Add the spark 'bin' path to the .bashrc so that it's easy to call 'spark'
69 | # during interactive ssh session.
70 | add_to_path_at_login "${SHARK_INSTALL_DIR}/bin"
71 | 
72 | # Assign ownership of everything to the 'hadoop' user.
73 | chown -R hadoop:hadoop /home/hadoop/
74 | 


--------------------------------------------------------------------------------
/extensions/spark/spark_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop + Spark cluster.
17 | # Usage: ./bdutil deploy -e extensions/spark/spark_env.sh
18 | 
19 | # An enum of [default|standalone|yarn-client|yarn-cluster].
20 | # In standalone mode, Spark runs it's own daemons and job submissions are made
21 | # to the master daemon by default. yarn-client and yarn-cluster both run inside
22 | # YARN containers. default preserves Spark's default.
23 | SPARK_MODE="standalone"
24 | 
25 | # URIs of tarballs to install.
26 | SCALA_TARBALL_URI='gs://spark-dist/scala-2.10.3.tgz'
27 | SPARK_HADOOP1_TARBALL_URI='gs://spark-dist/spark-1.5.0-bin-hadoop1.tgz'
28 | SPARK_HADOOP2_TARBALL_URI='gs://spark-dist/spark-1.5.0-bin-hadoop2.6.tgz'
29 | 
30 | # Directory on each VM in which to install each package.
31 | SCALA_INSTALL_DIR='/home/hadoop/scala-install'
32 | SPARK_INSTALL_DIR='/home/hadoop/spark-install'
33 | 
34 | # Worker memory to provide in spark-env.sh, as a fraction of total physical
35 | # memory. In the event of running Spark on YARN the NODEMANAGER_MEMORY_FRACTION
36 | # in hadoop2_env.sh replaces this.
37 | SPARK_WORKER_MEMORY_FRACTION='0.8'
38 | 
39 | # Default memory per Spark executor, as a fraction of total physical memory;
40 | # used for default spark-shell if not overridden with a -D option. Can be used
41 | # to accommodate multiple spark-shells on a single cluster, e.g. if this value
42 | # is set to half the value of SPARK_WORKER_MEMORY_FRACTION then two sets of
43 | # executors can run simultaneously. However, in such a case, then at the time
44 | # of starting 'spark-shell' you must specify fewer cores, e.g.:
45 | # SPARK_JAVA_OPTS="-Dspark.cores.max=4" spark-shell
46 | SPARK_EXECUTOR_MEMORY_FRACTION='0.8'
47 | 
48 | # Max memory to use by the single Spark daemon process on each node; may need to
49 | # increase when using larger clusters. Expressed as a fraction of total physical
50 | # memory.
51 | SPARK_DAEMON_MEMORY_FRACTION='0.15'
52 | 
53 | # Install JDK because certain Spark commands assume jar is installed.
54 | INSTALL_JDK_DEVEL='true'
55 | 
56 | # Spark-standalone master UI is on port 8080.
57 | MASTER_UI_PORTS=('8080' ${MASTER_UI_PORTS[@]})
58 | 
59 | COMMAND_GROUPS+=(
60 |   "install_spark:
61 |      extensions/spark/install_spark.sh
62 |   "
63 |   "spark_configure_startup:
64 |      extensions/spark/spark_configure_startup_processes.sh
65 |   "
66 |   "start_spark:
67 |      extensions/spark/start_spark.sh
68 |   "
69 | )
70 | 
71 | # Installation of spark on master and workers; then start_spark only on master.
72 | COMMAND_STEPS+=(
73 |   'install_spark,install_spark'
74 |   'spark_configure_startup,spark_configure_startup'
75 |   'start_spark,*'
76 | )
77 | 


--------------------------------------------------------------------------------
/extensions/spark/spark_on_yarn_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop 2 + Spark on YARN cluster.
17 | # Usage: ./bdutil deploy -e extensions/spark/spark_env.sh
18 | 
19 | # Install YARN and Spark
20 | import_env hadoop2_env.sh
21 | import_env extensions/spark/spark_env.sh
22 | 
23 | # Clusters must have at least 3 workers to run spark-validate-setup.sh
24 | # and many other Spark jobs.
25 | if [[ -z "${NUM_WORKERS}" ]] || (( ${NUM_WORKERS} < 3 )); then
26 |   NUM_WORKERS=3
27 | fi
28 | 
29 | # An enum of [default|standalone|yarn-client|yarn-cluster].
30 | # yarn-client and yarn-cluster both run Spark jobs inside YARN containers
31 | # yarn-cluster also runs the spark-class or spark-submit process inside a
32 | # container, but it cannot support spark-shell, without specifying another
33 | # master.
34 | # e.g. spark-shell --master yarn-client.
35 | SPARK_MODE='yarn-client'
36 | 


--------------------------------------------------------------------------------
/extensions/spark/spark_shark_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop + Spark + Shark cluster.
17 | # Usage: ./bdutil deploy -e extensions/spark/spark_shark_env.sh
18 | 
19 | import_env extensions/spark/spark_env.sh
20 | 
21 | # URIs of tarballs to install.
22 | SHARK_HADOOP1_TARBALL_URI='gs://spark-dist/shark-0.9.1-bin-hadoop1.tgz'
23 | SHARK_HADOOP2_TARBALL_URI='gs://spark-dist/shark-0.9.1-bin-hadoop2.tgz'
24 | # Shark is not compatible with Spark 1.x
25 | SPARK_HADOOP1_TARBALL_URI='gs://spark-dist/spark-0.9.2-bin-hadoop1.tgz'
26 | SPARK_HADOOP2_TARBALL_URI='gs://spark-dist/spark-0.9.2-bin-hadoop2.tgz'
27 | 
28 | # Directory on each VM in which to install shark
29 | SHARK_INSTALL_DIR='/home/hadoop/shark-install'
30 | 
31 | # Value to give Shark indicating the amount of Spark worker memory
32 | # available/usable by Shark per worker. Expressed as a fraction of total
33 | # physical memory.
34 | SHARK_MEM_FRACTION='0.8'
35 | 
36 | COMMAND_GROUPS+=(
37 |   "install_shark:
38 |      extensions/spark/install_shark.sh
39 |   "
40 | )
41 | 
42 | # Installation of shark
43 | COMMAND_STEPS+=(
44 |   'install_shark,install_shark'
45 | )
46 | 


--------------------------------------------------------------------------------
/extensions/spark/start_single_spark_worker.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Can be used on an individual Spark worker when running Spark in "standalone"
16 | # mode. Requires all other setup of files, configuration, etc., to be complete
17 | # already.
18 | 
19 | set -o errexit
20 | 
21 | source hadoop-env-setup.sh
22 | 
23 | SPARK_MASTER="spark://${MASTER_HOSTNAME}:7077"
24 | sudo -u hadoop ${SPARK_INSTALL_DIR}/sbin/spark-daemon.sh start \
25 |     org.apache.spark.deploy.worker.Worker 0 ${SPARK_MASTER}
26 | 


--------------------------------------------------------------------------------
/extensions/spark/start_spark.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | set -o nounset
16 | set -o errexit
17 | 
18 | if [[ ${SPARK_MODE} == 'standalone' ]]; then
19 |   sudo -u hadoop ${SPARK_INSTALL_DIR}/sbin/start-all.sh
20 | fi
21 | 


--------------------------------------------------------------------------------
/extensions/storm/README.md:
--------------------------------------------------------------------------------
 1 | Deploying Apache Storm on Google Compute Engine
 2 | ===============================================
 3 | 
 4 | Basic Usage
 5 | -----------
 6 | 
 7 | Basic installation of [Apache Storm](https://storm.apache.org/) alongside Hadoop on Google Cloud Platform.
 8 | 
 9 |     ./bdutil -e extensions/storm/storm_env.sh deploy
10 | 
11 | Or alternatively, using shorthand syntax:
12 | 
13 |     ./bdutil -e storm deploy
14 | 
15 | Status
16 | ------
17 | 
18 | This plugin is currently considered experimental and not officially supported.
19 | Contributions are welcome.
20 | 


--------------------------------------------------------------------------------
/extensions/storm/install_storm.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | set -o errexit
15 | 
16 | # Set up Storm
17 | STORM_MASTER_INSTANCE="${MASTER_HOSTNAME}"
18 | 
19 | STORM_INSTALL_TMP_DIR="/storm-$(date +%s)"
20 | mkdir -p ${STORM_INSTALL_TMP_DIR}
21 | 
22 | STORM_TARBALL_BASENAME=$(grep -o '[^/]*\.tar.gz' <<< ${STORM_TARBALL_URI})
23 | STORM_LOCAL_TARBALL="${STORM_INSTALL_TMP_DIR}/${STORM_TARBALL_BASENAME}"
24 | download_bd_resource ${STORM_TARBALL_URI} ${STORM_LOCAL_TARBALL}
25 | 
26 | tar -C ${STORM_INSTALL_TMP_DIR} -xvzf ${STORM_LOCAL_TARBALL}
27 | mkdir -p $(dirname ${STORM_INSTALL_DIR})
28 | mv ${STORM_INSTALL_TMP_DIR}/apache-storm*/ ${STORM_INSTALL_DIR}
29 | 
30 | STORM_LIB_DIR="${STORM_INSTALL_DIR}/lib"
31 | 
32 | if (( ${ENABLE_STORM_BIGTABLE} )); then
33 |   GOOGLE_STORM_LIB_DIR="${STORM_INSTALL_DIR}/lib/google"
34 |   mkdir -p "${GOOGLE_STORM_LIB_DIR}"
35 |   # Download the alpn jar.  The Alpn jar should be a fully qualified URL.
36 |   # download_bd_resource needs a fully qualified file path and not just a
37 |   # directory name to put the file in when the file to download starts with
38 |   # http://.
39 |   ALPN_JAR_NAME="${ALPN_REMOTE_JAR##*/}"
40 |   ALPN_BOOT_JAR="${GOOGLE_STORM_LIB_DIR}/${ALPN_JAR_NAME}"
41 |   download_bd_resource "${ALPN_REMOTE_JAR}" "${ALPN_BOOT_JAR}"
42 | fi
43 | 
44 | 
45 | mkdir -p ${STORM_VAR}
46 | cat << EOF | tee -a ${STORM_INSTALL_DIR}/conf/storm.yaml
47 | storm.zookeeper.servers:
48 |   - "${STORM_MASTER_INSTANCE}"
49 | nimbus.host: "${STORM_MASTER_INSTANCE}"
50 | storm.local.dir: "${STORM_VAR}"
51 | supervisor.slots.ports:
52 |   - 6700
53 |   - 6701
54 |   - 6702
55 |   - 6703
56 | storm.messaging.transport: 'backtype.storm.messaging.netty.Context'
57 | storm.messaging.netty.server_worker_threads: 1
58 | storm.messaging.netty.client_worker_threads: 1
59 | storm.messaging.netty.buffer_size: 5242880
60 | storm.messaging.netty.max_retries: 100
61 | storm.messaging.netty.max_wait_ms: 1000
62 | storm.messaging.netty.min_wait_ms: 100
63 | 
64 | EOF
65 | 
66 | if (( ${ENABLE_STORM_BIGTABLE} )); then
67 |   cat << EOF | tee -a "${STORM_INSTALL_DIR}/conf/storm.yaml"
68 | worker.childopts: "-Xbootclasspath/p:${ALPN_BOOT_JAR}"
69 | EOF
70 | fi
71 | 
72 | # Add the storm 'bin' path to the .bashrc so that it's easy to call 'storm'
73 | # during interactive ssh session.
74 | add_to_path_at_login "${STORM_INSTALL_DIR}/bin"
75 | 
76 | # TODO(user): Fix this a better way.
77 | cp /home/hadoop/hadoop-install/lib/gcs-connector*.jar /home/hadoop/storm-install/lib/
78 | cp /home/hadoop/hadoop-install/hadoop-core*.jar /home/hadoop/storm-install/lib/
79 | cp /home/hadoop/hadoop-install/lib/commons-configuration*.jar /home/hadoop/storm-install/lib/
80 | 
81 | # Assign ownership of everything to the 'hadoop' user.
82 | chown -R hadoop:hadoop /home/hadoop/ ${STORM_VAR}
83 | 


--------------------------------------------------------------------------------
/extensions/storm/install_supervisor.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Installs Supervisor using apt-get.
16 | 
17 | # Strip the debian mirrors to force only using the GCS mirrors. Not ideal for
18 | # production usage due to stripping security.debian.org, but reduces external
19 | # load for non-critical use cases.
20 | 
21 | install_application 'supervisor'
22 | 
23 | # No easy way to install supervisor on CentOS and have it configured
24 | if ! [[ -x $(which apt-get) ]] && [[ -x $(which yum) ]]; then
25 |   # Install supervisor
26 |   yum install -y python-setuptools
27 |   easy_install supervisor
28 |   mkdir -p /etc/supervisor/conf.d/
29 |   mkdir -p /var/log/supervisor
30 | 
31 |   # Set up the supervisor configuration
32 |   cat > supervisord.conf <<EOF
33 | ; supervisor config file
34 | 
35 | [unix_http_server]
36 | file=/var/run//supervisor.sock   ; (the path to the socket file)
37 | chmod=0766
38 | 
39 | [supervisord]
40 | logfile=/var/log/supervisor/supervisord.log ; (main log file;default $CWD/supervisord.log)
41 | pidfile=/var/run/supervisord.pid ; (supervisord pidfile;default supervisord.pid)
42 | childlogdir=/var/log/supervisor            ; ('AUTO' child log dir, default $TEMP)
43 | 
44 | ; the below section must remain in the config file for RPC
45 | ; (supervisorctl/web interface) to work, additional interfaces may be
46 | ; added by defining them in separate rpcinterface: sections
47 | [rpcinterface:supervisor]
48 | supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface
49 | 
50 | [supervisorctl]
51 | serverurl=unix:///var/run//supervisor.sock ; use a unix:// URL  for a unix socket
52 | 
53 | ; The [include] section can just contain the "files" setting.  This
54 | ; setting can list multiple files (separated by whitespace or
55 | ; newlines).  It can also contain wildcards.  The filenames are
56 | ; interpreted as relative to this file.  Included files *cannot*
57 | ; include files themselves.
58 | 
59 | [include]
60 | files = /etc/supervisor/conf.d/*.conf
61 | EOF
62 | 
63 |   # Move the configuration file into the right folder
64 |   mv supervisord.conf /etc/
65 | 
66 |   # Start Supervisor
67 |   supervisord
68 |   supervisorctl start all
69 | fi
70 | 
71 | perl -pi -e 's|(?<=chmod=).*|0766| ;' \
72 |  /etc/supervisor/supervisord.conf
73 | 
74 | 


--------------------------------------------------------------------------------
/extensions/storm/install_zookeeper.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Set up ZooKeeper
16 | ZK_INSTALL_TMP_DIR="/zookeeper-$(date +%s)"
17 | mkdir -p ${ZK_INSTALL_TMP_DIR}
18 | 
19 | ZOOKEEPER_TARBALL_BASENAME=$(\
20 |     grep -o '[^/]*\.tar.gz' <<< ${ZOOKEEPER_TARBALL_URI})
21 | ZOOKEEPER_LOCAL_TARBALL="${ZK_INSTALL_TMP_DIR}/${ZOOKEEPER_TARBALL_BASENAME}"
22 | download_bd_resource ${ZOOKEEPER_TARBALL_URI} ${ZOOKEEPER_LOCAL_TARBALL}
23 | 
24 | tar -C ${ZK_INSTALL_TMP_DIR} -xvzf ${ZOOKEEPER_LOCAL_TARBALL}
25 | mkdir -p $(dirname ${ZOOKEEPER_INSTALL_DIR})
26 | mv ${ZK_INSTALL_TMP_DIR}/zookeeper*/ ${ZOOKEEPER_INSTALL_DIR}
27 | 
28 | mkdir -p ${ZOOKEEPER_VAR}/data
29 | mkdir -p ${ZOOKEEPER_VAR}/log
30 | 
31 | # Copies the sample config into the actual config, but sets the dataDir value to be the data subdirectory of the zookeeper var directory
32 | perl -p -e "s|(?<=dataDir=).*|${ZOOKEEPER_VAR}/data|" \
33 |   ${ZOOKEEPER_INSTALL_DIR}/conf/zoo_sample.cfg > ${ZOOKEEPER_INSTALL_DIR}/conf/zoo.cfg
34 | 
35 | # Sets the dir locations for the log and tracelog and sets root.logger value to "INFO, ROLLINGFILE" instead of "INFO, CONSOLE"
36 | perl -pi -e 's|^(zookeeper.(?:trace)?log.dir=).*|$1'${ZOOKEEPER_VAR}'/log| ; s|(?<=zookeeper.root.logger=).*|INFO, ROLLINGFILE| ;' \
37 |   ${ZOOKEEPER_INSTALL_DIR}/conf/log4j.properties
38 | 
39 | 
40 | # Add the zookeeper 'bin' path to the .bashrc so that it's easy to call access
41 | # zookeeper files during interactive ssh session.
42 | add_to_path_at_login "${ZOOKEEPER_INSTALL_DIR}/bin"
43 | 
44 | # Assign ownership of everything to the 'hadoop' user.
45 | chown -R hadoop:hadoop /home/hadoop/ ${ZOOKEEPER_VAR}
46 | 
47 | # Define Supervisor Configuration for ZooKeeper
48 | cat > /etc/supervisor/conf.d/zookeeper.conf <<EOF
49 | [program:zookeeper]
50 | command=${ZOOKEEPER_INSTALL_DIR}/bin/zkServer.sh start-foreground
51 | numprocs=1
52 | autostart=true
53 | autorestart=true
54 | user=hadoop
55 | redirect_stderr=true
56 | stdout_logfile=${ZOOKEEPER_VAR}/log/stdout.log
57 | EOF
58 | 


--------------------------------------------------------------------------------
/extensions/storm/jar.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <!--
 3 | 
 4 |   This is a sample configuration file for setting up logging for the
 5 |   "storm jar" command. In particular, it demonstrates how you can
 6 |   enable debug logging in Storm for a package.
 7 | 
 8 |   Starting with v0.9.0 Storm uses log4j-over-slf4j and logback,
 9 |   so it uses a logback config file rather than a log4j config file
10 |   as it did previous to v0.9.
11 | 
12 |   In order to use this file, copy it into "/home/hadoop/storm-install/logback"
13 |   on the master of your cluster (where you run the "storm jar" command).
14 |   Then edit /home/hadoop/storm-install/bin/storm to make it use this file;
15 |   look for the "jar" function, and direct Storm as to where it should find
16 |   its logging config file by adding the "logopts=" line and the ", logopts"
17 |   in the "jvmopts=" line as shown here:
18 | 
19 |     def jar(jarfile, klass, *args):
20 |         logopts="-Dlogback.configurationFile=" + STORM_DIR + "/logback/jar.xml"
21 |         exec_storm_class(
22 |             klass,
23 |             jvmtype="-client",
24 |             extrajars=[jarfile, USER_CONF_DIR, STORM_DIR + "/bin"],
25 |             args=args,
26 |             jvmopts=JAR_JVM_OPTS + ["-Dstorm.jar=" + jarfile, logopts])
27 | 
28 |   Now when you run "storm jar" it will load this config file.
29 | 
30 |   To enable debugging for a particular package, edit or copy the
31 |   "logger" element, change the "name" attribute to match your package,
32 |   and set the "value" attribute on the "level" element to DEBUG.
33 |   You can also quiet down the logging by changing the root level from
34 |   INFO to WARN or ERROR (or even OFF).
35 | 
36 | -->
37 | 
38 | <configuration scan="true" scanPeriod="60 seconds">
39 | 
40 |   <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
41 |     <encoder>
42 |       <pattern>%d{yyyy-MM-dd HH:mm:ss} %c{1} [%p] %m%n</pattern>
43 |     </encoder>
44 |   </appender>
45 | 
46 |   <root level="INFO">
47 |     <appender-ref ref="CONSOLE"/>
48 |   </root>
49 | 
50 |   <logger name="com.google" additivity="false">
51 |     <level value="DEBUG" />
52 |     <appender-ref ref="CONSOLE" />
53 |   </logger>
54 | 
55 | </configuration>
56 | 


--------------------------------------------------------------------------------
/extensions/storm/start_storm_master.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Define Supervisor Configurations for Storm
16 | cat > /etc/supervisor/conf.d/storm.conf <<EOF
17 | [program:storm-nimbus]
18 | command=${STORM_INSTALL_DIR}/bin/storm nimbus
19 | numprocs=1
20 | autostart=true
21 | autorestart=true
22 | user=hadoop
23 | redirect_stderr=true
24 | stdout_logfile=${STORM_VAR}/nimbus.log
25 | 
26 | [program:storm-ui]
27 | command=${STORM_INSTALL_DIR}/bin/storm ui
28 | numprocs=1
29 | autostart=true
30 | autorestart=true
31 | user=hadoop
32 | redirect_stderr=true
33 | stdout_logfile=${STORM_VAR}/ui.log
34 | 
35 | [program:storm-logviewer]
36 | command=${STORM_INSTALL_DIR}/bin/storm logviewer
37 | numprocs=1
38 | autostart=true
39 | autorestart=true
40 | user=hadoop
41 | redirect_stderr=true
42 | stdout_logfile=${STORM_VAR}/logviewer.log
43 | EOF
44 | 
45 | # Reload supervisor
46 | supervisorctl reload
47 | 


--------------------------------------------------------------------------------
/extensions/storm/start_storm_worker.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Define Supervisor Configurations for Storm
16 | cat > /etc/supervisor/conf.d/storm.conf <<EOF
17 | [program:storm-supervisor]
18 | command=${STORM_INSTALL_DIR}/bin/storm supervisor
19 | numprocs=1
20 | autostart=true
21 | autorestart=true
22 | user=root
23 | redirect_stderr=true
24 | stdout_logfile=${STORM_VAR}/supervisor.log
25 | 
26 | [program:storm-logviewer]
27 | command=${STORM_INSTALL_DIR}/bin/storm logviewer
28 | numprocs=1
29 | autostart=true
30 | autorestart=true
31 | user=hadoop
32 | redirect_stderr=true
33 | stdout_logfile=${STORM_VAR}/logviewer.log
34 | EOF
35 | 
36 | # Reload supervisor
37 | supervisorctl reload
38 | 


--------------------------------------------------------------------------------
/extensions/storm/storm_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop + Storm cluster.
17 | # Usage: ./bdutil deploy -e extensions/storm/storm_env.sh
18 | 
19 | # Enabling this adds the ALPN jar to the Storm workers and deploy
20 | # Required for Cloud Bigtable usage
21 | ENABLE_STORM_BIGTABLE=${ENABLE_STORM_BIGTABLE:-true}
22 | normalize_boolean 'ENABLE_STORM_BIGTABLE'
23 | ALPN_REMOTE_JAR=http://central.maven.org/maven2/org/mortbay/jetty/alpn/alpn-boot/7.0.0.v20140317/alpn-boot-7.0.0.v20140317.jar
24 | 
25 | # Install direcotries
26 | STORM_INSTALL_DIR='/home/hadoop/storm-install'
27 | ZOOKEEPER_INSTALL_DIR='/home/hadoop/zookeeper-install'
28 | 
29 | # local info directories
30 | STORM_VAR='/var/storm'
31 | ZOOKEEPER_VAR='/var/zookeeper'
32 | 
33 | # URIs of tarballs
34 | STORM_TARBALL_URI='gs://storm-dist/apache-storm-0.9.2-incubating.tar.gz'
35 | ZOOKEEPER_TARBALL_URI='gs://zookeeper-dist/zookeeper-3.4.6.tar.gz'
36 | 
37 | # Storm UI is on port 8080.
38 | MASTER_UI_PORTS=('8080' ${MASTER_UI_PORTS[@]})
39 | 
40 | if (( ${ENABLE_STORM_BIGTABLE} )); then
41 | 
42 |   GCE_SERVICE_ACCOUNT_SCOPES+=(
43 |     'https://www.googleapis.com/auth/cloud-bigtable.admin'
44 |     'https://www.googleapis.com/auth/cloud-bigtable.data'
45 |     'https://www.googleapis.com/auth/cloud-bigtable.data.readonly'
46 |  )
47 | fi
48 | 
49 | COMMAND_GROUPS+=(
50 |   "install_storm:
51 |     extensions/storm/install_supervisor.sh
52 |     extensions/storm/install_zookeeper.sh
53 |     extensions/storm/install_storm.sh
54 |   "
55 |   "start_storm_worker:
56 |     extensions/storm/start_storm_worker.sh
57 |   "
58 |   "start_storm_master:
59 |     extensions/storm/start_storm_master.sh
60 |   "
61 | )
62 | 
63 | 
64 | COMMAND_STEPS+=(
65 |   'install_storm,install_storm'
66 |   'start_storm_master,start_storm_worker'
67 | )
68 | 


--------------------------------------------------------------------------------
/extensions/tajo/README.md:
--------------------------------------------------------------------------------
  1 | Deploying Apache Tajo™ on Google Cloud Platform
  2 | ===============================================
  3 | 
  4 | Apache Tajo
  5 | -----------
  6 | 
  7 | Apache Tajo is a robust big data warehouse system. Dubbed "an SQL-on-Hadoop", Tajo is optimized for running low-latency, scalable ad-hoc queries and ETL jobs on large data sets stored on both HDFS and other data sources including Amazon S3 and Google Cloud Storage. By supporting SQL standards and leveraging advanced query optimization techniques, Tajo support both interactive analysis and complex ETL in a single solution.
  8 | 
  9 | This documents explains how to setup Tajo cluster on Google Cloud Platform using bdutil.
 10 | 
 11 | Getting Started
 12 | ---------------
 13 | 
 14 | 1. Install gcloud SDK
 15 | 
 16 |     https://cloud.google.com/sdk/
 17 | 
 18 | 2. Install bdutil Tajo extension
 19 | 
 20 |     $ git clone https://github.com/GoogleCloudPlatform/bdutil.git
 21 | 
 22 | 3. Configure
 23 | 
 24 |     $ vi  bdutil_env.sh
 25 | 
 26 |     ```
 27 |     ############### REQUIRED ENVIRONMENT VARIABLES (no defaults) ##################
 28 | 
 29 |     # A GCS bucket used for sharing generated SSH keys and GHFS configuration.
 30 |     CONFIGBUCKET="YOUR_BUCKET"
 31 | 
 32 |     # The Google Cloud Platform text-based project-id which owns the GCE resources.
 33 |     PROJECT="YOUR_PROJECT_ID"
 34 | 
 35 |     ###############################################################################
 36 | 
 37 |     GCE_ZONE="YOUR_ZONE"
 38 | 
 39 |     # change it to your instance type.
 40 |     GCE_MACHINE_TYPE='n1-standard-4'
 41 | 
 42 |     # number of worker nodes
 43 |     NUM_WORKERS=2
 44 | 
 45 |     # Prefix to be shared by all VM instance names in the cluster, as well as for
 46 |     # SSH configuration between the JobTracker node and the TaskTracker nodes.
 47 |     PREFIX='tajo'
 48 |     ```
 49 | 
 50 |     $ vi extensions/tajo/tajo_env.sh
 51 | 
 52 |     ```
 53 |     # path to tajo tarball
 54 |     TAJO_TARBALL_URI='gs://PATH_TO_TAJO_TARBALL/tajo-x.xx.x.tar.gz'
 55 |     ```
 56 | 
 57 | 4. Using cloudSQL for Tajo meta store (optional)
 58 | 
 59 | By default, Tajo stores its meta data in built-in Derby database in Tajo master node. Since it is ephemeral storage, you'd better use it for test purpose only. For continuous analysis work, using permanent meta store such as cloudSQL is strongly recommended.
 60 | 
 61 | To use existing cloudSQL or MySQL instance for Tajo meta store, set the instance id and connection information.
 62 | Tajo master node need to be allowed to connect catalog server.
 63 | 
 64 |     $ vi extensions/tajo/tajo_env.sh
 65 | 
 66 |     ```
 67 |     CATALOG_HOST="YOUR_DBMS_HOST"
 68 |     CATALOG_ID="YOUR_DBMS_ID"
 69 |     CATALOG_PW="YOUR_DBMS_PW"
 70 |     CATALOG_DB=tajo
 71 |     ```
 72 | 
 73 | To use Derby, leave it blank.
 74 | 
 75 | Deployment
 76 | ----------
 77 | 
 78 | To deploy Tajo with Hadoop2 daemon:
 79 | 
 80 |     $ ./bdutil -e hadoop2,tajo deploy
 81 | 
 82 | Destroy
 83 | -------
 84 | 
 85 | To delete Tajo cluster:
 86 | 
 87 |     ./bdutil -e delete
 88 | 
 89 | Or specify PREFIX,
 90 | 
 91 |     ./bdutil -P tajo delete
 92 | 
 93 | Basic Usage
 94 | -----------
 95 | 
 96 | By default, Tajo install directory is /home/hadoop/tajo-install.
 97 | 
 98 | SSH to Tajo master node:
 99 | 
100 |     gcloud compute ssh --project=YOUR_PROJECT_ID --zone=YOUR_ZONE hadoop-m --ssh-flag="-t" --command="sudo su -l hadoop"
101 | 
102 | Run Tajo command line shell (tsql):
103 | 
104 |     /home/hadoop/tajo-install/bin/tsql
105 | 
106 | Or simply,
107 | 
108 |     tsql
109 | 
110 | To stop and start Tajo daemon (Should run as "hadoop" user):
111 | 
112 |     stop-tajo.sh
113 |     start-tajo.sh
114 | 
115 | To check Tajo status, see Tajo web UI in your browser:
116 | 
117 |     http://TAJO_MASTER_NODE_IP:26080/
118 | 
119 | To connect Tajo from your desktop (eg. via SQL workbench tools), JDBC connection string looks like:
120 | 
121 |     jdbc:tajo://TAJO_MASTER_NODE_IP:26002/dbname
122 | 
123 | Note that 26080 and 26002 port need to be open.
124 | 
125 | Advanced Configuration
126 | ----------------------
127 | 
128 | Refer to Tajo configuration documents for advanced configuration. (http://tajo.apache.org/docs/current/configuration.html)
129 | 
130 | Status
131 | ------
132 | 
133 | This plugin is currently considered experimental and not officially supported.
134 | Contributions are more than welcome.
135 | 
136 | 


--------------------------------------------------------------------------------
/extensions/tajo/install_tajo.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | set -o nounset
16 | set -o errexit
17 | 
18 | # Get the filename out of the full URI.
19 | TAJO_TARBALL=${TAJO_TARBALL_URI##*/}
20 | 
21 | # Get the tarball, untar it.
22 | gsutil cp ${TAJO_TARBALL_URI} /home/hadoop/${TAJO_TARBALL}
23 | tar -C /home/hadoop -xzvf /home/hadoop/${TAJO_TARBALL}
24 | mv /home/hadoop/tajo*/ ${TAJO_INSTALL_DIR}
25 | 
26 | # Add the TAJO 'bin' path to the .bashrc so that it's easy to call 'TAJO'
27 | # during interactive ssh session.
28 | add_to_path_at_login "${TAJO_INSTALL_DIR}/bin"
29 | 
30 | if [ ! -d /hadoop_gcs_connector_metadata_cache ]
31 | then
32 |   mkdir /hadoop_gcs_connector_metadata_cache
33 |   chown hadoop.hadoop /hadoop_gcs_connector_metadata_cache/
34 | fi
35 | 
36 | sudo chown -R hadoop.hadoop ${TAJO_INSTALL_DIR}
37 | 


--------------------------------------------------------------------------------
/extensions/tajo/start_tajo.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.  #
 2 | # Licensed under the Apache License, Version 2.0 (the "License");
 3 | # you may not use this file except in compliance with the License.
 4 | # You may obtain a copy of the License at
 5 | #
 6 | #      http://www.apache.org/licenses/LICENSE-2.0
 7 | #
 8 | # Unless required by applicable law or agreed to in writing, software
 9 | # distributed under the License is distributed on an "AS-IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 | 
14 | set -o nounset
15 | set -o errexit
16 | 
17 | sudo -u hadoop ${TAJO_INSTALL_DIR}/bin/start-tajo.sh
18 | 


--------------------------------------------------------------------------------
/extensions/tajo/tajo_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with Tajo installed
17 | # and configured.
18 | # Usage: ./bdutil -e hadoop2,tajo deploy
19 | 
20 | # URIs of Tajo tarball to install.
21 | # Recommended Tajo version: Apache Tajo 0.11.0 or higher (eg. gs://your_bucket/tajo-x.xx.x.tar.gz)
22 | TAJO_TARBALL_URI='gs://tajo-dist/tajo-0.11.0.tar.gz'
23 | 
24 | # Tajo root directory
25 | TAJO_ROOT_DIR="gs://${CONFIGBUCKET}/tajo"
26 | 
27 | # Tajo heap memory
28 | # Default is 'total_memory - 1024' (MB)
29 | MASTER_HEAPSIZE=
30 | WORKER_HEAPSIZE=
31 | 
32 | # Task min memory of Tajo worker
33 | # Default is 3000 (MB)
34 | WORKER_RESOURCE_MEMORY=3000
35 | 
36 | # Catalog (meta store) setting for Tajo.
37 | # To use cloudSQL or MySQL, set the following variables
38 | # Or leave them blank to use default Derby meta store.
39 | #
40 | # Host or IP of DB Server.
41 | CATALOG_HOST=
42 | # The account ID of DB.
43 | CATALOG_ID=
44 | # The account password of DB.
45 | CATALOG_PW=
46 | # The database name
47 | CATALOG_DB=tajo
48 | 
49 | INSTALL_JDK_DEVEL=true
50 | 
51 | # Tajo will be installed in this directory on each VM
52 | TAJO_INSTALL_DIR='/home/hadoop/tajo-install'
53 | 
54 | if [ `expr "$HADOOP_CONF_DIR" : '.*etc/hadoop'` = 0 ]
55 | then
56 |   echo "Tajo does not support Hadoop1. Run './bdutil -e hadoop2,tajo deploy'"
57 |   exit 1
58 | fi
59 | 
60 | COMMAND_GROUPS+=(
61 |   "install_tajo:
62 |      extensions/tajo/install_tajo.sh
63 |   "
64 |   "configure_tajo:
65 |      extensions/tajo/configure_tajo.sh
66 |   "
67 |   "start_tajo:
68 |      extensions/tajo/start_tajo.sh
69 |   "
70 | )
71 | 
72 | COMMAND_STEPS+=(
73 |   'install_tajo,install_tajo'
74 |   'configure_tajo,configure_tajo'
75 |   'start_tajo,*'
76 | )
77 | 
78 | 


--------------------------------------------------------------------------------
/hadoop2_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Sets environment variables for YARN based Hadoop 2.x deployment
16 | 
17 | GCS_CACHE_CLEANER_LOGGER='INFO,RFA'
18 | 
19 | # URI of Hadoop tarball to be deployed. Must begin with gs:// or http(s)://
20 | # Use 'gsutil ls gs://hadoop-dist/hadoop-*.tar.gz' to list Google supplied options
21 | HADOOP_TARBALL_URI="gs://hadoop-dist/hadoop-2.7.1.tar.gz"
22 | 
23 | # Directory holding config files and scripts for Hadoop
24 | HADOOP_CONF_DIR="${HADOOP_INSTALL_DIR}/etc/hadoop"
25 | 
26 | # Fraction of worker memory to be used for YARN containers
27 | NODEMANAGER_MEMORY_FRACTION=0.8
28 | 
29 | # Decimal number controlling the size of map containers in memory and virtual
30 | # cores. Since by default Hadoop only supports memory based container
31 | # allocation, each map task will be given a container with roughly
32 | # (CORES_PER_MAP_TASK / <total-cores-on-node>) share of the memory available to
33 | # the NodeManager for containers. Thus an n1-standard-4 with CORES_PER_MAP_TASK
34 | # set to 2 would be able to host 4 / 2 = 2 map containers (and no other
35 | # containers). For more details see the script 'libexec/configure-mrv2-mem.py'.
36 | CORES_PER_MAP_TASK=1.0
37 | 
38 | # Decimal number controlling the size of reduce containers in memory and virtual
39 | # cores. See CORES_PER_MAP_TASK for more details.
40 | CORES_PER_REDUCE_TASK=2.0
41 | 
42 | # Decimal number controlling the size of application master containers in memory
43 | # and virtual cores. See CORES_PER_MAP_TASK for more details.
44 | CORES_PER_APP_MASTER=2.0
45 | 
46 | # Connector with Hadoop AbstractFileSystem implemenation for YARN
47 | GCS_CONNECTOR_JAR='https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-1.6.2-hadoop2.jar'
48 | 
49 | BIGQUERY_CONNECTOR_JAR='https://storage.googleapis.com/hadoop-lib/bigquery/bigquery-connector-0.10.3-hadoop2.jar'
50 | 
51 | 
52 | HDFS_DATA_DIRS_PERM='700'
53 | 
54 | # 8088 for YARN, 50070 for HDFS.
55 | MASTER_UI_PORTS=('8088' '50070')
56 | 
57 | # Allow to tune the YARN scheduler to
58 | YARN_SCHEDULER_CAPACITY_MAXIMUM_APPLICATIONS=10000
59 | YARN_SCHEDULER_CAPACITY_MAX_AM_PERCENT=0.2
60 | 
61 | # Use Hadoop 2 specific configuration templates.
62 | if [[ -n "${BDUTIL_DIR}" ]]; then
63 |   UPLOAD_FILES=($(find ${BDUTIL_DIR}/conf/hadoop2 -name '*template.xml'))
64 |   UPLOAD_FILES+=("${BDUTIL_DIR}/libexec/hadoop_helpers.sh")
65 |   UPLOAD_FILES+=("${BDUTIL_DIR}/libexec/configure_mrv2_mem.py")
66 | fi
67 | 
68 | # Use Hadoop 2 specific start scripts
69 | COMMAND_GROUPS+=(
70 |   'deploy_start2:
71 |     libexec/start_hadoop2.sh'
72 | )
73 | 
74 | COMMAND_STEPS=(
75 |   "deploy-ssh-master-setup,*"
76 |   'deploy-core-setup,deploy-core-setup'
77 |   "*,deploy-ssh-worker-setup"
78 |   "deploy-master-nfs-setup,*",
79 |   "deploy-client-nfs-setup,deploy-client-nfs-setup",
80 |   'deploy_start2,*'
81 | )
82 | 


--------------------------------------------------------------------------------
/libexec/configure_hdfs.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Configures HDFS
16 | 
17 | set -e
18 | 
19 | source hadoop_helpers.sh
20 | 
21 | if (( ${ENABLE_HDFS} )); then
22 | 
23 |   HDFS_ADMIN=$(get_hdfs_superuser)
24 | 
25 |   # Location of HDFS metadata on namenode
26 |   export HDFS_NAME_DIR=/hadoop/dfs/name
27 | 
28 |   # If disks are mounted use all of them for HDFS data
29 |   MOUNTED_DISKS=($(find /mnt -maxdepth 1 -mindepth 1))
30 |   if [[ ${#MOUNTED_DISKS[@]} -eq 0 ]]; then
31 |     MOUNTED_DISKS=('')
32 |   fi
33 | 
34 |   # Location of HDFS data blocks on datanodes; for each mounted disk, add the
35 |   # path /mnt/diskname/hadoop/dfs/data as a data directory, or if no mounted
36 |   # disks exist, just go with the absolute path /hadoop/dfs/data.
37 |   HDFS_DATA_DIRS="${MOUNTED_DISKS[@]/%//hadoop/dfs/data}"
38 | 
39 |   # Do not create HDFS_NAME_DIR, or Hadoop will think it is already formatted
40 |   mkdir -p /hadoop/dfs ${HDFS_DATA_DIRS}
41 | 
42 |   chown ${HDFS_ADMIN}:hadoop -L -R /hadoop/dfs ${HDFS_DATA_DIRS}
43 | 
44 |   # Make sure the data dirs have the expected permissions.
45 |   chmod ${HDFS_DATA_DIRS_PERM} ${HDFS_DATA_DIRS}
46 | 
47 |   # Set general Hadoop environment variables
48 | 
49 |   # Calculate the memory allocations, MB, using 'free -m'. Floor to nearest MB.
50 |   TOTAL_MEM=$(free -m | awk '/^Mem:/{print $2}')
51 |   NAMENODE_MEM_MB=$(python -c "print int(${TOTAL_MEM} * \
52 |       ${HDFS_MASTER_MEMORY_FRACTION} / 2)")
53 |   SECONDARYNAMENODE_MEM_MB=${NAMENODE_MEM_MB}
54 | 
55 |   cat << EOF >> ${HADOOP_CONF_DIR}/hadoop-env.sh
56 | 
57 | # Increase the maximum NameNode / SecondaryNameNode heap.
58 | HADOOP_NAMENODE_OPTS="-Xmx${NAMENODE_MEM_MB}m \${HADOOP_NAMENODE_OPTS}"
59 | HADOOP_SECONDARYNAMENODE_OPTS="-Xmx${SECONDARYNAMENODE_MEM_MB}m \${HADOOP_SECONDARYNAMENODE_OPTS}"
60 | EOF
61 | 
62 |   # Increase maximum number of files for HDFS
63 |   MAX_FILES=16384
64 |   ulimit -n ${MAX_FILES}
65 |   cat << EOF > /etc/security/limits.d/hadoop.conf
66 | ${HDFS_ADMIN} hard nofile ${MAX_FILES}
67 | ${HDFS_ADMIN} soft nofile ${MAX_FILES}
68 | EOF
69 | 
70 |   export HDFS_DATA_DIRS="${HDFS_DATA_DIRS// /,}"
71 | 
72 |   bdconfig merge_configurations \
73 |       --configuration_file ${HADOOP_CONF_DIR}/hdfs-site.xml \
74 |       --source_configuration_file hdfs-template.xml \
75 |       --resolve_environment_variables \
76 |       --create_if_absent \
77 |       --clobber
78 | fi
79 | 


--------------------------------------------------------------------------------
/libexec/install_and_configure_bigquery_connector.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2013 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Downloads and installs the relevant bigquery-connector-<version>.jar.
16 | # Also configures it for use with hadoop.
17 | 
18 | set -e
19 | 
20 | if (( ${INSTALL_BIGQUERY_CONNECTOR} )); then
21 |   if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then
22 |     . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh"
23 |   fi
24 |   if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \
25 |       [[ -n "${HADOOP_PREFIX}" ]]; then
26 |     LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}"
27 |   else
28 |     LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib"
29 |   fi
30 | 
31 | 
32 |   # Grab the connector jarfile, add it to installation /lib directory.
33 |   JARNAME=$(grep -o '[^/]*\.jar' <<< ${BIGQUERY_CONNECTOR_JAR})
34 |   LOCAL_JAR="${LIB_JARS_DIR}/${JARNAME}"
35 | 
36 |   download_bd_resource "${BIGQUERY_CONNECTOR_JAR}" "${LOCAL_JAR}"
37 | 
38 |   chown hadoop:hadoop ${LOCAL_JAR}
39 | 
40 |   echo "export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:${LOCAL_JAR}" \
41 |       >> ${HADOOP_CONF_DIR}/hadoop-env.sh
42 | 
43 |   bdconfig merge_configurations \
44 |       --configuration_file ${HADOOP_CONF_DIR}/mapred-site.xml \
45 |       --source_configuration_file bq-mapred-template.xml \
46 |       --resolve_environment_variables \
47 |       --create_if_absent \
48 |       --noclobber
49 | 
50 |   chown -R hadoop:hadoop ${HADOOP_CONF_DIR}
51 | fi
52 | 


--------------------------------------------------------------------------------
/libexec/install_and_configure_gcs_connector.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.D
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Downloads and installs the relevant gcs-connector-<version>.jar.
16 | # Also configures it for use with hadoop.
17 | 
18 | if (( ${INSTALL_GCS_CONNECTOR} )) ; then
19 | 
20 |   if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then
21 |     . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh"
22 |   fi
23 |   if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \
24 |       [[ -n "${HADOOP_PREFIX}" ]]; then
25 |     LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}"
26 |   else
27 |     LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib"
28 |   fi
29 | 
30 |   # Grab the connector jarfile, add it to installation /lib directory.
31 |   JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR})
32 |   LOCAL_JAR="${LIB_JARS_DIR}/${JARNAME}"
33 | 
34 |   download_bd_resource "${GCS_CONNECTOR_JAR}" "${LOCAL_JAR}"
35 | 
36 |   echo "export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:${LOCAL_JAR}" \
37 |       >> ${HADOOP_CONF_DIR}/hadoop-env.sh
38 | 
39 |   if (( ${ENABLE_NFS_GCS_FILE_CACHE} )); then
40 |     export GCS_METADATA_CACHE_TYPE='FILESYSTEM_BACKED'
41 |     export GCS_FILE_CACHE_DIRECTORY="$(get_nfs_mount_point)"
42 |   else
43 |     export GCS_METADATA_CACHE_TYPE='IN_MEMORY'
44 |     # For IN_MEMORY cache, this directory won't actually be used, but we set
45 |     # it to a sane default for easy manual experimentation of file caching.
46 |     export GCS_FILE_CACHE_DIRECTORY='/tmp/gcs_connector_metadata_cache'
47 |   fi
48 |   bdconfig merge_configurations \
49 |       --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \
50 |       --source_configuration_file gcs-core-template.xml \
51 |       --resolve_environment_variables \
52 |       --create_if_absent \
53 |       --noclobber
54 | 
55 |   # Install a script that can be used to cleanup filesystem-based GCS caches.
56 |   if [[ "$(hostname -s)" == "${MASTER_HOSTNAME}" \
57 |       && "${ENABLE_NFS_GCS_FILE_CACHE}" -ne 0 ]] ; then
58 |     setup_cache_cleaner
59 |   fi
60 | fi
61 | 


--------------------------------------------------------------------------------
/libexec/install_bdconfig.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Downloads and installs bdconfig and the xml templates
16 | 
17 | set -e
18 | 
19 | # Download and use bdconfig for xml configuration.
20 | if [[ ! -f "$(which bdconfig)" ]]; then
21 |   download_bd_resource "${BDCONFIG}" /tmp/bdconfig.tar.gz
22 |   mkdir -p /usr/local/share/google
23 |   tar -C /usr/local/share/google -xzf /tmp/bdconfig.tar.gz
24 |   ln -s /usr/local/share/google/bdconfig*/bdconfig /usr/local/bin
25 | fi
26 | 


--------------------------------------------------------------------------------
/libexec/install_java.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Installs the OpenJDK Java7 JRE using apt-get.
16 | 
17 | # Strip the debian mirrors to force only using the GCS mirrors. Not ideal for
18 | # production usage due to stripping security.debian.org, but reduces external
19 | # load for non-critical use cases.
20 | 
21 | if (( ${INSTALL_JDK_DEVEL} )); then
22 |   echo 'Installing JDK with compiler and tools'
23 |   install_application "openjdk-7-jdk" "java-1.7.0-openjdk-devel"
24 | else
25 |   echo 'Installing minimal JRE'
26 |   install_application "openjdk-7-jre-headless" "java-1.7.0-openjdk"
27 | fi
28 | 


--------------------------------------------------------------------------------
/libexec/mount_disks.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2013 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Mounts any attached persistent and ephemeral disks non-boot disks
16 | 
17 | set -e
18 | 
19 | # Get a list of disks from the metadata server.
20 | BASE_DISK_URL='http://metadata.google.internal/computeMetadata/v1/instance/disks/'
21 | MOUNT_TOOL_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/compute-image-packages/legacy/google-startup-scripts/usr/share/google/safe_format_and_mount'
22 | DISK_PATHS=$(curl_v1_metadata "${BASE_DISK_URL}")
23 | MOUNTED_DISKS=()
24 | 
25 | MOUNT_TOOL=/tmp/${MOUNT_TOOL_URL##*/}
26 | download_bd_resource ${MOUNT_TOOL_URL} ${MOUNT_TOOL}
27 | chmod a+x ${MOUNT_TOOL}
28 | 
29 | for DISK_PATH in ${DISK_PATHS}; do
30 |   # Use the metadata server to determine the official index/name of each disk.
31 |   DISK_NAME=$(curl_v1_metadata "${BASE_DISK_URL}${DISK_PATH}device-name")
32 |   DISK_INDEX=$(curl_v1_metadata "${BASE_DISK_URL}${DISK_PATH}index")
33 |   DISK_TYPE=$(curl_v1_metadata "${BASE_DISK_URL}${DISK_PATH}type")
34 | 
35 |   # Index '0' is the boot disk and is thus already mounted.
36 |   if [[ "${DISK_INDEX}" == '0' ]]; then
37 |     echo "Boot disk is ${DISK_NAME}; will not attempt to mount it."
38 |     continue
39 |   fi
40 | 
41 |   if [[ "${DISK_TYPE}" == 'EPHEMERAL' ]]; then
42 |     DISK_PREFIX='ed'
43 |   elif [[ "${DISK_TYPE}" == 'PERSISTENT' ]]; then
44 |     DISK_PREFIX='pd'
45 |   fi
46 | 
47 |   # The metadata-specified 'name' can be converted to a disk 'id' by prepending
48 |   # 'google-' and finding it under /dev/disk/by-id.
49 |   DISK_ID="/dev/disk/by-id/google-${DISK_NAME}"
50 |   echo "Resolved disk name '${DISK_NAME}' to expected path '${DISK_ID}'."
51 | 
52 |   # We will name the mount-point after the official 'disk index'; this means
53 |   # there will be no mounted disk with suffix '0' since '0' is the boot disk.
54 |   DATAMOUNT="/mnt/${DISK_PREFIX}${DISK_INDEX}"
55 |   mkdir -p ${DATAMOUNT}
56 |   MOUNTED_DISKS+=(${DATAMOUNT})
57 |   echo "Mounting '${DISK_ID}' under mount point '${DATAMOUNT}'..."
58 | 
59 |   ${MOUNT_TOOL} -m 'mkfs.ext4 -F' ${DISK_ID} ${DATAMOUNT}
60 | 
61 |   # Idempotently update /etc/fstab
62 |   if cut -d '#' -f 1 /etc/fstab | grep -qvw ${DATAMOUNT}; then
63 |     DISK_UUID=$(blkid ${DISK_ID} -s UUID -o value)
64 |     MOUNT_ENTRY=($(grep -w ${DATAMOUNT} /proc/mounts))
65 |     # Taken from /usr/share/google/safe_format_and_mount
66 |     MOUNT_OPTIONS='defaults,discard'
67 |     echo "UUID=${DISK_UUID} ${MOUNT_ENTRY[@]:1:2} ${MOUNT_OPTIONS} 0 2 \
68 |         # added by bdutil" >> /etc/fstab
69 |   fi
70 | done
71 | 
72 | # If disks are mounted use the first one to hold target of symlink /hadoop
73 | if (( ${#MOUNTED_DISKS[@]} )); then
74 |   MOUNTED_HADOOP_DIR=${MOUNTED_DISKS[0]}/hadoop
75 |   mkdir -p ${MOUNTED_HADOOP_DIR}
76 |   if [[ ! -d /hadoop ]]; then
77 |     ln -s ${MOUNTED_HADOOP_DIR} /hadoop
78 |   fi
79 | fi
80 | 


--------------------------------------------------------------------------------
/libexec/set_default_fs.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Sets the default file system for Hadoop.
16 | 
17 | set -e
18 | 
19 | # Set FS specific config variables
20 | if [[ "${DEFAULT_FS}" == 'gs' ]]; then
21 |   DEFAULT_FS_NAME="gs://${CONFIGBUCKET}/"
22 | elif [[ "${DEFAULT_FS}" == 'hdfs' ]]; then
23 |   DEFAULT_FS_NAME="${NAMENODE_URI}"
24 | fi
25 | 
26 | bdconfig set_property \
27 |     --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \
28 |     --name 'fs.default.name' \
29 |     --value ${DEFAULT_FS_NAME} \
30 |     --clobber
31 | 
32 | bdconfig set_property \
33 |     --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \
34 |     --name 'fs.defaultFS' \
35 |     --value ${DEFAULT_FS_NAME} \
36 |     --clobber
37 | 


--------------------------------------------------------------------------------
/libexec/setup_client_nfs.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | if (( ${INSTALL_GCS_CONNECTOR} )) && \
16 |    (( ${ENABLE_NFS_GCS_FILE_CACHE} )) ; then
17 |   # Set up the GCS_ADMIN user.
18 |   setup_gcs_admin
19 | 
20 |   install_application "nfs-common" "nfs-utils"
21 |   install_application "autofs"
22 | 
23 |   NFS_MOUNT_POINT="$(get_nfs_mount_point)"
24 |   NFS_EXPORT_POINT="$(get_nfs_export_point)"
25 | 
26 |   mkdir -p "${NFS_MOUNT_POINT}"
27 |   chown ${GCS_ADMIN}:${GCS_ADMIN} "${NFS_MOUNT_POINT}"
28 |   if ! grep -e "auto.hadoop_gcs_metadata_cache" /etc/auto.master ; then
29 |     echo "/- /etc/auto.hadoop_gcs_metadata_cache nobind" >> /etc/auto.master
30 |   fi
31 | 
32 |   MOUNT_STRING="/${NFS_MOUNT_POINT} -fstype=nfs,defaults,rw,hard,intr"
33 |   MOUNT_STRING="${MOUNT_STRING} ${GCS_CACHE_MASTER_HOSTNAME}:${NFS_EXPORT_POINT}"
34 |   echo "${MOUNT_STRING}" > /etc/auto.hadoop_gcs_metadata_cache
35 | 
36 |   if [[ -f /usr/lib/systemd/system/autofs.service ]] \
37 |       && which systemctl ; then
38 |     systemctl enable autofs
39 |   fi
40 | 
41 |   service autofs restart
42 | fi
43 | 


--------------------------------------------------------------------------------
/libexec/setup_hadoop_user.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Adds a new 'hadoop' user which will be used to run the hadoop servers.
16 | 
17 | set -e
18 | 
19 | mkdir -p /home/hadoop
20 | mkdir -p /home/hadoop/.ssh
21 | 
22 | if ! (id -u hadoop >& /dev/null); then
23 |   useradd --system --shell /bin/bash -M --home /home/hadoop --user-group hadoop
24 | fi
25 | 
26 | if skeleton_files=$(find /etc/skel/ -maxdepth 1 -type f); then
27 |   cp ${skeleton_files} /home/hadoop
28 | fi
29 | 
30 | chown -R hadoop:hadoop /home/hadoop
31 | 
32 | mkdir -p ~hadoop/.ssh
33 | chown -R hadoop:hadoop ~hadoop/.ssh/
34 | 
35 | if [[ -x $(which restorecon) ]]; then
36 |   restorecon -Rv /home
37 | fi
38 | 
39 | mkdir -p /var/log/hadoop
40 | chown hadoop:hadoop /var/log/hadoop
41 | 


--------------------------------------------------------------------------------
/libexec/setup_master_ssh.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2013 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Sets up ssh keys on the master and then uploads them to a GCS CONFIGBUCKET
16 | # for worker to later download.
17 | 
18 | set -e
19 | 
20 | mkdir -p /home/hadoop/.ssh/
21 | chmod 700 /home/hadoop/.ssh
22 | 
23 | PRIVATE_KEY_NAME='hadoop_master_id_rsa'
24 | PUBLIC_KEY_NAME="${PRIVATE_KEY_NAME}.pub"
25 | LOCAL_PUBLIC_KEY="/home/hadoop/.ssh/${PUBLIC_KEY_NAME}"
26 | REMOTE_PUBLIC_KEY="${BDUTIL_GCS_STAGING_DIR}/${PUBLIC_KEY_NAME}"
27 | LOCAL_PRIVATE_KEY="/home/hadoop/.ssh/${PRIVATE_KEY_NAME}"
28 | 
29 | ssh-keygen -N "" -f ${LOCAL_PRIVATE_KEY}
30 | 
31 | # Authorize ssh into self as well, in case the master is also a worker node.
32 | cat ${LOCAL_PUBLIC_KEY} >> /home/hadoop/.ssh/authorized_keys
33 | 
34 | echo "Host ${PREFIX}*" >> /home/hadoop/.ssh/config
35 | echo "  IdentityFile ${LOCAL_PRIVATE_KEY}" >> /home/hadoop/.ssh/config
36 | echo '  UserKnownHostsFile /dev/null' >> /home/hadoop/.ssh/config
37 | echo '  CheckHostIP no' >> /home/hadoop/.ssh/config
38 | echo '  StrictHostKeyChecking no' >> /home/hadoop/.ssh/config
39 | 
40 | gsutil cp ${LOCAL_PUBLIC_KEY} ${REMOTE_PUBLIC_KEY}
41 | 


--------------------------------------------------------------------------------
/libexec/setup_worker_ssh.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2013 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Downloads shared ssh keys previously generated by the hadoop master and
16 | # uses them to configure intra-cluster ssh access.
17 | 
18 | set -e
19 | 
20 | mkdir -p ~hadoop/.ssh/
21 | 
22 | PRIVATE_KEY_NAME='hadoop_master_id_rsa'
23 | PUBLIC_KEY_NAME="${PRIVATE_KEY_NAME}.pub"
24 | LOCAL_PUBLIC_KEY="/home/hadoop/.ssh/${PUBLIC_KEY_NAME}"
25 | REMOTE_PUBLIC_KEY="${BDUTIL_GCS_STAGING_DIR}/${PUBLIC_KEY_NAME}"
26 | 
27 | gsutil cp ${REMOTE_PUBLIC_KEY} ${LOCAL_PUBLIC_KEY}
28 | cat ${LOCAL_PUBLIC_KEY} >> ~hadoop/.ssh/authorized_keys
29 | 
30 | echo "Host ${PREFIX}*" >> ~hadoop/.ssh/config
31 | echo '  UserKnownHostsFile /dev/null' >> ~hadoop/.ssh/config
32 | echo '  CheckHostIP no' >> ~hadoop/.ssh/config
33 | echo '  StrictHostKeyChecking no' >> ~hadoop/.ssh/config
34 | 
35 | chown -R hadoop:hadoop ~hadoop/.ssh/
36 | chmod 700 ~hadoop/.ssh
37 | 


--------------------------------------------------------------------------------
/libexec/start_hadoop.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Starts relevant hadoop daemon servers as the 'hadoop' user.
16 | set -e
17 | 
18 | source hadoop_helpers.sh
19 | 
20 | HADOOP_PORTS=(50010 50020 50030 50060 50070 50075 50090)
21 | 
22 | cd ${HADOOP_INSTALL_DIR}
23 | 
24 | # Test for sshability to workers.
25 | for NODE in ${WORKERS[@]}; do
26 |   sudo -u hadoop ssh ${NODE} "exit 0"
27 | done
28 | 
29 | # Wait for our ports to be free, but keep running even if not.
30 | wait_until_ports_free_and_report "${HADOOP_PORTS[@]}" || true
31 | 
32 | # Start namenode and jobtracker
33 | if (( ${ENABLE_HDFS} )); then
34 |   start_with_retry_namenode start_dfs_hadoop_1 &
35 | fi
36 | start_with_retry_jobtracker &
37 | for SUBPROC in $(jobs -p); do
38 |   wait ${SUBPROC}
39 | done
40 | 
41 | check_filesystem_accessibility
42 | 


--------------------------------------------------------------------------------
/libexec/start_hadoop2.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Starts relevant hadoop daemon servers as the 'hadoop' user.
16 | 
17 | set -e
18 | 
19 | source hadoop_helpers.sh
20 | 
21 | HADOOP_PORTS=(8088 50010 50020 50070 50090)
22 | 
23 | cd ${HADOOP_INSTALL_DIR}
24 | 
25 | # Test for sshability to workers.
26 | for NODE in ${WORKERS[@]}; do
27 |   sudo -u hadoop ssh ${NODE} "exit 0"
28 | done
29 | 
30 | # Wait for our ports to be free, but keep running even if not.
31 | wait_until_ports_free_and_report "${HADOOP_PORTS[@]}" || true
32 | 
33 | if (( ${ENABLE_HDFS} )); then
34 |   # Start namenode and jobtracker
35 |   start_with_retry_namenode start_dfs_hadoop_2
36 | 
37 |   if [[ "${DEFAULT_FS}" == 'hdfs' ]]; then
38 |     # Set up HDFS /tmp and /user dirs
39 |     initialize_hdfs_dirs
40 |   fi
41 | fi
42 | 
43 | # Start up resource and node managers
44 | sudo -u hadoop ./sbin/start-yarn.sh
45 | service hadoop-mapreduce-historyserver start
46 | 
47 | check_filesystem_accessibility
48 | 


--------------------------------------------------------------------------------
/platforms/cdh/README.md:
--------------------------------------------------------------------------------
 1 | Deploying Cloudera Data Hub (CDH) on Google Compute Engine
 2 | ==========================================================
 3 | 
 4 | Basic Usage
 5 | -----------
 6 | 
 7 | This plugin replaces the vanilla Apache binary tarballs with [Cloudera Data Hub](http://www.cloudera.com/content/cloudera/en/products-and-services/cdh.html) packages. Cluster configuration is the same as in core bdutil.
 8 | 
 9 |     ./bdutil -e platforms/cdh/cdh_env.sh deploy
10 | 
11 | Or alternatively, using shorthand syntax:
12 | 
13 |     ./bdutil -e cdh deploy
14 | 
15 | Status
16 | ------
17 | 
18 | This plugin is currently considered experimental and not officially supported.
19 | Contributions are welcome.
20 | 


--------------------------------------------------------------------------------
/platforms/cdh/cdh-core-template.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" ?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <configuration>
 4 |   <property>
 5 |     <name>hadoop.proxyuser.hue.hosts</name>
 6 |     <value>*</value>
 7 |   </property>
 8 |   <property>
 9 |     <name>hadoop.proxyuser.hue.groups</name>
10 |     <value>*</value>
11 |   </property>
12 |   <property>
13 |     <name>hadoop.proxyuser.oozie.hosts</name>
14 |     <value>*</value>
15 |   </property>
16 |   <property>
17 |     <name>hadoop.proxyuser.oozie.groups</name>
18 |     <value>*</value>
19 |   </property>
20 | </configuration>
21 | 


--------------------------------------------------------------------------------
/platforms/cdh/cdh_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Extension file for deploying CDH with bdutil
16 | 
17 | # Requies Hadoop 2 libraries (for recent versions at least).
18 | import_env hadoop2_env.sh
19 | 
20 | # Change these.
21 | CDH_VERSION=5
22 | # Components are installed / started in the order they are listed.
23 | MASTER_COMPONENTS="hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode
24 |     hadoop-yarn-resourcemanager hadoop-mapreduce-historyserver
25 |     hive-metastore hive pig oozie hue"
26 | DATANODE_COMPONENTS="hadoop-hdfs-datanode hadoop-yarn-nodemanager
27 |     hadoop-mapreduce"
28 | 
29 | # Install JDK with compiler/tools instead of just the minimal JRE.
30 | INSTALL_JDK_DEVEL=true
31 | 
32 | # Hardware configuration.
33 | NUM_WORKERS=4
34 | WORKER_ATTACHED_PDS_SIZE_GB=1500
35 | MASTER_ATTACHED_PD_SIZE_GB=1500
36 | 
37 | # Don't change these.
38 | HADOOP_CONF_DIR='/etc/hadoop/conf'
39 | HADOOP_INSTALL_DIR='/usr/lib/hadoop'
40 | DEFAULT_FS='hdfs'
41 | UPLOAD_FILES+=('platforms/cdh/cdh-core-template.xml')
42 | USE_ATTACHED_PDS=true
43 | 
44 | COMMAND_GROUPS+=(
45 |   "deploy-cdh:
46 |      libexec/mount_disks.sh
47 |      libexec/install_java.sh
48 |      platforms/cdh/install_cdh.sh
49 |      libexec/install_bdconfig.sh
50 |      libexec/configure_hadoop.sh
51 |      libexec/install_and_configure_gcs_connector.sh
52 |      libexec/configure_hdfs.sh
53 |      libexec/set_default_fs.sh
54 |      platforms/cdh/configure_cdh.sh"
55 | 
56 |   "restart_services:
57 |      platforms/restart_services.sh"
58 | )
59 | 
60 | COMMAND_STEPS=(
61 |   'deploy-cdh,deploy-cdh'
62 |   'deploy-master-nfs-setup,*'
63 |   'deploy-client-nfs-setup,deploy-client-nfs-setup'
64 |   'restart_services,restart_services'
65 | )
66 | 


--------------------------------------------------------------------------------
/platforms/cdh/configure_cdh.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Copyright 2014 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #      http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS-IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # Misc configurations for components not installed elsewhere.
17 | # Not necessarily CDH specific.
18 | 
19 | # Use FQDNs
20 | grep ${HOSTNAME} -lR ${HADOOP_CONF_DIR} \
21 |   | xargs -r sed -i "s/${HOSTNAME}/$(hostname --fqdn)/g"
22 | 
23 | # Configure Hive Metastore
24 | if dpkg -s hive-metastore > /dev/null; then
25 |   # Configure Hive metastorea
26 |   bdconfig set_property \
27 |       --configuration_file /etc/hive/conf/hive-site.xml \
28 |       --name 'hive.metastore.uris' \
29 |       --value "thrift://$(hostname --fqdn):9083" \
30 |       --clobber
31 | fi
32 | 
33 | # Configure Hue
34 | if dpkg -s hue > /dev/null; then
35 |   # Replace localhost with hostname.
36 |   sed -i "s/#*\([^#]*=.*\)localhost/\1$(hostname --fqdn)/" /etc/hue/conf/hue.ini
37 | fi
38 | 
39 | # Configure Oozie
40 | if dpkg -s oozie > /dev/null; then
41 |   sudo -u oozie /usr/lib/oozie/bin/ooziedb.sh create -run
42 | 
43 |   # Try to enable gs:// paths
44 |   bdconfig set_property \
45 |       --configuration_file /etc/oozie/conf/oozie-site.xml \
46 |       --name 'oozie.service.HadoopAccessorService.supported.filesystems' \
47 |       --value 'hdfs,gs,webhdfs,hftp' \
48 |       --clobber
49 | fi
50 | 
51 | # Enable WebHDFS
52 | bdconfig set_property \
53 |     --configuration_file ${HADOOP_CONF_DIR}/hdfs-site.xml \
54 |     --name 'dfs.webhdfs.enabled' \
55 |     --value true \
56 |     --clobber
57 | 
58 | # Enable Hue / Oozie impersonation
59 | bdconfig merge_configurations \
60 |     --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \
61 |     --source_configuration_file cdh-core-template.xml \
62 |     --resolve_environment_variables \
63 |     --clobber
64 | 


--------------------------------------------------------------------------------
/platforms/cdh/install_cdh.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Copyright 2014 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #      http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS-IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | #TODO(user) support other Linux distributions.
17 | ARCHIVE_URL="http://archive.cloudera.com/cdh${CDH_VERSION}/debian/jessie/amd64/cdh"
18 | cat << EOF > /etc/apt/sources.list.d/cloudera.list
19 | deb ${ARCHIVE_URL} jessie-cdh${CDH_VERSION} contrib
20 | deb-src ${ARCHIVE_URL} jessie-cdh${CDH_VERSION} contrib
21 | EOF
22 | # TODO(user): fix insecure download of apt-key.
23 | download_bd_resource ${ARCHIVE_URL}/archive.key /tmp/cloudera.key
24 | apt-key add /tmp/cloudera.key
25 | 
26 | apt-get update
27 | 
28 | if [[ $(hostname -s) == ${MASTER_HOSTNAME} ]]; then
29 |   COMPONENTS="${MASTER_COMPONENTS}"
30 | else
31 |   COMPONENTS="${DATANODE_COMPONENTS}"
32 | fi
33 | 
34 | for COMPONENT in ${COMPONENTS}; do
35 |   if ! install_application ${COMPONENT}; then
36 |     # Check that it was actually installed as Services often fail to start.
37 |     dpkg -s ${COMPONENT}
38 |   fi
39 |   # Stop installed services:
40 |   if [[ -x "/etc/init.d/${COMPONENT}" ]]; then
41 |     service ${COMPONENT} stop
42 |   fi
43 | done
44 | 


--------------------------------------------------------------------------------
/platforms/hdp/TEST.md:
--------------------------------------------------------------------------------
 1 | ## Prep
 2 | 
 3 | ```
 4 | CONFIGBUCKET=hdp-00
 5 | PROJECT=hdp-00
 6 | switches="-b ${CONFIGBUCKET} -p ${PROJECT}"
 7 | 
 8 | # add this to make it a smaller test than the defaults
 9 | switches+="
10 |     --master_attached_pd_size_gb 100
11 |     --worker_attached_pds_size_gb 100
12 |     -n 1
13 |     -m n1-standard-2"
14 | 
15 | 
16 | bdutil="./bdutil ${switches}"
17 | ```
18 | 
19 | ## Test ambari_env.sh
20 | 
21 | ```
22 | environment=platforms/hdp/ambari_env.sh
23 | bdutil="${bdutil} -e ${environment}"
24 | 
25 | ## deploy
26 | ${bdutil} deploy
27 | 
28 | ## test
29 | ${bdutil} shell < ./hadoop-validate-setup.sh
30 | ${bdutil} shell < ./hadoop-validate-gcs.sh
31 | ${bdutil} shell < ./extensions/querytools/hive-validate-setup.sh
32 | ${bdutil} shell < ./extensions/querytools/pig-validate-setup.sh
33 | #${bdutil} shell < ./extensions/spark/spark-validate-setup.sh
34 | 
35 | ## delete
36 | ${bdutil} delete
37 | ```
38 | 
39 | 
40 | ## Test ambari_manual_env.sh
41 | 
42 | ```
43 | environment=platforms/hdp/ambari_manual_env.sh
44 | bdutil="${bdutil} -e ${environment}"
45 | 
46 | ## deploy
47 | ${bdutil} deploy
48 | 
49 | ## test
50 | # need to add an automated test here:
51 |     ${bdutil} shell # do something here like check the appropriate number of hosts in /api/v1/hosts
52 | 
53 | ## delete
54 | ${bdutil} delete
55 | 
56 | ```
57 | 
58 | ## Test re-using disks across multiple deployments of same instance count
59 | 
60 | ```
61 | environment=platforms/hdp/ambari_env.sh
62 | bdutil="${bdutil} -e ${environment}"
63 | unset CREATE_ATTACHED_PDS_ON_DEPLOY
64 | unset DELETE_ATTACHED_PDS_ON_DELETE
65 | 
66 | ## create
67 | export CREATE_ATTACHED_PDS_ON_DEPLOY=true
68 | ${bdutil} deploy
69 | 
70 | ## generate some data onto HDFS, and dont’ delete it
71 | echo "hadoop fs -mkdir redeploy-validation.tmp" | ${bdutil} shell
72 | ## if you want more data than that:
73 | #${bdutil} -u hadoop-validate-setup.sh run_command -- \
74 | #    sudo -u "$(whoami)" TERA_CLEANUP_SKIP=true TERA_GEN_NUM_RECORDS=100000 ./hadoop-validate-setup.sh
75 | 
76 | ## check that the ‘validate_...’ dir is there
77 | echo "hadoop fs -ls" | ${bdutil} shell
78 | 
79 | ## delete the cluster but keep disks
80 | export DELETE_ATTACHED_PDS_ON_DELETE=false
81 | ${bdutil} delete
82 | 
83 | ## create with existing disks
84 | export CREATE_ATTACHED_PDS_ON_DEPLOY=false
85 | ${bdutil} deploy
86 | 
87 | ## check that the ‘validate_...’ dir is there
88 | echo "hadoop fs -ls" | ${bdutil} shell
89 | 
90 | ## delete everything to cleanup this testing
91 | export DELETE_ATTACHED_PDS_ON_DELETE=true
92 | ${bdutil} delete
93 | ```
94 | 


--------------------------------------------------------------------------------
/platforms/hdp/ambari.conf:
--------------------------------------------------------------------------------
 1 | ########################################################################
 2 | ########################################################################
 3 | ## This is the base configuration file for the                        ##
 4 | ## Hortonworks Data Platform (HDP) extension to Google's `bdutil`     ##
 5 | ##                                                                    ##
 6 | ## Most of the values are commented out and just shown here for       ##
 7 | ## completeness, together with their default value.                   ##
 8 | ########################################################################
 9 | ########################################################################
10 | 
11 | ## ambari.conf
12 | ## Provides configuration for 'bdutil' installations of Ambari
13 | 
14 | 
15 | ## bdutil setting overrides
16 | ## For further details see:
17 | ##   `bdutil_env.sh`
18 | ##   https://cloud.google.com/hadoop/setting-up-a-hadoop-cluster
19 | 
20 | ## Your Google Cloud Platform configbucket & project
21 | ## Must be set here,
22 | ##   or in `bdutil_env.sh`
23 | ##   or with the -b & -p switches to `bdutil`
24 | #CONFIGBUCKET=""
25 | #PROJECT=""
26 | 
27 | ## the region/zone to deploy into
28 | #GCE_ZONE='us-central1-a'
29 | 
30 | ## Number of worker nodes. Total nodes will be NUM_WORKERS+1
31 | #NUM_WORKERS=4
32 | 
33 | ## Google Compute Engine machine type
34 | #GCE_MACHINE_TYPE='n1-standard-4'
35 | 
36 | ## Amount of storage to attach
37 | #WORKER_ATTACHED_PDS_SIZE_GB=1500
38 | #MASTER_ATTACHED_PD_SIZE_GB=1500
39 | 
40 | ## Amount of storage to give the boot disk.
41 | ## A full HDP stack starts to fill up 10 GB.
42 | #MASTER_BOOT_DISK_SIZE_GB=50
43 | #WORKER_BOOT_DISK_SIZE_GB=50
44 | 
45 | ## Storage types (pd-standard or pd-ssd)
46 | #WORKER_ATTACHED_PDS_TYPE='pd-standard'
47 | #MASTER_ATTACHED_PD_TYPE='pd-standard'
48 | 
49 | 
50 | ## HDP settings
51 | ## ============
52 | 
53 | ## If 'true', URLs for web interfaces, such as the jobtracker will be
54 | ## linked from Ambari with the public IP.
55 | ## Default is false. You will need to SSH to reach the host in this case.
56 | #AMBARI_PUBLIC=false
57 | 
58 | #AMBARI_VERSION='2.2.1.0'
59 | #AMBARI_REPO=http://public-repo-1.hortonworks.com/ambari/centos6/${AMBARI_VERSION:0:1}.x/updates/${AMBARI_VERSION}/ambari.repo
60 | 
61 | ## The distribution to install on your cluster.
62 | #AMBARI_STACK='HDP'
63 | #AMBARI_STACK_VERSION='2.4'
64 | 
65 | ## The components of that distribution to install on the cluster.
66 | ## Default is all but Kerberos, Apache Knox, Apache Ranger, and Hortonworks
67 | # SmartSense.
68 | #AMBARI_SERVICES="ACCUMULO AMBARI_METRICS ATLAS FALCON FLUME GANGLIA HBASE HDFS
69 | #    HIVE KAFKA MAHOUT MAPREDUCE2 OOZIE PIG SLIDER SPARK SQOOP STORM TEZ YARN
70 | #    ZOOKEEPER"
71 | 
72 | ## You can run with as little as:
73 | #AMBARI_SERVICES='HDFS MAPREDUCE2 YARN'
74 | 
75 | ## If using HDP 2.2, these are the supported services:
76 | #AMBARI_SERVICES="AMBARI_METRICS FALCON FLUME GANGLIA HBASE HDFS HIVE KAFKA
77 | #    MAPREDUCE2 OOZIE PIG SLIDER SPARK SQOOP STORM TEZ YARN ZOOKEEPER"
78 | 
79 | ## If you want to use a different JAVA
80 | ## Default is set by alternatives to 'openjdk-7-devel'
81 | #JAVA_HOME="/etc/alternatives/java_sdk"
82 | 


--------------------------------------------------------------------------------
/platforms/hdp/ambari_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | # ambari_env.sh
17 | #
18 | # Extension providing a cluster with Apache Ambari installed and automatically
19 | # provisions and configures the cluster's software. This installs and configures
20 | # the GCS connector.
21 | 
22 | ########################################################################
23 | ## There should be nothing to edit here, use ambari.conf              ##
24 | ########################################################################
25 | 
26 | # Import the base Ambari installation
27 | import_env platforms/hdp/ambari_manual_env.sh
28 | 
29 | # The distribution to install on your cluster.
30 | AMBARI_STACK="${AMBARI_STACK:-HDP}"
31 | AMBARI_STACK_VERSION="${AMBARI_STACK_VERSION:-2.4}"
32 | 
33 | ## The components of that distribution to install on the cluster.
34 | # Default is all but Kerberos, Apache Knox, Apache Ranger, and Hortonworks
35 | # SmartSense.
36 | AMBARI_SERVICES="${AMBARI_SERVICES:-ACCUMULO AMBARI_METRICS ATLAS FALCON FLUME
37 |     GANGLIA HBASE HDFS HIVE KAFKA MAHOUT MAPREDUCE2 OOZIE PIG SLIDER SPARK SQOOP
38 |     STORM TEZ YARN ZOOKEEPER}"
39 | 
40 | 
41 | if [[ -n "${BDUTIL_DIR}" ]]; then
42 |   UPLOAD_FILES+=(
43 |     "${BDUTIL_DIR}/platforms/hdp/create_blueprint.py"
44 |   )
45 | fi
46 | 
47 | COMMAND_GROUPS+=(
48 |   "install-ambari-components:
49 |      platforms/hdp/install_ambari_components.sh
50 |   "
51 | )
52 | 
53 | COMMAND_STEPS+=(
54 |   'install-ambari-components,*'
55 |   'install-gcs-connector-on-ambari,install-gcs-connector-on-ambari'
56 |   'update-ambari-config,*'
57 | )
58 | 


--------------------------------------------------------------------------------
/platforms/hdp/ambari_functions.sh:
--------------------------------------------------------------------------------
  1 | ## Tools for interacting with Ambari SERVER
  2 | 
  3 | AMBARI_TIMEOUT=${AMBARI_TIMEOUT:-3600}
  4 | POLLING_INTERVAL=${POLLING_INTERVAL:-10}
  5 | 
  6 | 
  7 | function ambari_wait() {
  8 |   local condition="$1"
  9 |   local goal="$2"
 10 |   local failed="FAILED"
 11 |   local limit=$(( ${AMBARI_TIMEOUT} / ${POLLING_INTERVAL} + 1 ))
 12 | 
 13 |   for (( i=0; i<${limit}; i++ )); do
 14 |     local status=$(bash -c "${condition}")
 15 |     echo "ambari_wait status: ${status}" >&2
 16 |     if [[ "${status}" == "${goal}" ]]; then
 17 |       break
 18 |     elif [[ "${status}" =~ "${failed}" ]]; then
 19 |       echo "Ambari operiation failed with status: ${status}" >&2
 20 |       return 1
 21 |     fi
 22 |     sleep ${POLLING_INTERVAL}
 23 |   done
 24 | 
 25 |   if [[ ${i} == ${limit} ]]; then
 26 |     echo "ambari_wait did not finish within" \
 27 |         "'${AMBARI_TIMEOUT}' seconds. Exiting." >&2
 28 |     return 1
 29 |   fi
 30 | }
 31 | 
 32 | # Only useful during a fresh install where we expect no failures
 33 | # Will not work if any requested TIMEDOUT/ABORTED
 34 | function ambari_wait_requests_completed() {
 35 |       # Avoid race conditions with requests.
 36 |       sleep 10
 37 |       AMBARI_CLUSTER=$(get_ambari_cluster_name)
 38 |       # Poll for completion
 39 |       ambari_wait "${AMBARI_CURL} ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/requests \
 40 |             | grep -Eo 'http://.*/requests/[0-9]+' \
 41 |             | xargs ${AMBARI_CURL} \
 42 |             | grep request_status \
 43 |             | grep -Eo '\"[A-Z_]+\"' \
 44 |             | sort | uniq | paste -sd'+'" \
 45 |             '"COMPLETED"'
 46 | }
 47 | 
 48 | function ambari_service_stop() {
 49 |     AMBARI_CLUSTER=$(get_ambari_cluster_name)
 50 |     if [[ -z "${SERVICE}" ]]; then
 51 |         echo "Taking no action as no SERVICE was defined. You may specific ALL to stop all Services."
 52 |     else
 53 |         AMBARI_REQUEST='{"RequestInfo": {"context" :"Stop '${SERVICE}' via REST"}, "Body": {"ServiceInfo": {"state": "INSTALLED"}}}'
 54 |         if [[ "${SERVICE}" == "ALL" ]]; then
 55 |             ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/
 56 |         else
 57 |             ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/${SERVICE}
 58 |         fi
 59 |     fi
 60 | }
 61 | 
 62 | function ambari_service_start() {
 63 |     AMBARI_CLUSTER=$(get_ambari_cluster_name)
 64 |     if [[ -z "${SERVICE}" ]]; then
 65 |         echo "Taking no action as no SERVICE was defined"
 66 |     else
 67 |         AMBARI_REQUEST='{"RequestInfo": {"context" :"Start '${SERVICE}' via REST"}, "Body": {"ServiceInfo": {"state": "STARTED"}}}'
 68 |         if [[ "${SERVICE}" == 'ALL' ]]; then
 69 |             ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/
 70 |         else
 71 |             ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/${SERVICE}
 72 |         fi
 73 |     fi
 74 | }
 75 | 
 76 | # set SERVICE=ALL to restart all services
 77 | function ambari_service_restart() {
 78 |     ambari_service_stop
 79 |     ambari_wait_requests_completed
 80 |     ambari_service_start
 81 |     ambari_wait_requests_completed
 82 | }
 83 | 
 84 | function ambari_restart_all_services() {
 85 |     AMBARI_CLUSTER=$(get_ambari_cluster_name)
 86 |     SERVICES=($(${AMBARI_CURL} ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services \
 87 |         | grep -Eo 'http://.*/services/[^\"]+'))
 88 | 
 89 |     for STATE in 'INSTALLED' 'STARTED'; do
 90 |       ${AMBARI_CURL} -X PUT -d "{\"ServiceInfo\":{\"state\":\"${STATE}\"}}" "${SERVICES[@]}"
 91 |       ambari_wait_requests_completed
 92 |     done
 93 | }
 94 | 
 95 | # Make variable substitutions in a json file.
 96 | function subsitute_bash_in_json() {
 97 |   local custom_configuration_file="$1"
 98 |   loginfo "Replacing variables in ${custom_configuration_file}."
 99 |   perl -pi -e 's/\$\{([^\}]*)\}/$ENV{$1}/e' ${custom_configuration_file}
100 | }
101 | 
102 | # Print out name of first (and presumably only) cluster in Ambari.
103 | function get_ambari_cluster_name() {
104 |   ${AMBARI_CURL} ${AMBARI_API}/clusters \
105 |       | sed -n 's/.*cluster_name" : "\(\S*\)".*/\1/p' \
106 |       | head -1
107 | }
108 | 


--------------------------------------------------------------------------------
/platforms/hdp/ambari_manual_env.sh:
--------------------------------------------------------------------------------
  1 | # Copyright 2014 Google Inc. All Rights Reserved.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS-IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | 
 16 | # ambari_manual_env.sh
 17 | #
 18 | # Extension installing Apache Ambari on the cluster allowing the user to
 19 | # manually log in and provision and configure the clusters software.
 20 | # This installs but does not configure the GCS connector.
 21 | 
 22 | ########################################################################
 23 | ## There should be nothing to edit here, use ambari.conf              ##
 24 | ########################################################################
 25 | 
 26 | # Remove core bdutil upload files.
 27 | UPLOAD_FILES=()
 28 | 
 29 | # Import hadoop2_env.sh just for the GCS_CONNECTOR_JAR.
 30 | import_env hadoop2_env.sh
 31 | 
 32 | # Default to 4 workers plus master for good spreading of master daemons.
 33 | NUM_WORKERS=4
 34 | # Use CentOS instead of Debian.
 35 | GCE_IMAGE=''
 36 | GCE_IMAGE_FAMILY='centos-6'
 37 | GCE_IMAGE_PROJECT='centos-cloud'
 38 | 
 39 | # Create attached storage
 40 | USE_ATTACHED_PDS=true
 41 | # Since we'll be using HDFS as the default file system, size disks to grant
 42 | # maximum I/O per VM.
 43 | WORKER_ATTACHED_PDS_SIZE_GB=1500
 44 | MASTER_ATTACHED_PD_SIZE_GB=1500
 45 | 
 46 | ## Amount of storage to give the boot disk.
 47 | ## A full HDP stack starts to fill up 10 GB.
 48 | MASTER_BOOT_DISK_SIZE_GB=${MASTER_BOOT_DISK_SIZE_GB:-50}
 49 | WORKER_BOOT_DISK_SIZE_GB=${MASTER_BOOT_DISK_SIZE_GB:-50}
 50 | 
 51 | # Install the full Java JDK. Most services need it
 52 | INSTALL_JDK_DEVEL=true
 53 | JAVA_HOME=/etc/alternatives/java_sdk
 54 | 
 55 | ## import configuration overrides
 56 | import_env platforms/hdp/ambari.conf
 57 | 
 58 | ## Version of Ambari and location of YUM package repository
 59 | AMBARI_VERSION="${AMBARI_VERSION:-2.2.1.0}"
 60 | AMBARI_REPO=${AMBARI_REPO:-http://public-repo-1.hortonworks.com/ambari/centos6/${AMBARI_VERSION:0:1}.x/updates/${AMBARI_VERSION}/ambari.repo}
 61 | 
 62 | ## If 'true', URLs for web interfaces, such as the jobtracker will below
 63 | ## linked from Ambari with the public IP.
 64 | ## Default is false. You will need to SSH to reach the host in this case.
 65 | AMBARI_PUBLIC=${AMBARI_PUBLIC:-false}
 66 | normalize_boolean 'AMBARI_PUBLIC'
 67 | 
 68 | # HDFS will always be the default file system (even if changed here), because
 69 | # many services require it to be. This is purely advisory.
 70 | DEFAULT_FS='hdfs'
 71 | 
 72 | GCS_CACHE_CLEANER_LOG_DIRECTORY="/var/log/hadoop/${GCS_CACHE_CLEANER_USER}"
 73 | GCS_CACHE_CLEANER_LOGGER='INFO,RFA'
 74 | HADOOP_CONF_DIR="/etc/hadoop/conf"
 75 | HADOOP_INSTALL_DIR="/usr/local/lib/hadoop"
 76 | 
 77 | # For interacting with Ambari Server API
 78 | AMBARI_API="http://localhost:8080/api/v1"
 79 | AMBARI_CURL='curl -fsSu admin:admin -H X-Requested-By:ambari'
 80 | MASTER_UI_PORTS=('8080')
 81 | 
 82 | import_env platforms/hdp/ambari_functions.sh
 83 | 
 84 | if [[ -n "${BDUTIL_DIR}" ]]; then
 85 |   UPLOAD_FILES+=(
 86 |     "${BDUTIL_DIR}/libexec/hadoop_helpers.sh"
 87 |     "${BDUTIL_DIR}/platforms/hdp/configuration.json"
 88 |     "${BDUTIL_DIR}/platforms/hdp/resources/public-hostname-gcloud.sh"
 89 |     "${BDUTIL_DIR}/platforms/hdp/resources/thp-disable.sh"
 90 |   )
 91 | fi
 92 | 
 93 | COMMAND_GROUPS+=(
 94 |   "ambari-setup:
 95 |      libexec/mount_disks.sh
 96 |      libexec/install_java.sh
 97 |      libexec/setup_hadoop_user.sh
 98 |      platforms/hdp/install_ambari.sh
 99 |   "
100 | 
101 |   "install-gcs-connector-on-ambari:
102 |      platforms/hdp/install_gcs_connector_on_ambari.sh
103 |   "
104 | 
105 |   "update-ambari-config:
106 |      platforms/hdp/update_ambari_config.sh
107 |   "
108 | )
109 | 
110 | COMMAND_STEPS=(
111 |   'ambari-setup,ambari-setup'
112 |   'deploy-master-nfs-setup,*'
113 |   'deploy-client-nfs-setup,deploy-client-nfs-setup'
114 | )
115 | 


--------------------------------------------------------------------------------
/platforms/hdp/ambari_manual_post_deploy_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | # ambari_env.sh
17 | #
18 | # Extension providing a cluster with Apache Ambari installed and automatically
19 | # provisions and configures the cluster's software. This installs and configures
20 | # the GCS connector.
21 | 
22 | ########################################################################
23 | ## There should be nothing to edit here, use ambari.conf              ##
24 | ########################################################################
25 | 
26 | # Import the base Ambari installation
27 | import_env platforms/hdp/ambari_manual_env.sh
28 | 
29 | COMMAND_STEPS=(
30 |   'install-gcs-connector-on-ambari,install-gcs-connector-on-ambari'
31 |   'update-ambari-config,*'
32 | )
33 | 


--------------------------------------------------------------------------------
/platforms/hdp/configuration.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "configurations" : {
 3 |       "core-site" : {
 4 |         "fs.gs.project.id": "${PROJECT}",
 5 |         "fs.gs.system.bucket": "${CONFIGBUCKET}",
 6 |         "fs.gs.working.dir": "/",
 7 |         "fs.gs.impl" : "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
 8 |         "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
 9 |         "fs.gs.metadata.cache.enable": "true",
10 |         "fs.gs.metadata.cache.type": "${GCS_METADATA_CACHE_TYPE}",
11 |         "fs.gs.metadata.cache.directory": "${GCS_FILE_CACHE_DIRECTORY}",
12 | 
13 |         "hadoop.proxyuser.root.hosts": "*",
14 |         "hadoop.proxyuser.root.groups": "*",
15 |         "hadoop.proxyuser.root.users": "*"
16 |       },
17 |       "hdfs-site" : {
18 |         "dfs.replication" : "2"
19 |       },
20 |       "mapred-site" : {
21 |         "mapreduce.job.working.dir" : "/user/${user.name}"
22 |       }
23 |     }
24 | }
25 | 


--------------------------------------------------------------------------------
/platforms/hdp/install_gcs_connector_on_ambari.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Copyright 2014 Google Inc. All Rights Reserved.
 4 | 
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS-IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | 
18 | ## install_gcs_connector_on_ambari.sh
19 | ## This file:
20 | ##   * downloads the relevant gcs-connector-<version>.jar
21 | ##   * installs into a local lib dir
22 | ##   * adds that lib dir to relevant classpaths
23 | 
24 | if (( ${INSTALL_GCS_CONNECTOR} )) ; then
25 | loginfo "installing GCS_CONNECTOR_JAR on each node"
26 |   LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib"
27 |   mkdir -p ${LIB_JARS_DIR}
28 | 
29 |   # Grab the connector jarfile, add it to installation /lib directory.
30 |   JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR})
31 |   LOCAL_JAR="${LIB_JARS_DIR}/${JARNAME}"
32 | 
33 |   download_bd_resource "${GCS_CONNECTOR_JAR}" "${LOCAL_JAR}"
34 | 
35 |   # link gcs connector into main hadoop lib dir
36 |   source <(grep "^export HADOOP_HOME=" /etc/hadoop/conf/hadoop-env.sh) || true
37 |   if [[ -d "${HADOOP_HOME}/lib/" ]]; then
38 |     ln -sv "${LOCAL_JAR}" "${HADOOP_HOME}/lib/"
39 |   fi
40 | fi
41 | 


--------------------------------------------------------------------------------
/platforms/hdp/resources/public-hostname-gcloud.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | curl -Ls -m 5 http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/access-configs/0/external-ip -H "Metadata-Flavor: Google"
3 | 
4 | 


--------------------------------------------------------------------------------
/platforms/hdp/resources/thp-disable.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # disable transparent huge pages: for Hadoop
 3 | thp_disable=true
 4 | if [ "${thp_disable}" = true ]; then
 5 |     for path in redhat_transparent_hugepage transparent_hugepage; do
 6 |         if test -f /sys/kernel/mm/${path}/enabled; then
 7 |             echo never > /sys/kernel/mm/${path}/enabled
 8 |         fi
 9 |         if test -f /sys/kernel/mm/${path}/defrag; then
10 |             echo never > /sys/kernel/mm/${path}/defrag
11 |         fi
12 |     done
13 | fi
14 | exit 0
15 | 


--------------------------------------------------------------------------------
/platforms/hdp/update_ambari_config.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # finalize the cluster configuration
16 | 
17 | source hadoop_helpers.sh
18 | 
19 | # initialize hdfs dirs
20 | loginfo "Set up HDFS /tmp and /user dirs"
21 | initialize_hdfs_dirs admin
22 | 
23 | 
24 | AMBARI_CLUSTER=$(get_ambari_cluster_name)
25 | 
26 | # update hadoop configuration to include the gcs connector
27 | if (( ${INSTALL_GCS_CONNECTOR} )) ; then
28 |   loginfo "Setting up GCS connector cache cleaner and configuration."
29 |   if (( ${ENABLE_NFS_GCS_FILE_CACHE} )); then
30 |     export GCS_METADATA_CACHE_TYPE='FILESYSTEM_BACKED'
31 |     export GCS_FILE_CACHE_DIRECTORY="$(get_nfs_mount_point)"
32 | 
33 |     setup_cache_cleaner
34 |   else
35 |     export GCS_METADATA_CACHE_TYPE='IN_MEMORY'
36 |     # For IN_MEMORY cache, this directory won't actually be used, but we set
37 |     # it to a sane default for easy manual experimentation of file caching.
38 |     export GCS_FILE_CACHE_DIRECTORY='/tmp/gcs_connector_metadata_cache'
39 |   fi
40 | 
41 |   # If it wasn't set at cluster creation configure the GCS connector.
42 |   if ! /var/lib/ambari-server/resources/scripts/configs.sh \
43 |       get localhost ${AMBARI_CLUSTER} core-site \
44 |       | grep -q '^"fs.gs'; then
45 |     subsitute_bash_in_json configuration.json
46 |     sed -n < configuration.json \
47 |         's/.*"\(fs\.\S*gs\.\S*\)"\s*:\s*"\([^"]*\)".*/\1 \2/p' \
48 |         | xargs -n 2 /var/lib/ambari-server/resources/scripts/configs.sh \
49 |         set localhost ${AMBARI_CLUSTER} core-site
50 |     # Will reload core-site.xml
51 |     SERVICES_TO_UPDATE+=" HDFS"
52 |   fi
53 | 
54 |   loginfo "Adding /usr/local/lib/hadoop/lib to " \
55 |       "mapreduce.application.classpath."
56 |   NEW_CLASSPATH=$(/var/lib/ambari-server/resources/scripts/configs.sh \
57 |       get localhost ${AMBARI_CLUSTER} mapred-site \
58 |       | grep -E '^"mapreduce.application.classpath"' \
59 |       | tr -d \" \
60 |       | awk '{print "/usr/local/lib/hadoop/lib/*,"$3}' | sed 's/,$//')
61 |   /var/lib/ambari-server/resources/scripts/configs.sh \
62 |       set localhost ${AMBARI_CLUSTER} \
63 |       mapred-site mapreduce.application.classpath ${NEW_CLASSPATH}
64 |   sleep 10
65 | fi
66 | 
67 | loginfo "Restarting services, because Ambari usually requires it."
68 | SERVICE='ALL'
69 | ambari_service_stop
70 | ambari_wait_requests_completed
71 | ambari_service_start
72 | ambari_wait_requests_completed
73 | 
74 | # Check GCS connectivity
75 | check_filesystem_accessibility
76 | 
77 | # Set up files and pig views, which was added in Ambari 2.1.
78 | #
79 | if version_at_least "${AMBARI_VERSION}" '2.1'; then
80 |   # This should be done automatically but it wasn't as of 2016-03-16.
81 |   for view in FILES PIG; do
82 |     # Both of these views are currently 1.0.0
83 |     VIEW="${AMBARI_API}/views/${view}/versions/1.0.0/instances/AUTO_${view}_INSTANCE"
84 |     if ${AMBARI_CURL} ${VIEW} |& grep -q '404 Not Found'; then
85 |       ${AMBARI_CURL} -X POST ${VIEW} \
86 |         -d "{\"ViewInstanceInfo\": {\"cluster_handle\": \"${AMBARI_CLUSTER}\"}}"
87 |     fi
88 |   done
89 | fi
90 | 


--------------------------------------------------------------------------------
/platforms/mapr/README.md:
--------------------------------------------------------------------------------
 1 | MapR Cluster on Google Compute Engine
 2 | -------------------------------------
 3 | 
 4 | The [MapR distribution](https://www.mapr.com/products/mapr-distribution-including-apache-hadoop) for Hadoop adds enterprise-grade features to the Hadoop platform that make Hadoop easier to use and more dependable. The MapR distribution for Hadoop is fully integrated with the [Google Compute Engine (GCE)](https://cloud.google.com/compute/) framework, allowing customers to deploy a MapR cluster with ready access to Google's cloud infrastructure. MapR provides network file system (NFS) and open database connectivity (ODBC) interfaces, a comprehensive management suite, and automatic compression. MapR provides high availability with a no-NameNode architecture and data protection with snapshots, disaster recovery, and cross-cluster mirroring.
 5 | 
 6 | ### Make sure you have...
 7 | * an active [Google Cloud Platform](https://console.developers.google.com/) account.
 8 | * a client machine with [Google Cloud SDK](https://cloud.google.com/sdk/) and [bdutil](https://cloud.google.com/hadoop/downloads) installed.
 9 | * access to a GCE project where you can add instances, buckets and disks.
10 | * a valid MapR license (optional).
11 | 
12 | ### Now, to launch a MapR Cluster on GCE using `bdutil`...
13 | 
14 | 1. Set the project and bucket in `mapr_env.sh` (located under `bdutil/platforms/mapr/`).
15 | 2. Update `node.lst` to determine the [allocation of cluster roles](http://doc.mapr.com/display/MapR/MapR+Cluster+on+the+Google+Compute+Engine#MapRClusterontheGoogleComputeEngine-gce-config) for the nodes in the cluster. For reference, the config file contains a simple 4-node [M7](https://www.mapr.com/products/hadoop-download) cluster allocation.
16 | 	* Node names must have the PREFIX mentioned in `mapr_env.sh`
17 | 	* Node names must have suffixes: -m, -w-0, -w-1, -w-2 ...
18 | 	For example, if the PREFIX is 'mapr', node names must be 'mapr-m', 'mapr-w-0', 'mapr-w-1', ... 
19 | 	* NUM_WORKERS in `mapr_env.sh` must equal one less than number of nodes in `node.lst`
20 | 3. (Optional) Copy a valid license into `mapr_license.txt`
21 | 4. Deploy the cluster by invoking in the bdutil root directory: 
22 | 	```
23 | 	./bdutil -e mapr deploy
24 | 	```
25 | 
26 | 5. Access the cluster by invoking: 
27 | 	```
28 | 	gcloud compute config-ssh
29 | 	``` 
30 | 
31 | 	The output shows how to ssh into a node. Login as the `MAPR_USER` mentioned in `mapr_env.sh` (for example, `ssh mapr@node1.us-central1-f.t-diplomatic-962`).
32 | 6. Test an example application by running:
33 | 	```
34 | 	yarn jar $MAPR_HOME/hadoop/hadoop-2.5.1/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.1-mapr-1501.jar pi 16 100
35 | 	```
36 | 
37 | 
38 | ### At the end...
39 | To delete the cluster, ensure `mapr_env.sh` is same as in when deployed. In the bdutil root directory, invoke: 
40 | ```
41 | ./bdutil -e mapr delete
42 | ```
43 | 
44 | ### Additional Resources
45 | * [Free Hadoop On-Demand Training](https://www.mapr.com/services/mapr-academy/big-data-hadoop-online-training)
46 | * [Why MapR](https://www.mapr.com/why-hadoop/why-mapr)
47 | * [MapR Development Guide](http://doc.mapr.com/display/MapR/Development+Guide)
48 | * [MapR Documentation](http://doc.mapr.com/)
49 | * [MapR Support](https://www.mapr.com/support/overview)
50 | * [Another way](http://doc.mapr.com/display/MapR/MapR+Cluster+on+the+Google+Compute+Engine) to deploy
51 | * [MapR-on-GCE](https://github.com/mapr/gce)
52 | 
53 | **LICENSE:** [Apache License, Version 2.0](https://github.com/GoogleCloudPlatform/bdutil/blob/master/LICENSE)


--------------------------------------------------------------------------------
/platforms/mapr/mapr_license.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudDataproc/bdutil/967fd15b1f690e961f7d61809e4976aaa4ade90f/platforms/mapr/mapr_license.txt


--------------------------------------------------------------------------------
/platforms/mapr/node.lst:
--------------------------------------------------------------------------------
 1 | # Simple 4-node M7 cluster 
 2 | # NOTE: 
 3 | # (1) Node names MUST have the PREFIX mentioned in 'mapr_env.sh'
 4 | # (2) Node names MUST have suffixes: -m, -w-0, -w-1, -w-2 ...
 5 | #     For example, if the PREFIX is 'mapr', 
 6 | #     node names MUST be 'mapr-m', 'mapr-w-0', 'mapr-w-1', ... 
 7 | # (3) Do not forget to update NUM_WORKERS variable
 8 | # Refer to MapR documentation for other values
 9 | mapr-m:zookeeper,cldb,fileserver,nodemanager,nfs,webserver,hbase
10 | mapr-w-0:zookeeper,cldb,fileserver,nodemanager,nfs,hbase
11 | mapr-w-1:zookeeper,resourcemanager,historyserver,fileserver,nodemanager,nfs,hbase
12 | mapr-w-2:resourcemanager,fileserver,nodemanager,nfs,hbase
13 | 


--------------------------------------------------------------------------------
/platforms/restart_services.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # Copyright 2014 Google Inc. All Rights Reserved.D
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #      http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS-IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # Restarts services corresponding to installed packages.
17 | # Performs last minute initialization as needed.
18 | 
19 | set -e
20 | 
21 | source hadoop_helpers.sh
22 | 
23 | if [[ $(hostname -s) == ${MASTER_HOSTNAME} ]]; then
24 |   COMPONENTS=${MASTER_COMPONENTS}
25 | else
26 |   COMPONENTS=${DATANODE_COMPONENTS}
27 | fi
28 | 
29 | # Component ordering is sensitive. hive-metastore must come before hive-server2
30 | # and hdfs must be up before oozie.
31 | for COMPONENT in ${COMPONENTS}; do
32 |   if [[ -x /etc/init.d/${COMPONENT} ]]; then
33 |     # Initialize HDFS
34 |     if [[ ${COMPONENT} == 'hadoop-hdfs-namenode' ]]; then
35 |       service hadoop-hdfs-namenode stop
36 |       # Do not refomat if already formatted.
37 |       yes n | service hadoop-hdfs-namenode init
38 |       service hadoop-hdfs-namenode start
39 | 
40 |       # Setup /tmp and /user directories.
41 |       if [[ "${DEFAULT_FS}" == 'hdfs' ]]; then
42 |         initialize_hdfs_dirs
43 |       fi
44 |     # Initialize Oozie. Requires Namenode to be up.
45 |     elif [[ ${COMPONENT} == 'oozie' ]]; then
46 |       # Requires HDFS to be up and running.
47 |       # Might be CDH specific.
48 |       oozie-setup sharelib create -fs ${NAMENODE_URI} \
49 |           -locallib /usr/lib/oozie/oozie-sharelib-yarn*
50 |       service oozie restart
51 |     else
52 |       service ${COMPONENT} restart
53 |     fi
54 |   fi
55 | done
56 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/conf/hive/hive-site.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0"?>
 2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
 3 | <!--
 4 |    Licensed to the Apache Software Foundation (ASF) under one or more
 5 |    contributor license agreements.  See the NOTICE file distributed with
 6 |    this work for additional information regarding copyright ownership.
 7 |    The ASF licenses this file to You under the Apache License, Version 2.0
 8 |    (the "License"); you may not use this file except in compliance with
 9 |    the License.  You may obtain a copy of the License at
10 | 
11 |        http://www.apache.org/licenses/LICENSE-2.0
12 | 
13 |    Unless required by applicable law or agreed to in writing, software
14 |    distributed under the License is distributed on an "AS IS" BASIS,
15 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |    See the License for the specific language governing permissions and
17 |    limitations under the License.
18 | -->
19 | 
20 | <configuration>
21 | 
22 | <property>
23 |   <name>hive.metastore.warehouse.dir</name>
24 |   <value>/user/${user.name}/warehouse</value>
25 |   <description>location of default database for the warehouse</description>
26 | </property>
27 | 
28 | </configuration>
29 | 
30 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/examples/ngrams/hive_query_ngrams.q:
--------------------------------------------------------------------------------
 1 | --
 2 | -- Copyright 2013 Google Inc. All Rights Reserved.
 3 | --
 4 | -- Licensed under the Apache License, Version 2.0 (the "License");
 5 | -- you may not use this file except in compliance with the License.
 6 | -- You may obtain a copy of the License at
 7 | --
 8 | --     http://www.apache.org/licenses/LICENSE-2.0
 9 | --
10 | -- Unless required by applicable law or agreed to in writing, software
11 | -- distributed under the License is distributed on an "AS IS" BASIS,
12 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | -- See the License for the specific language governing permissions and
14 | --
15 | 
16 | --
17 | -- This script is intended to be run from the Hive shell:
18 | --
19 | --   hive> source hive_query_ngrams.q;
20 | --
21 | -- or from the operating system shell:
22 | --
23 | --   $ hive -f hive_query_ngrams.q
24 | --
25 | -- The result of this query is a table of records indicating the count
26 | -- of occurrences of the words "radio" and "television" in the Google
27 | -- ngrams corpora for each year since 1920.
28 | --
29 | -- This query ensures that a record exists in the result for every year
30 | -- since 1920, even if there were no instances of a given word.
31 | -- In practice this is unnecessary as radio and television both occur
32 | -- more than once in the data set for every year since 1920.
33 | --
34 | -- The structure of this query is to join three distinct subqueries (on year):
35 | --    y: list of years since 1920 (implicitly ordered by the DISTINCT operation)
36 | --    r: sum of instances of the word "radio" for each year since 1920
37 | --    t: sum of instances of the word "television" for each year since 1920
38 | --
39 | 
40 | SELECT y.year AS year,
41 |        r.instance_count AS radio, t.instance_count AS television,
42 |        CAST(r.instance_count AS DOUBLE)/(r.instance_count + t.instance_count)
43 |         AS pct
44 | FROM
45 |  (SELECT DISTINCT year AS year FROM
46 |     (SELECT distinct year from 1gram where prefix = 'r' and year >= 1920
47 |      UNION ALL
48 |      SELECT distinct year from 1gram where prefix = 't' and year >= 1920) y_all)
49 |     y
50 | JOIN
51 |  (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count
52 |   FROM 1gram
53 |   WHERE LOWER(word) = 'radio' AND prefix='r' AND (year >= 1920)
54 |   GROUP BY LOWER(word), year) r
55 | ON y.year = r.year
56 | JOIN
57 |  (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count
58 |   FROM 1gram
59 |   WHERE LOWER(word) = 'television' AND prefix='t' AND (year >= 1920)
60 |   GROUP BY LOWER(word), year) t
61 | ON y.year = t.year
62 | ORDER BY year;
63 | 
64 | EXIT;
65 | 
66 | --
67 | -- This is a simplified version of the above which eliminates the explicit
68 | -- generation of the "year" list.  It assumes (correctly) that the word
69 | -- "television" appears every year that "radio" does.
70 | -- This query is listed here for reference and educational purposes only.
71 | --
72 | -- SELECT a.year, a.instance_count, b.instance_count,
73 | --        CAST(a.instance_count AS DOUBLE)/(a.instance_count + b.instance_count)
74 | -- FROM
75 | --  (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count
76 | --   FROM 1gram
77 | --   WHERE LOWER(word) = 'radio' AND prefix='r' AND (year >= 1920)
78 | --   GROUP BY LOWER(word), year) a
79 | -- JOIN
80 | --  (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count
81 | --   FROM 1gram
82 | --   WHERE LOWER(word) = 'television' AND prefix='t' AND (year >= 1920)
83 | --   GROUP BY LOWER(word), year) b
84 | -- ON a.year = b.year
85 | -- ORDER BY year;
86 | --
87 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/examples/ngrams/hive_table_create.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2013 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | #
17 | # This script is intended to be run from the unix command line
18 | # on an instance with hive installed (and the hive executable
19 | # available in the user PATH).
20 | #
21 | # It is assumed that the one has already run the shell script
22 | # ngram_hdfs_load.sh which will have downloaded the associated
23 | # ngram data and deposited it into HDFS under /user/hdpusr/ngrams/<ngram>
24 | #
25 | # This script will create a table ("1gram") and then load each
26 | # file into a separate partition within the table.
27 | #
28 | 
29 | set -o errexit
30 | set -o nounset
31 | 
32 | # Select what to install
33 | readonly SCRIPT_DIR=$(dirname $0)
34 | source $SCRIPT_DIR/ngram_setup.sh
35 | 
36 | # Create the table if it does not already exist
37 | hive << END_CREATE
38 |   CREATE TABLE IF NOT EXISTS $NGRAMS (
39 |       word STRING,
40 |       year INT,
41 |       instance_count INT,
42 |       book_count INT
43 |   )
44 |   PARTITIONED BY (prefix STRING)
45 |   ROW FORMAT DELIMITED
46 |       FIELDS TERMINATED BY '\t'
47 |   STORED AS TEXTFILE
48 |   ;
49 |   EXIT
50 |   ;
51 | END_CREATE
52 | 
53 | # Get the list of files to put into the table
54 | FILE_PATTERN=$(printf $SOURCE_FORMAT $NGRAMS "" "")
55 | FILE_LIST=$($HDFS_CMD -ls $HDFS_DIR | grep $FILE_PATTERN | awk '{ print $8 }')
56 | for filepath in $FILE_LIST; do
57 |   filename=$(basename $filepath)
58 |   prefix=${filename##$FILE_PATTERN}
59 | 
60 |   hive --silent << END_LOAD
61 |     LOAD DATA INPATH '$HDFS_DIR/$filename'
62 |     OVERWRITE INTO TABLE $NGRAMS
63 |     PARTITION (prefix='$prefix')
64 |     ;
65 |     EXIT
66 |     ;
67 | END_LOAD
68 | done
69 | 
70 | echo "Data loaded into hive table $NGRAMS"
71 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/examples/ngrams/ngram_setup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2013 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # Utility script, sourced by both ngram_hdfs_load.sh and hive_table_create.sh
17 | # This script will set a series of constants, some based on the choice
18 | # of the command line "N" value (defaults to 1).  N indicates the ngram
19 | # dataset to download and copy into HDFS.
20 | 
21 | readonly SOURCE_FORMAT="googlebooks-eng-all-%s-20120701-%s%s"
22 | readonly SOURCE_LOCATION="gs://books/ngrams/books"
23 | 
24 | # The "hadoop" executable should be in the user path
25 | readonly HDFS_CMD="hadoop fs"
26 | 
27 | # What to install: 1gram by default
28 | N=1
29 | 
30 | # Now parse command line arguments
31 | while [[ $# -ne 0 ]]; do
32 |   case "$1" in
33 |     --N=*)
34 |       N=${1#--N=}
35 |       shift
36 |       ;;
37 |     --help)
38 |       N=
39 |       shift
40 |       ;;
41 |     *)
42 |   esac
43 | done
44 | 
45 | if [[ ! $N -ge 1 ]]; then
46 |   echo "usage $(basename $0): --N=<n>"
47 |   exit 1
48 | fi
49 | 
50 | # Now set constants based on the selection of N
51 | readonly NGRAMS="${N}gram"
52 | readonly HDFS_DIR="ngrams/$NGRAMS"
53 | readonly STAGE_DIR="/hadoop/tmp/$USER/ngrams/$NGRAMS"
54 | 
55 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/project_properties.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2013 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # Begin: edit these values to set up your cluster
17 | # GCS bucket for packages
18 | readonly GCS_PACKAGE_BUCKET={{{{ bucket_name }}}}
19 | # Zone of the Hadoop master instance
20 | readonly ZONE={{{{ zone_id }}}}
21 | # Hadoop master instance name
22 | readonly MASTER={{{{ master_hostname }}}}
23 | 
24 | # Subdirectory in cloud storage where packages are pushed at initial setup
25 | readonly GCS_PACKAGE_DIR=hdp_tools
26 | 
27 | # Full GCS URIs of the Pig and Hive tarballs, if packages-to-gcs__at__host.sh
28 | # is used; alternatively, these can be set to other pre-existing GCS paths
29 | readonly SUPPORTED_HDPTOOLS="hive pig"
30 | readonly TARBALL_BASE="gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR/packages"
31 | readonly HIVE_TARBALL_URI="$TARBALL_BASE/hive/hive-*.tar.gz"
32 | readonly PIG_TARBALL_URI="$TARBALL_BASE/pig/pig-*.tar.gz"
33 | 
34 | # Directory on master where hadoop is installed
35 | readonly HADOOP_HOME=/home/hadoop/hadoop
36 | 
37 | # Set to the major version of hadoop ("1" or "2")
38 | readonly HADOOP_MAJOR_VERSION="1"
39 | 
40 | # Hadoop username and group on Compute Engine Cluster
41 | readonly HADOOP_USER=hadoop
42 | readonly HADOOP_GROUP=hadoop
43 | 
44 | # Hadoop client username on Compute Engine Cluster
45 | readonly HDP_USER=hdpuser
46 | 
47 | # Directory on master where packages are installed
48 | readonly HDP_USER_HOME=/home/hdpuser
49 | readonly MASTER_INSTALL_DIR=/home/hdpuser
50 | 
51 | # End: edit these values to set up your cluster
52 | 
53 | 
54 | # Begin: constants used througout the solution
55 | 
56 | # Subdirectory where packages files (tar.gz) are stored
57 | readonly PACKAGES_DIR=packages
58 | 
59 | # Subdirectory where scripts are stored
60 | readonly SCRIPTS_DIR=scripts
61 | 
62 | # Subdirectory on master where we pull down package files
63 | readonly MASTER_PACKAGE_DIR=/tmp/hdp_tools
64 | 
65 | # User tmp dir in HDFS
66 | readonly HDFS_TMP_DIR="/tmp"
67 | 
68 | # Hadoop temp dir (hadoop.tmp.dir)
69 | readonly HADOOP_TMP_DIR="/hadoop/tmp"
70 | 
71 | # End: constants used througout the solution
72 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/common_utils.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2013 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | set -o nounset
17 | set -o errexit
18 | 
19 | function emit() {
20 |   echo -e "$@"
21 | }
22 | readonly -f emit
23 | 
24 | function die() {
25 |   echo -e "$@" >&2
26 |   exit 1
27 | }
28 | readonly -f die
29 | 
30 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/install-packages-on-master__at__host.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | # Copyright 2013 Google Inc. All Rights Reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | set -o nounset
 17 | set -o errexit
 18 | 
 19 | readonly SCRIPTDIR=$(dirname $0)
 20 | 
 21 | # Pull in global properties
 22 | source project_properties.sh
 23 | 
 24 | # Pull in common functions
 25 | source $SCRIPTDIR/common_utils.sh
 26 | 
 27 | # Files to push to master; place project_properties.sh in the same directory
 28 | # as the other scripts
 29 | readonly SCRIPT_FILES_TO_PUSH="\
 30 |   project_properties.sh \
 31 |   $SCRIPTS_DIR/common_utils.sh \
 32 |   $SCRIPTS_DIR/package_utils.sh \
 33 |   $SCRIPTS_DIR/setup-hdfs-for-hdtools__at__master.sh \
 34 |   $SCRIPTS_DIR/setup-packages__at__master.sh \
 35 |   $SCRIPTS_DIR/setup-ssh-keys__at__master.sh \
 36 | "
 37 | readonly MASTER_PACKAGE_SUBDIRS="\
 38 |   $MASTER_PACKAGE_DIR/$SCRIPTS_DIR \
 39 |   $MASTER_PACKAGE_DIR/conf/hive \
 40 |   $MASTER_PACKAGE_DIR/ssh-key
 41 | "
 42 | 
 43 | # Ensure permissions on the script files before we push them
 44 | chmod 755 $SCRIPT_FILES_TO_PUSH
 45 | 
 46 | # Create the destination directory on the master
 47 | emit ""
 48 | emit "Ensuring setup directories exist on master:"
 49 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER sudo -i \
 50 |         "rm -rf $MASTER_PACKAGE_DIR && \
 51 |          mkdir -p $MASTER_PACKAGE_SUBDIRS"
 52 | 
 53 | # Push the setup script to the master
 54 | emit ""
 55 | emit "Pushing the setup scripts to the master:"
 56 | gcutil push --zone=$ZONE $MASTER \
 57 |         $SCRIPT_FILES_TO_PUSH $MASTER_PACKAGE_DIR/$SCRIPTS_DIR
 58 | 
 59 | # Push configuration to the master
 60 | emit ""
 61 | emit "Pushing configuration to the master:"
 62 | gcutil push --zone=$ZONE $MASTER \
 63 |         conf/hive/* $MASTER_PACKAGE_DIR/conf/hive
 64 | 
 65 | # Execute the setup script on the master
 66 | emit ""
 67 | emit "Launching the user and package setup script on the master:"
 68 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER \
 69 |         sudo $MASTER_PACKAGE_DIR/$SCRIPTS_DIR/setup-packages__at__master.sh
 70 | 
 71 | # Execute the HDFS setup script on the master
 72 | emit ""
 73 | emit "Launching the HDFS setup script on the master:"
 74 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER \
 75 |         sudo \
 76 |         $MASTER_PACKAGE_DIR/$SCRIPTS_DIR/setup-hdfs-for-hdtools__at__master.sh
 77 | 
 78 | # Set up SSH keys for the user
 79 | emit ""
 80 | emit "Generating SSH keys for user $HDP_USER"
 81 | 
 82 | readonly KEY_DIR=./ssh-key
 83 | mkdir -p $KEY_DIR
 84 | rm -f $KEY_DIR/$HDP_USER $KEY_DIR/${HDP_USER}.pub
 85 | 
 86 | ssh-keygen -t rsa -P '' -f $KEY_DIR/$HDP_USER
 87 | chmod o+r $KEY_DIR/${HDP_USER}.pub
 88 | emit "Pushing SSH keys for user $HDP_USER to $MASTER"
 89 | gcutil push --zone=$ZONE $MASTER \
 90 |         $KEY_DIR/${HDP_USER}.pub $MASTER_PACKAGE_DIR/ssh-key/
 91 | emit "Adding SSH public key for user $HDP_USER to authorized_keys"
 92 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER \
 93 |         sudo sudo -u $HDP_USER -i \
 94 |         $MASTER_PACKAGE_DIR/$SCRIPTS_DIR/setup-ssh-keys__at__master.sh \
 95 |           $MASTER_PACKAGE_DIR/ssh-key
 96 | 
 97 | MASTER_IP=$(gcutil getinstance --zone=$ZONE $MASTER | \
 98 |             awk -F '|' \
 99 |               '$2 ~ / *external-ip */ { gsub(/[ ]*/, "", $3); print $3 }')
100 | 
101 | emit ""
102 | emit "***"
103 | emit "SSH keys generated locally to:"
104 | emit "  Public key: $KEY_DIR/$HDP_USER.pub"
105 | emit "  Private key: $KEY_DIR/$HDP_USER"
106 | emit ""
107 | emit "Public key installed on $MASTER to ~$HDP_USER/.ssh/authorized_keys"
108 | emit ""
109 | emit "You may now ssh to user $HDP_USER@$MASTER with:"
110 | emit "   ssh -i $KEY_DIR/$HDP_USER $HDP_USER@$MASTER_IP"
111 | emit "***"
112 | 
113 | emit ""
114 | emit "Installation complete"
115 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/package_utils.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2013 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | set -o nounset
17 | set -o errexit
18 | 
19 | function pkgutil_get_list() {
20 |   local pkg_dir="$1"
21 | 
22 |   find $pkg_dir -mindepth 2 -maxdepth 2 | sort
23 | }
24 | readonly -f pkgutil_get_list
25 | 
26 | function pkgutil_pkg_name() {
27 |   local pkg_dir="$1"
28 |   local pkg="$2"
29 | 
30 |   # Strip the "package" directory
31 |   local pkg_stripped=${pkg#$pkg_dir/}
32 | 
33 |   # Get the query-tool specific directory name
34 |   echo ${pkg_stripped%/*}
35 | }
36 | readonly -f pkgutil_pkg_name
37 | 
38 | function pkgutil_pkg_file() {
39 |   local pkg_dir="$1"
40 |   local pkg="$2"
41 | 
42 |   # Return just the filename
43 |   echo ${pkg##*/}
44 | }
45 | readonly -f pkgutil_pkg_file
46 | 
47 | function pkgutil_emit_list() {
48 |   local pkg_dir="$1"
49 |   local pkg_list="$2"
50 | 
51 |   emit ""
52 |   emit "Discovered packages:"
53 |   for pkg in $pkg_list; do
54 |     # Get the query-tool specific directory name
55 |     local pkg_name=$(pkgutil_pkg_name $pkg_dir $pkg)
56 | 
57 |     # Get the name of the zip file
58 |     local pkg_file=$(pkgutil_pkg_file $pkg_dir $pkg)
59 | 
60 |     emit "  $pkg_name ($pkg_file)"
61 |   done
62 | }
63 | readonly -f pkgutil_emit_list
64 | 
65 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/packages-delete-from-gcs__at__host.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2013 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # packages-delete-from-gcs
17 | #   This script removes the Hadoop query tool packages from Google Cloud
18 | #   Storage which were uploaded by packages-to-gcs__at__host.sh
19 | 
20 | set -o nounset
21 | set -o errexit
22 | 
23 | readonly SCRIPTDIR=$(dirname $0)
24 | 
25 | # Pull in global properties
26 | source project_properties.sh
27 | 
28 | # Pull in common functions
29 | source $SCRIPTDIR/common_utils.sh
30 | 
31 | # Remove packages from GCS
32 | emit ""
33 | emit "Removing packages:"
34 | gsutil rm -R -f gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR
35 | 
36 | emit ""
37 | emit "Package removal complete"
38 | 
39 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/packages-to-gcs__at__host.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2013 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # packages-to-gcs
17 | #   This script examines the Hadoop tools packages directory for a list
18 | #   of packages to push to Google Cloud Storage.
19 | #
20 | #   All packages should be found in the "packages" subdirectory.
21 | #   The required format is for the package name to be a subdirectory
22 | #   and the associated TAR.GZ file to be inside the package subdirectory:
23 | #     packages/
24 | #          hive/
25 | #            hive-0.10.0.tar.gz
26 | #          pig/
27 | #            pig-0.11.1.tar.gz
28 | 
29 | set -o nounset
30 | set -o errexit
31 | 
32 | readonly SCRIPTDIR=$(dirname $0)
33 | 
34 | # Pull in global properties
35 | source project_properties.sh
36 | 
37 | # Pull in common functions
38 | source $SCRIPTDIR/common_utils.sh
39 | source $SCRIPTDIR/package_utils.sh
40 | 
41 | # The resulting PACKAGE_LIST will contain one entry per package where the
42 | # the entry is of the form "package_dir/package/gzip"
43 | #    (for example packages/hive/hive-0.10.0.tar.gz)
44 | PACKAGE_LIST=$(pkgutil_get_list $PACKAGES_DIR)
45 | if [[ -z $PACKAGE_LIST ]]; then
46 |   die "No package found in $PACKAGES_DIR subdirectory"
47 | fi
48 | 
49 | # Emit package list
50 | pkgutil_emit_list "$PACKAGES_DIR" "$PACKAGE_LIST"
51 | 
52 | # Push packages to GCS
53 | emit ""
54 | emit "Pushing packages to gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR/:"
55 | gsutil -m cp -R $PACKAGES_DIR gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR/
56 | 
57 | emit ""
58 | emit "Package upload complete"
59 | 
60 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/setup-hdfs-for-hdtools__at__master.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2013 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | set -o nounset
17 | set -o errexit
18 | 
19 | SCRIPT=$(basename $0)
20 | SCRIPTDIR=$(dirname $0)
21 | 
22 | source $SCRIPTDIR/project_properties.sh
23 | source $SCRIPTDIR/common_utils.sh
24 | 
25 | readonly HDFS_CMD="sudo -u $HADOOP_USER -i $HADOOP_HOME/bin/hadoop fs"
26 | readonly HDFS_ROOT_USER="$HADOOP_USER"
27 | 
28 | function hdfs_mkdir () {
29 |   local dir=$1
30 |   local owner=${2:-}
31 |   local permissions=${3:-}
32 | 
33 |   emit "  Checking directory $dir"
34 |   if ! $HDFS_CMD -test -d $dir 2> /dev/null; then
35 |     emit "    Creating directory $dir"
36 |     $HDFS_CMD -mkdir $dir
37 |   fi
38 | 
39 |   if [[ -n "$owner" ]]; then
40 |     emit "    Ensuring owner $owner"
41 |     $HDFS_CMD -chown $owner $dir
42 |   fi
43 | 
44 |   if [[ -n "$permissions" ]]; then
45 |     emit "    Ensuring permissions $permissions"
46 |     $HDFS_CMD -chmod $permissions $dir
47 |   fi
48 | }
49 | readonly -f hdfs_mkdir
50 | 
51 | emit ""
52 | emit "*** Begin: $SCRIPT running on master $(hostname) ***"
53 | 
54 | # Ensure that /tmp exists (it should) and is fully accessible
55 | hdfs_mkdir "$HDFS_TMP_DIR" "$HDFS_ROOT_USER" "777"
56 | 
57 | # Create a hive-specific scratch space in /tmp for the hdpuser
58 | hdfs_mkdir "$HDFS_TMP_DIR/hive-$HDP_USER" "$HDP_USER"
59 | 
60 | # Create a warehouse directory (hive) for the hdpuser
61 | hdfs_mkdir "/user" "$HDFS_ROOT_USER"
62 | hdfs_mkdir "/user/$HDP_USER" "$HDP_USER"
63 | hdfs_mkdir "/user/$HDP_USER/warehouse" "$HDP_USER"
64 | 
65 | # Create a mapreduce staging directory for the hdpuser
66 | if [[ "${HADOOP_MAJOR_VERSION}" == "2" ]]; then
67 |   hdfs_mkdir "/hadoop/mapreduce" "$HADOOP_USER" "o+rw"
68 |   hdfs_mkdir "/hadoop/mapreduce/staging" "$HADOOP_USER" "o+rw"
69 |   hdfs_mkdir "/hadoop/mapreduce/staging/history" "$HADOOP_USER" "777"
70 |   hdfs_mkdir "/hadoop/mapreduce/staging/$HDP_USER" "$HDP_USER"
71 | else
72 |   hdfs_mkdir "$HADOOP_TMP_DIR/mapred/staging/$HDP_USER" "$HDP_USER"
73 | fi
74 | 
75 | emit ""
76 | emit "*** End: $SCRIPT running on master $(hostname) ***"
77 | emit ""
78 | 
79 | 


--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/setup-ssh-keys__at__master.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2013 Google Inc. All Rights Reserved.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | # This script runs on the Hadoop master node as the target user ($HDP_USER).
17 | # It is asssumed that a public key file for the user has been pushed
18 | # onto the master node and the location of that file is the first argument
19 | # to the script.
20 | 
21 | set -o nounset
22 | set -o errexit
23 | 
24 | readonly SCRIPT=$(basename $0)
25 | readonly SCRIPTDIR=$(dirname $0)
26 | 
27 | # Pull in global properties
28 | source $SCRIPTDIR/project_properties.sh
29 | source $SCRIPTDIR/common_utils.sh
30 | 
31 | if [[ $# -lt 1 ]]; then
32 |   die "usage: $0 [keys-dir]"
33 | fi
34 | 
35 | KEY_DIR=$1; shift
36 | KEY_FILE=$KEY_DIR/${USER}.pub
37 | 
38 | if [[ ! -e $KEY_FILE ]]; then
39 |   die "Public key file not found: $KEY_FILE"
40 | fi
41 | 
42 | # Ensure that the .ssh directory and authorized_keys files exist
43 | if [[ ! -e $HOME/.ssh/authorized_keys ]]; then
44 |   mkdir -p $HOME/.ssh
45 |   chmod 700 $HOME/.ssh
46 | 
47 |   touch $HOME/.ssh/authorized_keys
48 |   chmod 600 $HOME/.ssh/authorized_keys
49 | fi
50 | 
51 | # Add the public key file for the user to authorized_keys
52 | emit "Updating $HOME/.ssh/authorized_keys"
53 | (echo "# Added $(date)" && cat $KEY_FILE) >> $HOME/.ssh/authorized_keys
54 | 
55 | 


--------------------------------------------------------------------------------
/samples/bigquery_wordcount.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudDataproc/bdutil/967fd15b1f690e961f7d61809e4976aaa4ade90f/samples/bigquery_wordcount.jar


--------------------------------------------------------------------------------
/samples/test-mr-bigquery.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | #
 3 | # Copyright 2013 Google Inc. All Rights Reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #      http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS-IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 
17 | ###############################################################################
18 | # Sets up and runs WordCount job to verify BigQuery setup.
19 | # Usage:
20 | #   Specify fully-qualified outputTable, e.g. "[datasetId].[tableId]":
21 | #       ./bdutil -v -u "samples/*" run_command ./test-mr-bigquery.sh [outputTable]
22 | #   Auto-generate/create a datasetId, and use that (provide no args)
23 | #       ./bdutil -v -u "samples/*" run_command ./test-mr-bigquery.sh
24 | ################################################################################
25 | 
26 | set -e
27 | 
28 | source hadoop-env-setup.sh
29 | 
30 | OUTPUT_TABLE=$1
31 | 
32 | CREATED_DATASET=0
33 | if [[ -z "${OUTPUT_TABLE}" ]]; then
34 |   OUTPUT_DATASET="validate_bigquery_dataset_$(date +%s)"
35 |   OUTPUT_TABLE="${OUTPUT_DATASET}.wordcount_output"
36 |   echo "No OUTPUT_TABLE provided; using ${OUTPUT_TABLE}"
37 |   bq mk "${PROJECT}:${OUTPUT_DATASET}"
38 |   CREATED_DATASET=1
39 | fi
40 | 
41 | INPUT_TABLE='publicdata:samples.shakespeare'
42 | INPUT_TABLE_FIELD='word'
43 | JAR='bigquery_wordcount.jar'
44 | 
45 | # Check for existence of jar
46 | if ! [[ -r ${JAR} ]]; then
47 |   echo "Error. Could not find jar: ${JAR}" >&2
48 |   exit 1
49 | fi
50 | 
51 | #  Perform word count MapReduce on README.txt
52 | hadoop jar ${JAR} ${PROJECT} ${INPUT_TABLE} ${INPUT_TABLE_FIELD} ${OUTPUT_TABLE}
53 | 
54 | echo 'Word count finished successfully.' \
55 |      "Manually clean up with 'bq rm ${OUTPUT_TABLE}'"
56 | if (( ${CREATED_DATASET} )); then
57 |   echo "To delete entire dataset: 'bq rm -r ${OUTPUT_DATASET}'"
58 | fi
59 | 


--------------------------------------------------------------------------------
/samples/word_count_mapper.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Mapper for use with hadoop-streaming bigquery word-count example.
15 | 
16 | Reads each line of input and writes out lines each containing
17 | a single word and the number 1.
18 | The input lines consist of two tab-separated fields:
19 |   1. the record number
20 |   2. JSON data
21 | We pick one field of the JSON and use its value as the word to output.
22 | """
23 | 
24 | import re
25 | import sys
26 | 
27 | 
28 | def main(args):
29 |   # Set up the pattern that we use to extract our field
30 |   field_name = args[1]
31 |   field_pattern = '\\{.*"(' + field_name + ')":"([^"]*)".*\\}'
32 |   field_extractor = re.compile(field_pattern)
33 | 
34 |   for line in sys.stdin:
35 |     line = line.strip()
36 |     key_and_json = line.split('\t', 1)
37 |     json = key_and_json[1]
38 |     matches = field_extractor.match(json)
39 |     if matches:
40 |       word = matches.group(2)
41 |       if word:
42 |         print '%s\t%s' % (word, 1)
43 | 
44 | 
45 | if __name__ == '__main__':
46 |   main(sys.argv)
47 | 


--------------------------------------------------------------------------------
/samples/word_count_reducer.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2014 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Reducer for use with hadoop-streaming word-count example.
15 | 
16 | Reads each line of input, sums the counts for each word,
17 | outputs a line with word and total count for each word.
18 | The input is assumed to be sorted by word.
19 | """
20 | 
21 | from __future__ import print_function
22 | 
23 | import re
24 | import sys
25 | 
26 | current_word = None
27 | current_count = 0
28 | output_json = False
29 | 
30 | 
31 | def print_word_and_count(word, count):
32 |   word = re.sub('"', "'", word)   # replace double-quotes with single-quotes
33 |   if output_json:
34 |     print('0\t{"Word": "%s", "Count": %d}' % (word, count))
35 |     # When streaming out to BigQuery, this key (0 here) is ignored.
36 |   else:
37 |     print('%s\t%s' % (word, count))
38 | 
39 | 
40 | def next_word(word, count):
41 |   global current_word, current_count
42 |   if current_word:
43 |     print_word_and_count(current_word, current_count)
44 |   current_word = word
45 |   current_count = count
46 | 
47 | 
48 | def main(args):
49 |   global current_count
50 |   global output_json
51 | 
52 |   if len(args) > 1:
53 |     if args[1] == '--output_json':
54 |       output_json = True
55 |     else:
56 |       print("Unknown command line option '%s'" % args[1], file=sys.stderr)
57 |       sys.exit(2)
58 | 
59 |   for line in sys.stdin:
60 |     line = line.strip()
61 |     word, count_string = line.split('\t', 1)
62 | 
63 |     try:
64 |       count = int(count_string)
65 |     except ValueError:
66 |       continue    # ignore lines that are not formatted correctly
67 | 
68 |     if word == current_word:
69 |       current_count += count
70 |     else:
71 |       next_word(word, count)
72 | 
73 |   next_word(None, 0)
74 | 
75 | if __name__ == '__main__':
76 |   main(sys.argv)
77 | 


--------------------------------------------------------------------------------
/single_node_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2013 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a single-node Hadoop cluster.
17 | # Usage: ./bdutil deploy -e single_node_env.sh
18 | 
19 | NUM_WORKERS=1
20 | 
21 | # A single-node setup is much more likely to be used for development, so install
22 | # JDK with compiler/tools instead of just the minimal JRE.
23 | INSTALL_JDK_DEVEL=true
24 | 
25 | # Save away the base evaluate_late_variable_bindings function so we can
26 | # override it.
27 | copy_func evaluate_late_variable_bindings old_evaluate_late_variable_bindings
28 | 
29 | function evaluate_late_variable_bindings() {
30 |   # Stash away the old value here so we can differentiate between whether the
31 |   # user overrides set it or we just resolved it in the base implementation
32 |   # of evaluate_late_variable_bindings.
33 |   local old_nfs_master_hostname="${GCS_CACHE_MASTER_HOSTNAME}"
34 | 
35 |   old_evaluate_late_variable_bindings
36 | 
37 |   # In the case of the single-node cluster, we'll just use the whole PREFIX
38 |   # as the name of the master and worker.
39 |   WORKERS[0]=${PREFIX}
40 |   MASTER_HOSTNAME=${PREFIX}
41 |   WORKER_ATTACHED_PDS[0]="${PREFIX}-pd"
42 |   MASTER_ATTACHED_PD="${PREFIX}-pd"
43 | 
44 |   # Fully qualified HDFS URI of namenode
45 |   NAMENODE_URI="hdfs://${MASTER_HOSTNAME}:8020/"
46 | 
47 |   # Host and port of jobtracker
48 |   JOB_TRACKER_URI="${MASTER_HOSTNAME}:9101"
49 | 
50 |   # GCS directory for deployment-related temporary files.
51 |   local staging_dir_base="gs://${CONFIGBUCKET}/bdutil-staging"
52 |   BDUTIL_GCS_STAGING_DIR="${staging_dir_base}/${MASTER_HOSTNAME}"
53 | 
54 |   # Default NFS cache host is the master node, but it can be overriden to point
55 |   # at an NFS server off-cluster.
56 |   if [[ -z "${old_nfs_master_hostname}" ]]; then
57 |     GCS_CACHE_MASTER_HOSTNAME="${MASTER_HOSTNAME}"
58 |   fi
59 | 
60 |   # Since $WORKERS and $MASTER_HOSTNAME both refer to the same single-node
61 |   # VM, we must override COMMAND_STEPS to prevent duplicating steps. We also
62 |   # omit deploy-ssh-worker-setup because there is no need to copy SSH keys to
63 |   # the localhost.
64 |   COMMAND_STEPS=(${COMMAND_STEPS[@]/,*/,*})
65 | }
66 | 


--------------------------------------------------------------------------------
/standalone_nfs_cache_env.sh:
--------------------------------------------------------------------------------
 1 | # Copyright 2015 Google Inc. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Handy wrapper around single_node_env.sh to turn up just a single server
16 | # capable of acting as the NFS-based GCS consistency cache for multiple
17 | # other clusters.
18 | #
19 | # Usage:
20 | #   ./bdutil -P my-nfs-server -p <project> -z <zone> -b <bucket> generate_config my-nfs-server_env.sh
21 | #   ./bdutil -e my-nfs-server_env.sh deploy
22 | #
23 | #   ./bdutil -P cluster1 -p <project> -z <zone> -b <bucket> generate_config cluster1_env.sh
24 | #   echo GCS_CACHE_MASTER_HOSTNAME=my-nfs-server >> cluster1_env.sh
25 | #   ./bdutil -e cluster1_env.sh deploy
26 | #
27 | #   ./bdutil -P cluster2 -p <project> -z <zone> -b <bucket> generate_config cluster2_env.sh
28 | #   echo GCS_CACHE_MASTER_HOSTNAME=my-nfs-server >> cluster2_env.sh
29 | #   ./bdutil -e cluster2_env.sh deploy
30 | #
31 | #  ./bdutil -e cluster2_env.sh delete
32 | #  ./bdutil -e cluster1_env.sh delete
33 | #  ./bdutil -e my-nfs-server_env.sh delete
34 | 
35 | # Start with single_node_env.sh to get all the MASTER_HOSTNAME, etc.,
36 | # resolution.
37 | import_env single_node_env.sh
38 | 
39 | # This server would be somewhat pointless without the GCS connector and the
40 | # NFS cache enabled.
41 | INSTALL_GCS_CONNECTOR=true
42 | DEFAULT_FS='gs'
43 | ENABLE_NFS_GCS_FILE_CACHE=true
44 | 
45 | # We'll set up Hadoop as normal since it'll be handy to have "hadoop fs -ls"
46 | # on the cache server, but we just won't configure the hadoop daemons to start
47 | # on boot, and won't start them explicitly during deployment. That means
48 | # no jobracker or resourcemanager or namenode, but we should still be able to
49 | # use "hadoop fs" against GCS just fine.
50 | COMMAND_GROUPS+=(
51 |   "deploy-standalone-nfs-cache:
52 |      libexec/install_java.sh
53 |      libexec/mount_disks.sh
54 |      libexec/setup_hadoop_user.sh
55 |      libexec/install_hadoop.sh
56 |      libexec/install_bdconfig.sh
57 |      libexec/configure_hadoop.sh
58 |      libexec/install_and_configure_gcs_connector.sh
59 |      libexec/configure_hdfs.sh
60 |      libexec/set_default_fs.sh
61 |      libexec/setup_master_nfs.sh
62 |   "
63 | )
64 | 
65 | COMMAND_STEPS=(
66 |   "deploy-standalone-nfs-cache,*"
67 |   "deploy-client-nfs-setup,*"
68 | )
69 | 


--------------------------------------------------------------------------------