├── .dockerignore
├── CHANGES.txt
├── CONTRIBUTING
├── Dockerfile
├── LICENSE
├── README.md
├── bdutil
├── bdutil_env.sh
├── bigquery_env.sh
├── conf
│   ├── hadoop1
│   │   ├── bq-mapred-template.xml
│   │   ├── core-template.xml
│   │   ├── gcs-core-template.xml
│   │   ├── hdfs-template.xml
│   │   ├── mapred-health-check.sh
│   │   └── mapred-template.xml
│   └── hadoop2
│       ├── bigtable-hbase-site-template.xml
│       ├── bq-mapred-template.xml
│       ├── capacity-scheduler-template.xml
│       ├── core-template.xml
│       ├── gcs-core-template.xml
│       ├── hdfs-template.xml
│       ├── mapred-template.xml
│       └── yarn-template.xml
├── docs
│   ├── JOBS.md
│   ├── MONITORING.md
│   ├── QUICKSTART.md
│   └── SHUTDOWN.md
├── extensions
│   ├── bigtable
│   │   ├── bigtable_env.sh
│   │   └── install_hbase_bigtable.sh
│   ├── flink
│   │   ├── README.md
│   │   ├── flink_env.sh
│   │   ├── install_flink.sh
│   │   └── start_flink.sh
│   ├── google
│   │   ├── experimental
│   │   │   └── resize_env.sh
│   │   └── gcs-validate-setup.sh
│   ├── hama
│   │   ├── README.md
│   │   ├── hama_env.sh
│   │   ├── install_hama.sh
│   │   └── start_hama.sh
│   ├── hbase
│   │   ├── README.md
│   │   ├── hbase_env.sh
│   │   ├── install_hbase.sh
│   │   └── start_hbase.sh
│   ├── querytools
│   │   ├── hive-validate-setup.sh
│   │   ├── pig-mapred-template.xml
│   │   ├── pig-validate-setup.sh
│   │   ├── prepare_files.sh
│   │   ├── querytools_env.sh
│   │   └── setup_profiles.sh
│   ├── spark
│   │   ├── install_shark.sh
│   │   ├── install_spark.sh
│   │   ├── spark-validate-setup.sh
│   │   ├── spark_configure_startup_processes.sh
│   │   ├── spark_env.sh
│   │   ├── spark_on_yarn_env.sh
│   │   ├── spark_shark_env.sh
│   │   ├── start_single_spark_worker.sh
│   │   └── start_spark.sh
│   ├── storm
│   │   ├── README.md
│   │   ├── install_storm.sh
│   │   ├── install_supervisor.sh
│   │   ├── install_zookeeper.sh
│   │   ├── jar.xml
│   │   ├── start_storm_master.sh
│   │   ├── start_storm_worker.sh
│   │   └── storm_env.sh
│   └── tajo
│       ├── README.md
│       ├── configure_tajo.sh
│       ├── install_tajo.sh
│       ├── start_tajo.sh
│       └── tajo_env.sh
├── hadoop-validate-setup.sh
├── hadoop2_env.sh
├── libexec
│   ├── bdutil_helpers.sh
│   ├── configure_hadoop.sh
│   ├── configure_hdfs.sh
│   ├── configure_mrv2_mem.py
│   ├── configure_startup_processes.sh
│   ├── hadoop_helpers.sh
│   ├── install_and_configure_bigquery_connector.sh
│   ├── install_and_configure_gcs_connector.sh
│   ├── install_bdconfig.sh
│   ├── install_hadoop.sh
│   ├── install_java.sh
│   ├── mount_disks.sh
│   ├── set_default_fs.sh
│   ├── setup_client_nfs.sh
│   ├── setup_hadoop_user.sh
│   ├── setup_master_nfs.sh
│   ├── setup_master_ssh.sh
│   ├── setup_worker_ssh.sh
│   ├── start_hadoop.sh
│   └── start_hadoop2.sh
├── platforms
│   ├── cdh
│   │   ├── README.md
│   │   ├── cdh-core-template.xml
│   │   ├── cdh_env.sh
│   │   ├── configure_cdh.sh
│   │   └── install_cdh.sh
│   ├── hdp
│   │   ├── README.md
│   │   ├── TEST.md
│   │   ├── ambari.conf
│   │   ├── ambari_env.sh
│   │   ├── ambari_functions.sh
│   │   ├── ambari_manual_env.sh
│   │   ├── ambari_manual_post_deploy_env.sh
│   │   ├── configuration.json
│   │   ├── create_blueprint.py
│   │   ├── install_ambari.sh
│   │   ├── install_ambari_components.sh
│   │   ├── install_gcs_connector_on_ambari.sh
│   │   ├── resources
│   │   │   ├── public-hostname-gcloud.sh
│   │   │   └── thp-disable.sh
│   │   └── update_ambari_config.sh
│   ├── mapr
│   │   ├── README.md
│   │   ├── configure_mapr_instance.sh
│   │   ├── mapr_env.sh
│   │   ├── mapr_license.txt
│   │   ├── node.lst
│   │   └── prepare_mapr_image.sh
│   └── restart_services.sh
├── sampleapps
│   └── querytools
│       ├── COPYING
│       ├── README.md
│       ├── conf
│       │   └── hive
│       │       └── hive-site.xml
│       ├── examples
│       │   └── ngrams
│       │       ├── hive_query_ngrams.q
│       │       ├── hive_table_create.sh
│       │       ├── ngram_hdfs_load.sh
│       │       ├── ngram_setup.sh
│       │       └── pig_query_ngrams.pig
│       ├── project_properties.sh
│       └── scripts
│           ├── common_utils.sh
│           ├── install-packages-on-master__at__host.sh
│           ├── package_utils.sh
│           ├── packages-delete-from-gcs__at__host.sh
│           ├── packages-to-gcs__at__host.sh
│           ├── setup-hdfs-for-hdtools__at__master.sh
│           ├── setup-packages__at__master.sh
│           └── setup-ssh-keys__at__master.sh
├── samples
│   ├── bigquery_wordcount.jar
│   ├── streaming_word_count.sh
│   ├── test-mr-bigquery.sh
│   ├── word_count_mapper.py
│   └── word_count_reducer.py
├── single_node_env.sh
└── standalone_nfs_cache_env.sh
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | .gitignore
3 |
4 | *.swp
5 | */*.swp
6 | */*/*.swp
7 | */*/*/*.swp
8 | */*/*/*/*.swp
9 |
--------------------------------------------------------------------------------
/CONTRIBUTING:
--------------------------------------------------------------------------------
1 | Want to contribute? Great! First, read this page (including the small print at the end).
2 |
3 | ### Before you contribute
4 | Before we can use your code, you must sign the
5 | [Google Individual Contributor License Agreement](https://developers.google.com/open-source/cla/individual?csw=1)
6 | (CLA), which you can do online. The CLA is necessary mainly because you own the
7 | copyright to your changes, even after your contribution becomes part of our
8 | codebase, so we need your permission to use and distribute your code. We also
9 | need to be sure of various other things—for instance that you'll tell us if you
10 | know that your code infringes on other people's patents. You don't have to sign
11 | the CLA until after you've submitted your code for review and a member has
12 | approved it, but you must do it before we can put your code into our codebase.
13 | Before you start working on a larger contribution, you should get in touch with
14 | us first through the issue tracker with your idea so that we can help out and
15 | possibly guide you. Coordinating up front makes it much easier to avoid
16 | frustration later on.
17 |
18 | ### Code reviews
19 | All submissions, including submissions by project members, require review. We
20 | use GitHub pull requests for this purpose.
21 |
22 | ### The small print
23 | Contributions made by corporations are covered by a different agreement than
24 | the one above, the Software Grant and Corporate Contributor License Agreement.
25 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM google/cloud-sdk
2 |
3 | ADD . /bdutil/
4 |
5 | ENTRYPOINT ["/bdutil/bdutil"]
6 | CMD ["--help"]
7 |
--------------------------------------------------------------------------------
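The Dockerfile above packages bdutil on top of the `google/cloud-sdk` image, with `bdutil` as the entrypoint and `--help` as the default command. A rough, illustrative sketch of using it follows; the image tag `bdutil` is an arbitrary local name, and gcloud credentials still have to be supplied to the container, for example by mounting your local gcloud config:

    # Build the image from the repository root.
    docker build -t bdutil .

    # With no arguments, the default CMD prints bdutil's help.
    docker run --rm bdutil

    # Run a real bdutil command, reusing the host's gcloud credentials.
    docker run --rm -v "$HOME/.config/gcloud:/root/.config/gcloud" bdutil deploy
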
/README.md:
--------------------------------------------------------------------------------
1 | # This project has been deprecated. Please use [Google Cloud Dataproc](https://cloud.google.com/dataproc) to create managed Apache Hadoop and Apache Spark instances on [Google Compute Engine](https://cloud.google.com/compute).
2 |
3 | # bdutil
4 |
5 | bdutil is a command-line script used to manage Apache Hadoop and Apache Spark instances on [Google Compute Engine](https://cloud.google.com/compute). bdutil manages deployment, configuration, and shutdown of your Hadoop instances.
6 |
7 | ## Requirements
8 |
9 | bdutil depends on the [Google Cloud SDK](https://cloud.google.com/sdk). bdutil is supported in any POSIX-compliant environment with Bash v3 or greater.
10 |
11 | ## Usage
12 |
13 | See the [QUICKSTART](/docs/QUICKSTART.md) file in the `docs` directory to learn how to set up your Hadoop instances using bdutil.
14 |
15 | 1. Install and configure the [Google Cloud SDK](https://cloud.google.com/sdk) if you have not already done so
16 | 1. Clone this repository with `git clone https://github.com/GoogleCloudPlatform/bdutil.git`
17 | 1. Modify the following variables in the bdutil_env.sh file:
18 | 1. `PROJECT` - Set to the project ID used for all bdutil commands. The project value is resolved in the following order of precedence (each source overrides the ones below it):
19 | * the `-p` flag value, or if not specified then
20 | * the `PROJECT` value in bdutil_env.sh, or if not specified then
21 | * the gcloud default project value
22 | 1. `CONFIGBUCKET` - Set to a Google Cloud Storage bucket that your project has read/write access to.
23 | 1. Run `bdutil --help` for a list of commands.
24 |
25 | The script implements the following closely related commands:
26 |
27 | * `bdutil create` creates and starts instances, but will not apply most configuration settings. You can call `bdutil run_command_steps` on instances afterward to apply configuration settings to them. Typically you wouldn't use this, but would use `bdutil deploy` instead.
28 | * `bdutil deploy` creates and starts instances with all the configuration options specified in the command line and any included configuration scripts.
29 |
30 | ## Components installed
31 |
32 | The latest release of bdutil is `1.3.5`. This bdutil release installs the following versions of open source components:
33 |
34 | * Apache Hadoop - 1.2.1 (2.7.1 if you use the `-e` argument)
35 | * Apache Spark - 1.5.0
36 | * Apache Pig - 0.12
37 | * Apache Hive - 1.2.1
38 |
39 | ## Documentation
40 |
41 | The following documentation is useful for bdutil.
42 |
43 | * **[Quickstart](/docs/QUICKSTART.md)** - A guide on how to get started with bdutil quickly.
44 | * **[Jobs](/docs/JOBS.md)** - How to submit jobs (work) to a bdutil cluster.
45 | * **[Monitoring](/docs/MONITORING.md)** - How to monitor a bdutil cluster.
46 | * **[Shutdown](/docs/SHUTDOWN.md)** - How to shut down a bdutil cluster.
47 |
--------------------------------------------------------------------------------
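To make the usage steps in the README above concrete, a minimal end-to-end session might look like the following; the bucket name is a placeholder, and `PROJECT`/`CONFIGBUCKET` are assumed to have been set in `bdutil_env.sh` as described in step 3:

    # One-time setup: create a staging bucket ('my-bdutil-bucket' is a placeholder name).
    gsutil mb gs://my-bdutil-bucket

    # Deploy a cluster, open a shell on its master, then tear it down.
    ./bdutil deploy
    ./bdutil shell
    ./bdutil delete
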
/bigquery_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a bigquery-enabled Hadoop cluster.
17 | # Usage: ./bdutil deploy bigquery_env.sh
18 |
19 | GCE_SERVICE_ACCOUNT_SCOPES+=('bigquery')
20 |
21 | # Whether or not to install and configure the BigQuery connector.
22 | INSTALL_BIGQUERY_CONNECTOR=true
23 |
24 |
--------------------------------------------------------------------------------
/conf/hadoop1/bq-mapred-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | mapred.bq.project.id
6 |
7 |
8 | Google Cloud Project ID to use for BigQuery operations.
9 |
10 |
11 |
12 | mapred.bq.gcs.bucket
13 |
14 |
15 | The GCS bucket holding temporary BigQuery data for the input connector.
16 |
17 |
18 |
19 | mapred.bq.output.buffer.size
20 | 67108864
21 |
22 | The size in bytes of the output buffer to use when writing to BigQuery.
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
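Note that the XML markup of the template files under `conf/` appears to have been stripped in this listing, leaving only property names, values, and descriptions. In the repository these are ordinary Hadoop configuration files; the first entry of the template above would normally take roughly this shape, with empty values presumably filled in by bdutil at deploy time:

    <configuration>
      <property>
        <name>mapred.bq.project.id</name>
        <value></value>
        <description>
          Google Cloud Project ID to use for BigQuery operations.
        </description>
      </property>
      <!-- remaining properties follow the same pattern -->
    </configuration>
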
/conf/hadoop1/core-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | hadoop.tmp.dir
6 |
7 | A base for other temporary directories.
8 |
9 |
10 |
--------------------------------------------------------------------------------
/conf/hadoop1/gcs-core-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | fs.gs.project.id
6 |
7 |
8 | Google Cloud Project ID with access to configured GCS buckets.
9 |
10 |
11 |
12 | fs.gs.system.bucket
13 |
14 |
15 | GCS bucket to use as a default bucket if fs.default.name is not a gs: uri.
16 |
17 |
18 |
19 | fs.gs.working.dir
20 | /
21 |
22 | The directory, inside the default bucket, against which relative gs: uris are resolved.
23 |
24 |
25 |
26 | fs.gs.impl
27 | com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem
28 | The FileSystem for gs: (GCS) uris.
29 |
30 |
31 | fs.gs.metadata.cache.enable
32 | true
33 |
34 | If true, a DirectoryListCache will be used to supplement "list" requests
35 | to GCS to fill in any missing items caused by eventual list consistency,
36 | intercepting create/delete/copy calls to create cache entries. The
37 | concrete type is determined with fs.gs.metadata.cache.type.
38 |
39 |
40 |
41 | fs.gs.metadata.cache.type
42 |
43 |
44 | Specifies which implementation of DirectoryListCache to use for
45 | supplementing GCS API "list" requests. Supported implementations:
46 | IN_MEMORY: Enforces immediate consistency within same Java process.
47 | FILESYSTEM_BACKED: Enforces consistency across all cooperating processes
48 | pointed at the same local mirror directory, which may be an NFS directory
49 | for massively-distributed coordination.
50 |
51 |
52 |
53 | fs.gs.metadata.cache.directory
54 |
55 |
56 | Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies
57 | the local path to use as the base path for storing mirrored GCS metadata.
58 | Must be an absolute path, must be a directory, and must be fully
59 | readable/writable/executable by any user running processes which use the
60 | GCS connector.
61 |
62 |
63 |
64 |
--------------------------------------------------------------------------------
/conf/hadoop1/hdfs-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | dfs.namenode.rpc-address
6 |
7 |
8 | RPC address that handles all client requests. If empty then we'll get
9 | the value from fs.default.name. The value of this property will take
10 | the form of hdfs://nn-host1:rpc-port.
11 |
12 |
13 |
14 | dfs.name.dir
15 |
16 |
17 | Determines where on the local filesystem the DFS namenode should store the
18 | name table (fsimage). If this is a comma-delimited list of directories then
19 | the name table is replicated in all of the directories, for redundancy.
20 |
21 |
22 |
23 | dfs.data.dir
24 |
25 |
26 | Determines where on the local filesystem a DFS datanode should store its
27 | blocks. If this is a comma-delimited list of directories, then data will
28 | be stored in all named directories, typically on different
29 | devices. Directories that do not exist are ignored.
30 |
31 |
32 |
33 | dfs.datanode.data.dir.perm
34 |
35 |
36 | Permissions for the directories on the local filesystem where the DFS
37 | data node stores its blocks. The permissions can either be octal or
38 | symbolic.
39 |
40 |
41 |
42 | dfs.permissions
43 |
44 |
45 | If "true", enable permission checking in HDFS. If "false", permission
46 | checking is turned off, but all other behavior is unchanged. Switching
47 | from one parameter value to the other does not change the mode, owner or
48 | group of files or directories.
49 |
50 |
51 |
52 | dfs.replication
53 | 2
54 |
55 | Default block replication. The actual number of replications can be
56 | specified when the file is created. The default is used if replication
57 | is not specified at create time.
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
/conf/hadoop1/mapred-health-check.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Licensed to the Apache Software Foundation (ASF) under one or more
4 | # contributor license agreements. See the NOTICE file distributed with
5 | # this work for additional information regarding copyright ownership.
6 | # The ASF licenses this file to You under the Apache License, Version 2.0
7 | # (the "License"); you may not use this file except in compliance with
8 | # the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 |
18 |
19 | # Check to see if the TaskTracker is healthy by checking its HTTP address.
20 | # Necessary to avoid [MAPREDUCE-4668].
21 |
22 | # Redirect stderr to stdout.
23 | # Necessary to see problems with health check script in log.
24 | # Will only show stdout if ERROR is present at the beginning of a line.
25 | exec 2>&1
26 |
27 | BIN=$(dirname "$0")
28 | BIN=$(cd "${BIN}"; pwd)
29 | HADOOP_CMD="${BIN}/hadoop"
30 |
31 | TASK_TRACKER_HTTP_ADDRESS=$(${HADOOP_CMD} jobtracker -dumpConfiguration 2>/dev/null \
32 | | sed -n 's/.*task\.tracker\.http\.address","value":"\([.:0-9]*\)".*/\1/p')
33 |
34 | if [[ -n "${TASK_TRACKER_HTTP_ADDRESS}" ]]; then
35 | curl -sm 10 -o /dev/null ${TASK_TRACKER_HTTP_ADDRESS}
36 | ERROR_CODE=$?
37 | if (( ${ERROR_CODE} == 28 )); then
38 | echo "ERROR curl timed out trying to reach the TaskTracker web server." \
39 | "Assuming the TaskTracker is unhealthy."
40 | elif (( ${ERROR_CODE} )); then
41 | echo "WARN curl failed to reach the TaskTracker, but did not time out."
42 | else
43 | echo "DEBUG Successfully curled TaskTracker."
44 | fi
45 | else
46 | echo "WARN Failed to determine TaskTracker http address." \
47 | "Not checking health."
48 | fi
49 |
50 | # TaskTracker disregards ERRORs with non-zero exit code.
51 | exit 0
52 |
--------------------------------------------------------------------------------
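The health-check script above can also be run by hand on a worker to see what the TaskTracker would log; it assumes it is installed next to the `hadoop` binary, so the path below (taken from the install layout used elsewhere in this repository) is only indicative:

    # Run the TaskTracker health check manually and inspect its output.
    sudo -u hadoop /home/hadoop/hadoop-install/bin/mapred-health-check.sh
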
/conf/hadoop1/mapred-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | mapred.job.tracker
6 |
7 |
8 | The host and port that the MapReduce job tracker runs at. If "local",
9 | then jobs are run in-process as a single map and reduce task.
10 |
11 |
12 |
13 | mapred.map.tasks
14 |
15 |
16 | The default number of map tasks per job. Ignored when mapred.job.tracker is
17 | "local".
18 |
19 |
20 |
21 | mapred.reduce.tasks
22 |
23 |
24 | The default number of reduce tasks per job. Typically set to 99% of the
25 | cluster's reduce capacity, so that if a node fails the reduces can still
26 | be executed in a single wave. Ignored when mapred.job.tracker is
27 | "local".
28 |
29 |
30 |
31 | mapred.tasktracker.map.tasks.maximum
32 |
33 |
34 | The maximum number of map tasks that will be run simultaneously by a task
35 | tracker.
36 |
37 |
38 |
39 | mapred.tasktracker.reduce.tasks.maximum
40 |
41 |
42 | The maximum number of reduce tasks that will be run simultaneously by a
43 | task tracker.
44 |
45 |
46 |
47 | mapred.child.java.opts
48 |
49 |
50 | Java opts for the task tracker child processes. The following symbol, if
51 | present, will be interpolated: @taskid@ is replaced by the current TaskID.
52 | Any other occurrences of '@' will go unchanged. For example, to enable
53 | verbose gc logging to a file named for the taskid in /tmp and to set the
54 | heap maximum to a gigabyte, pass a 'value' of: -Xmx1024m -verbose:gc
55 | -Xloggc:/tmp/@taskid@.gc The configuration variable mapred.child.ulimit
56 | can be used to control the maximum virtual memory of the child processes.
57 |
58 |
59 |
60 | mapred.jobtracker.restart.recover
61 | true
62 |
63 | Whether or not to enable (job) recovery upon restart.
64 |
65 |
66 |
67 | mapreduce.jobtracker.expire.trackers.interval
68 | 60000
69 |
70 | The time interval, in milliseconds, after which a tasktracker is
71 | declared 'lost' if it doesn't send heartbeats. The Hadoop
72 | distribution default is 600000 (10 minutes); we set this to
73 | 60000 (1 minute) to quickly reassign work.
74 |
75 |
76 |
77 | mapred.local.dir
78 |
79 |
80 | Directories on the local machine in which to store mapreduce temp files.
81 |
82 |
83 |
84 |
--------------------------------------------------------------------------------
/conf/hadoop2/bigtable-hbase-site-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | hbase.client.connection.impl
7 |
8 |
9 |
10 | google.bigtable.endpoint.host
11 |
12 |
13 |
14 | google.bigtable.admin.endpoint.host
15 |
16 |
17 |
18 | google.bigtable.project.id
19 |
20 |
21 |
22 | google.bigtable.zone.name
23 |
24 |
25 |
26 | google.bigtable.cluster.name
27 |
28 |
29 |
30 | yarn.app.mapreduce.am.command-opts
31 |
32 |
33 |
34 | mapreduce.map.java.opts
35 |
36 |
37 |
38 | mapreduce.reduce.java.opts
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/conf/hadoop2/bq-mapred-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | mapred.bq.project.id
6 |
7 |
8 | Google Cloud Project ID to use for BigQuery operations.
9 |
10 |
11 |
12 | mapred.bq.gcs.bucket
13 |
14 |
15 | The GCS bucket holding temporary BigQuery data for the input connector.
16 |
17 |
18 |
19 | mapred.bq.output.buffer.size
20 | 67108864
21 |
22 | The size in bytes of the output buffer to use when writing to BigQuery.
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/conf/hadoop2/core-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | hadoop.tmp.dir
6 |
7 | A base for other temporary directories.
8 |
9 |
10 | fs.defaultFS
11 | file:///
12 |
13 | The name of the default file system. A URI whose scheme and authority
14 | determine the FileSystem implementation. The uri's scheme determines
15 | the config property (fs.SCHEME.impl) naming the FileSystem
16 | implementation class. The uri's authority is used to determine the
17 | host, port, etc. for a filesystem.
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/conf/hadoop2/gcs-core-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | fs.gs.project.id
6 |
7 |
8 | Google Cloud Project ID with access to configured GCS buckets.
9 |
10 |
11 |
12 | fs.gs.system.bucket
13 |
14 |
15 | GCS bucket to use as a default bucket if fs.default.name is not a gs: uri.
16 |
17 |
18 |
19 | fs.gs.working.dir
20 | /
21 |
22 | The directory, inside the default bucket, against which relative gs: uris are resolved.
23 |
24 |
25 |
26 | fs.gs.impl
27 | com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem
28 | The FileSystem for gs: (GCS) uris.
29 |
30 |
31 | fs.AbstractFileSystem.gs.impl
32 | com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS
33 | The AbstractFileSystem for gs: (GCS) uris.
34 |
35 |
36 | fs.gs.metadata.cache.enable
37 | true
38 |
39 | If true, a DirectoryListCache will be used to supplement "list" requests
40 | to GCS to fill in any missing items caused by eventual list consistency,
41 | intercepting create/delete/copy calls to create cache entries. The
42 | concrete type is determined with fs.gs.metadata.cache.type.
43 |
44 |
45 |
46 | fs.gs.metadata.cache.type
47 |
48 |
49 | Specifies which implementation of DirectoryListCache to use for
50 | supplementing GCS API "list" requests. Supported implementations:
51 | IN_MEMORY: Enforces immediate consistency within same Java process.
52 | FILESYSTEM_BACKED: Enforces consistency across all cooperating processes
53 | pointed at the same local mirror directory, which may be an NFS directory
54 | for massively-distributed coordination.
55 |
56 |
57 |
58 | fs.gs.metadata.cache.directory
59 |
60 |
61 | Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies
62 | the local path to use as the base path for storing mirrored GCS metadata.
63 | Must be an absolute path, must be a directory, and must be fully
64 | readable/writable/executable by any user running processes which use the
65 | GCS connector.
66 |
67 |
68 |
69 |
--------------------------------------------------------------------------------
/conf/hadoop2/hdfs-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | dfs.namenode.secondary.http-address
6 | :50090
7 |
8 | The secondary namenode http server address and port.
9 |
10 |
11 |
12 | dfs.namenode.rpc-address
13 | :8020
14 |
15 | RPC address that handles all client requests. If empty then we'll get
16 | the value from fs.default.name. The value of this property will take
17 | the form of hdfs://nn-host1:rpc-port.
18 |
19 |
20 |
21 | dfs.namenode.name.dir
22 |
23 |
24 | Determines where on the local filesystem the DFS namenode should store the
25 | name table (fsimage). If this is a comma-delimited list of directories then
26 | the name table is replicated in all of the directories, for redundancy.
27 |
28 |
29 |
30 | dfs.datanode.data.dir
31 |
32 |
33 | Determines where on the local filesystem a DFS datanode should store its
34 | blocks. If this is a comma-delimited list of directories, then data will
35 | be stored in all named directories, typically on different
36 | devices. Directories that do not exist are ignored.
37 |
38 |
39 |
40 | dfs.datanode.data.dir.perm
41 |
42 |
43 | Permissions for the directories on the local filesystem where the DFS
44 | data node stores its blocks. The permissions can either be octal or
45 | symbolic.
46 |
47 |
48 |
49 | dfs.permissions.enabled
50 |
51 |
52 | If "true", enable permission checking in HDFS. If "false", permission
53 | checking is turned off, but all other behavior is unchanged. Switching
54 | from one parameter value to the other does not change the mode, owner or
55 | group of files or directories.
56 |
57 |
58 |
59 | dfs.permissions.supergroup
60 | hadoop
61 |
62 | The name of the group of super-users.
63 |
64 |
65 |
66 | dfs.replication
67 | 2
68 |
69 | Default block replication. The actual number of replications can be
70 | specified when the file is created. The default is used if replication
71 | is not specified at create time.
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/conf/hadoop2/yarn-template.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | yarn.resourcemanager.hostname
6 |
7 |
8 |
9 | yarn.nodemanager.aux-services
10 | mapreduce_shuffle
11 |
12 |
13 | yarn.nodemanager.resource.memory-mb
14 |
15 |
16 | Amount of physical memory, in MB, that can be allocated for containers.
17 |
18 |
19 |
20 | yarn.scheduler.maximum-allocation-mb
21 |
22 |
23 | The maximum allocation for every container request at the RM, in MBs.
24 | Memory requests higher than this won't take effect, and will get capped
25 | to this value.
26 |
27 |
28 |
29 | yarn.scheduler.minimum-allocation-mb
30 |
31 |
32 | The minimum allocation for every container request at the RM, in MBs.
33 | Memory requests lower than this won't take effect, and the specified
34 | value will get allocated at minimum.
35 |
36 |
37 |
38 | yarn.nodemanager.resource.cpu-vcores
39 |
40 |
41 | Number of vcores that can be allocated for containers. This is used by
42 | the RM scheduler when allocating resources for containers. This is not
43 | used to limit the number of physical cores used by YARN containers.
44 |
45 |
46 |
47 | yarn.log-aggregation-enable
48 | false
49 |
50 | Enable remote logs aggregation to the default FS.
51 |
52 |
53 |
54 | yarn.nodemanager.remote-app-log-dir
55 | /yarn-logs/
56 |
57 | The remote path, on the default FS, to store logs.
58 |
59 |
60 |
61 | yarn.resourcemanager.recovery.enabled
62 | true
63 |
64 | Enable RM to recover state after starting.
65 |
66 |
67 |
68 | yarn.resourcemanager.fs.state-store.uri
69 | file:///hadoop/yarn/system/rmstore
70 |
71 | URI pointing to the location of the FileSystem path where RM state will
72 | be stored. This is set on the local file system to avoid collisions in
73 | GCS.
74 |
75 |
76 |
77 | yarn.nodemanager.local-dirs
78 |
79 |
80 | Directories on the local machine in which to store application temp files.
81 |
82 |
83 |
84 |
--------------------------------------------------------------------------------
/docs/JOBS.md:
--------------------------------------------------------------------------------
1 | # Jobs
2 |
3 | Once you have [created a cluster](QUICKSTART.md) you can submit "jobs" (work) to it. These can be entirely new jobs, or jobs you port from an existing environment.
4 |
5 | ## Writing Jobs
6 |
7 | To learn about how to write Hadoop jobs from the ground up, see the [Apache Hadoop tutorials](https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html).
8 |
9 | Google Cloud Platform offers input/output data connectors for your Hadoop and Spark jobs:
10 |
11 | * [Google BigQuery Connector for Hadoop](https://github.com/GoogleCloudPlatform/bigdata-interop)
12 | * [Google Cloud Storage Connector for Hadoop](https://github.com/GoogleCloudPlatform/bigdata-interop)
13 |
14 | ## Porting existing jobs
15 |
16 | When porting a job from HDFS using the Cloud Storage connector for Hadoop, be sure to use the correct file path syntax (`gs://`).
17 | Also note that `FileSystem.append` is unsupported. If you choose Cloud Storage as your default file system, update your MapReduce jobs, if necessary, to avoid using the append method.
18 |
19 | ## Running jobs
20 |
21 | Once you've set up a Hadoop cluster and have written or ported a job, you can run the job using the following steps.
22 |
23 | ### Validating your setup and data
24 |
25 | First, validate that your cluster is set up, and that you can access your data. Navigate to the command line to execute the following commands.
26 |
27 | Type `./bdutil shell` to SSH into the master node of the Hadoop cluster.
28 | Type `hadoop fs -ls /` to check the cluster status. If the command prints a directory listing, the cluster is set up correctly.
29 |
30 | ### Running the job
31 |
32 | Next, run the job from the command line, while you are still connected to the cluster via SSH. Always run jobs as the `hadoop` user to avoid having to type full Hadoop paths in commands.
33 |
34 | The following example runs a sample job called WordCount. Hadoop installations include this sample in the `/home/hadoop/hadoop-install/hadoop-examples-*.jar` file.
35 |
36 | To run the WordCount job:
37 |
38 | 1. Navigate to the command line.
39 | 1. Type `./bdutil shell` to SSH into the master node of the Hadoop cluster.
40 | 1. Type `hadoop fs -mkdir input` to create the `input` directory.
41 | Note that when using Google Cloud Storage as your [default file system](QUICKSTART.md), input automatically resolves to `gs://$/input`.
42 | 1. Copy any file from the web, such as the following example text from Apache, by typing the following command: `curl http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/ClusterSetup.html > setup.html`.
43 | 1. Copy one or more text files into the `input` directory. Using the same Apache text in the previous step, type the following command: `hadoop fs -copyFromLocal setup.html input`.
44 | 1. Type `cd /home/hadoop/hadoop-install/` to navigate to the Hadoop install directory.
45 | 1. Type `hadoop jar hadoop-examples-*.jar wordcount input output` to run the job on data in the input directory, and place results in the output directory.
46 |
47 | ### Checking job status
48 |
49 | To check the status of of the Hadoop job, visit the [JobTracker page](http://wiki.apache.org/hadoop/JobTracker). See the [monitoring jobs](MONITORING.md) page for instructions on how to access the JobTracker.
50 |
51 | ### Cleanup
52 |
53 | After completing the job, make sure to [shut down the Hadoop cluster](SHUTDOWN.md) for the most cost effective solution.
54 |
--------------------------------------------------------------------------------
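For convenience, the WordCount walkthrough in JOBS.md above condenses to the following session; the curl'd page is just sample input, and any text file works:

    # From your workstation, open a shell on the master node.
    ./bdutil shell

    # On the master: stage some input, then run the bundled WordCount example.
    hadoop fs -mkdir input
    curl http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/ClusterSetup.html > setup.html
    hadoop fs -copyFromLocal setup.html input
    cd /home/hadoop/hadoop-install/
    hadoop jar hadoop-examples-*.jar wordcount input output

    # Inspect the results.
    hadoop fs -ls output
    hadoop fs -cat output/part-* | head
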
/docs/SHUTDOWN.md:
--------------------------------------------------------------------------------
1 | # Shutting Down a Hadoop Cluster
2 |
3 | Because [Google Compute Engine](https://cloud.google.com/compute/) charges on a [per-minute basis](https://cloud.google.com/compute/pricing), it can be cost effective to shut down your Hadoop cluster once a workload completes. Once the Hadoop cluster is shut down, your data's accessibility depends on the [default file system](QUICKSTART.md) you've chosen:
4 |
5 | * When using HDFS, data is inaccessible.
6 | * When using [Google Cloud Storage](https://cloud.google.com/storage/), data is accessible with [gsutil](https://cloud.google.com/storage/docs/gsutil) or the [Google Cloud Platform Console](https://console.cloud.google.com/?_ga=1.81149463.169096153.1475769191).
7 |
8 | **When you delete (shut down) a cluster, the operation is irreversible.**
9 |
10 | ## Issuing the delete command
11 |
12 | To shut down the Hadoop cluster, use the same bdutil script you used for setup. On the command line, type `./bdutil delete` in the `bdutil-` directory to shut down the cluster.
13 |
14 | Here is an example of the command being run.
15 |
16 | ~/bdutil-0.35.1$ ./bdutil delete
17 | Wed Aug 13 16:03:15 PDT 2014: Using local tmp dir for staging files: /tmp/bdutil-20140813-160315
18 | Wed Aug 13 16:03:15 PDT 2014: Using custom environment-variable file(s): ./bdutil_env.sh
19 | Wed Aug 13 16:03:15 PDT 2014: Reading environment-variable file: ./bdutil_env.sh
20 | Delete cluster with following settings?
21 | CONFIGBUCKET=''
22 | PROJECT=''
23 | GCE_IMAGE='backports-debian-7'
24 | GCE_ZONE='us-central1-b'
25 | GCE_NETWORK='default'
26 | PREFIX='hadoop'
27 | NUM_WORKERS=2
28 | MASTER_HOSTNAME='hadoop-m'
29 | WORKERS='hadoop-w-0 hadoop-w-1'
30 | BDUTIL_GCS_STAGING_DIR='gs:///bdutil-staging/hadoop-m'
31 | (y/n) y
32 | Wed Aug 13 16:03:16 PDT 2014: Deleting hadoop cluster...
33 | ...Wed Aug 13 16:03:17 PDT 2014: Waiting on async 'deleteinstance' jobs to finish. Might take a while...
34 | ...
35 | Wed Aug 13 16:04:11 PDT 2014: Done deleting VMs!
36 | Wed Aug 13 16:04:11 PDT 2014: Execution complete. Cleaning up temporary files...
37 | Wed Aug 13 16:04:11 PDT 2014: Cleanup complete.
38 |
39 | ## Verifying all resources have been removed
40 |
41 | You **must** use the same bdutil configuration arguments for cluster creation and deletion. Altering the arguments might result in errors when shutting down the cluster. After the script executes, you can type `gcloud compute instances list --project= | grep ` and verify that no instances are still running. Similarly, you can type `gcloud compute disks list --project= | grep ` and verify that no created disks accidentally survived.
42 |
--------------------------------------------------------------------------------
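As an illustration of the verification step above, with a hypothetical project ID `my-project` and the default instance prefix `hadoop`, the post-delete checks would look like:

    # Confirm no cluster VMs survived './bdutil delete' ('my-project' is a placeholder).
    gcloud compute instances list --project=my-project | grep hadoop

    # Confirm no attached persistent disks were left behind.
    gcloud compute disks list --project=my-project | grep hadoop
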
/extensions/bigtable/bigtable_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with HBase installed
17 | # and configured to use Cloud Bigtable.
18 | # Usage: ./bdutil deploy -e extensions/bigtable/bigtable_env.sh.
19 |
20 | # Directory on each VM in which to install hbase.
21 | HBASE_INSTALL_DIR=/home/hadoop/hbase-install
22 | HBASE_CONF_DIR=${HBASE_INSTALL_DIR}/conf/
23 | BIGTABLE_ENDPOINT=bigtable.googleapis.com
24 | BIGTABLE_ADMIN_ENDPOINT=bigtabletableadmin.googleapis.com
25 |
26 | BIGTABLE_ZONE=us-central1-b
27 | BIGTABLE_CLUSTER=cluster
28 |
29 | COMMAND_GROUPS+=(
30 | "install_bigtable:
31 | extensions/bigtable/install_hbase_bigtable.sh
32 | "
33 | )
34 |
35 | # Installation of bigtable on master and workers
36 | COMMAND_STEPS+=(
37 | 'install_bigtable,install_bigtable'
38 | )
39 |
40 | ALPN_VERSION=7.1.3.v20150130
41 | ALPN_REMOTE_JAR=http://central.maven.org/maven2/org/mortbay/jetty/alpn/alpn-boot/${ALPN_VERSION}/alpn-boot-${ALPN_VERSION}.jar
42 | BIGTABLE_HBASE_JAR=https://storage.googleapis.com/cloud-bigtable/jars/bigtable-hbase/bigtable-hbase-mapreduce-0.2.2-shaded.jar
43 | BIGTABLE_CONNECTION=com.google.cloud.bigtable.hbase1_1.BigtableConnection
44 |
45 | # Copied from http://www.us.apache.org/dist/hbase/stable/
46 | # We don't want to overload the apache servers.
47 | HBASE_TARBALL_URI=https://storage.googleapis.com/cloud-bigtable/hbase-dist/hbase-1.1.2/hbase-1.1.2-bin.tar.gz
48 |
49 | BIGTABLE_LIB_DIR=${HBASE_INSTALL_DIR}/lib/bigtable
50 | ALPN_CLASSPATH=${BIGTABLE_LIB_DIR}/alpn-boot-${ALPN_VERSION}.jar
51 | BIGTABLE_BOOT_OPTS="-Xms1024m -Xmx2048m -Xbootclasspath/p:${ALPN_CLASSPATH}"
52 |
53 | # TODO: JAVAOPTS gets used in mapred-template.xml. There should probably be a better way to do this.
54 | JAVAOPTS="$JAVAOPTS -Xbootclasspath/p:$BIGTABLE_BOOT_OPTS"
55 |
56 | GCE_SERVICE_ACCOUNT_SCOPES+=(
57 | 'https://www.googleapis.com/auth/cloud-bigtable.admin'
58 | 'https://www.googleapis.com/auth/cloud-bigtable.data'
59 | 'https://www.googleapis.com/auth/cloud-bigtable.data.readonly'
60 | )
61 |
--------------------------------------------------------------------------------
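Following the usage note in bigtable_env.sh above, a minimal session after deployment might look like this; the HBase path comes from `HBASE_INSTALL_DIR`, and the Bigtable zone/cluster variables are assumed to have been edited to match your own cluster:

    # Deploy a cluster with the Bigtable-backed HBase client installed.
    ./bdutil deploy -e extensions/bigtable/bigtable_env.sh

    # Open a shell on the master and start the HBase shell (path per HBASE_INSTALL_DIR).
    ./bdutil shell
    /home/hadoop/hbase-install/bin/hbase shell
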
/extensions/flink/README.md:
--------------------------------------------------------------------------------
1 | Deploying Flink on Google Compute Engine
2 | ========================================
3 |
4 | Set up a bucket
5 | ----------------
6 |
7 | If you have not done so, create a bucket for the bdutil config and
8 | staging files. A new bucket can be created with gsutil:
9 |
10 | gsutil mb gs://
11 |
12 |
13 | Adapt the bdutil config
14 | -----------------------
15 |
16 | To deploy Flink with bdutil, adapt at least the following variables in
17 | bdutil_env.sh.
18 |
19 | CONFIGBUCKET=""
20 | PROJECT=""
21 | NUM_WORKERS=
22 |
23 |
24 | Bring up a cluster with Flink
25 | -----------------------------
26 |
27 | To bring up the Flink cluster on Google Compute Engine, execute:
28 |
29 | ./bdutil -e extensions/flink/flink_env.sh deploy
30 |
31 | To run a Flink example job:
32 |
33 | ./bdutil shell
34 | curl http://www.gutenberg.org/cache/epub/2265/pg2265.txt > text
35 | gsutil cp text gs:///text
36 | cd /home/hadoop/flink-install/bin
37 | ./flink run ../examples/flink-java-examples-*-WordCount.jar gs:///text gs:///output
--------------------------------------------------------------------------------
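The bucket name appears to have been elided from the example paths above; with a placeholder bucket name the same session reads:

    # 'my-bdutil-bucket' below is a placeholder bucket name.
    ./bdutil shell
    curl http://www.gutenberg.org/cache/epub/2265/pg2265.txt > text
    gsutil cp text gs://my-bdutil-bucket/text
    cd /home/hadoop/flink-install/bin
    ./flink run ../examples/flink-java-examples-*-WordCount.jar \
        gs://my-bdutil-bucket/text gs://my-bdutil-bucket/output
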
/extensions/flink/flink_env.sh:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS-IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | # This file contains environment-variable overrides to be used in conjunction
14 | # with bdutil_env.sh in order to deploy a Hadoop + Flink cluster.
15 | # Usage: ./bdutil deploy -e extensions/flink/flink_env.sh
16 |
17 |
18 | # In standalone mode, Flink runs the job manager and the task managers (workers)
19 | # on the cluster without using YARN containers. Flink also supports YARN
20 | # deployment, which will be implemented in a future version of the Flink bdutil plugin.
21 | FLINK_MODE="standalone"
22 |
23 | # URIs of tarballs for installation.
24 | FLINK_HADOOP1_TARBALL_URI='gs://flink-dist/flink-0.10.1-bin-hadoop1-scala_2.10.tgz'
25 | # Hadoop v2.7 build
26 | FLINK_HADOOP2_TARBALL_URI='gs://flink-dist/flink-0.10.1-bin-hadoop27-scala_2.10.tgz'
27 |
28 | # Directory on each VM in which to install each package.
29 | FLINK_INSTALL_DIR='/home/hadoop/flink-install'
30 |
31 | # Optional JVM arguments to pass
32 | # Flink config entry: env.java.opts:
33 | FLINK_JAVA_OPTS="-DsomeOption=value"
34 |
35 | # Heap memory used by the job manager (master), determined as a fraction of the physical (free) memory of the server
36 | # Flink config entry: jobmanager.heap.mb
37 | FLINK_JOBMANAGER_MEMORY_FRACTION='0.8'
38 |
39 | # Heap memory used by the task managers (slaves), determined as a fraction of the physical (free) memory of the servers
40 | # Flink config entry: taskmanager.heap.mb
41 | FLINK_TASKMANAGER_MEMORY_FRACTION='0.8'
42 |
43 | # Number of task slots per task manager (worker)
44 | # ideally set to the number of physical cpus
45 | # if set to 'auto', the number of slots will be determined automatically
46 | # Flink config entry: taskmanager.numberOfTaskSlots
47 | FLINK_TASKMANAGER_SLOTS='auto'
48 |
49 | # Default parallelism (number of concurrent actions per task)
50 | # If set to 'auto', this will be determined automatically
51 | # Flink config entry: parallelism.default
52 | FLINK_PARALLELISM='auto'
53 |
54 | # The number of buffers for the network stack.
55 | # Flink config entry: taskmanager.network.numberOfBuffers
56 | FLINK_NETWORK_NUM_BUFFERS=2048
57 |
58 |
59 | COMMAND_GROUPS+=(
60 | "install_flink:
61 | extensions/flink/install_flink.sh
62 | "
63 | "start_flink:
64 | extensions/flink/start_flink.sh
65 | "
66 | )
67 |
68 | # Installation of flink on master and workers; then start_flink only on master.
69 | COMMAND_STEPS+=(
70 | 'install_flink,install_flink'
71 | 'start_flink,*'
72 | )
73 |
--------------------------------------------------------------------------------
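Because these are plain environment-variable overrides, they can be tuned without editing the extension file itself: put your overrides in a small env file of your own and pass it after `flink_env.sh`, since env files are read in the order given. The file name and values below are illustrative only:

    # my_flink_overrides.sh (hypothetical): pin slots and parallelism instead of 'auto'.
    cat > my_flink_overrides.sh << 'EOF'
    FLINK_TASKMANAGER_SLOTS=4
    FLINK_PARALLELISM=8
    FLINK_NETWORK_NUM_BUFFERS=4096
    EOF

    ./bdutil -e extensions/flink/flink_env.sh -e my_flink_overrides.sh deploy
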
/extensions/flink/install_flink.sh:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS-IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 |
14 | # fail if undeclared variables are used
15 | set -o nounset
16 | # exit on error
17 | set -o errexit
18 |
19 |
20 | # Figure out which tarball to use based on which Hadoop version is being used.
21 | set +o nounset
22 | HADOOP_BIN="sudo -u hadoop ${HADOOP_INSTALL_DIR}/bin/hadoop"
23 | HADOOP_VERSION=$(${HADOOP_BIN} version | tr -cd [:digit:] | head -c1)
24 | set -o nounset
25 | if [[ "${HADOOP_VERSION}" == '2' ]]; then
26 | FLINK_TARBALL_URI=${FLINK_HADOOP2_TARBALL_URI}
27 | else
28 | FLINK_TARBALL_URI=${FLINK_HADOOP1_TARBALL_URI}
29 | fi
30 |
31 | # Install Flink via this fancy pipe
32 | gsutil cat "${FLINK_TARBALL_URI}" | tar -C /home/hadoop/ -xzv
33 | mv /home/hadoop/flink* "${FLINK_INSTALL_DIR}"
34 |
35 | # List all task managers (workers) in the slaves file
36 | # The task managers will be brought up by the job manager (master)
37 | echo ${WORKERS[@]} | tr ' ' '\n' > ${FLINK_INSTALL_DIR}/conf/slaves
38 |
39 | # Create temp file in hadoop directory which might be mounted to other storage than os
40 | FLINK_TASKMANAGER_TEMP_DIR="/hadoop/flink/tmp"
41 | mkdir -p ${FLINK_TASKMANAGER_TEMP_DIR}
42 | chgrp hadoop -R /hadoop/flink
43 | chmod 777 -R /hadoop/flink
44 |
45 | # Calculate the memory allocations, MB, using 'free -m'. Floor to nearest MB.
46 | TOTAL_MEM=$(free -m | awk '/^Mem:/{print $2}')
47 | FLINK_JOBMANAGER_MEMORY=$(python -c \
48 | "print int(${TOTAL_MEM} * ${FLINK_JOBMANAGER_MEMORY_FRACTION})")
49 | FLINK_TASKMANAGER_MEMORY=$(python -c \
50 | "print int(${TOTAL_MEM} * ${FLINK_TASKMANAGER_MEMORY_FRACTION})")
51 |
52 | # Determine the number of task slots
53 | if [[ "${FLINK_TASKMANAGER_SLOTS}" == "auto" ]] ; then
54 | FLINK_TASKMANAGER_SLOTS=`grep -c processor /proc/cpuinfo`
55 | fi
56 |
57 | # Determine the default parallelism
58 | if [[ "${FLINK_PARALLELISM}" == "auto" ]] ; then
59 | FLINK_PARALLELISM=$(python -c \
60 | "print ${NUM_WORKERS} * ${FLINK_TASKMANAGER_SLOTS}")
61 | fi
62 |
63 | # Apply Flink settings by appending them to the default config
64 | cat << EOF >> ${FLINK_INSTALL_DIR}/conf/flink-conf.yaml
65 | jobmanager.rpc.address: ${MASTER_HOSTNAME}
66 | jobmanager.heap.mb: ${FLINK_JOBMANAGER_MEMORY}
67 | taskmanager.heap.mb: ${FLINK_TASKMANAGER_MEMORY}
68 | taskmanager.numberOfTaskSlots: ${FLINK_TASKMANAGER_SLOTS}
69 | parallelism.default: ${FLINK_PARALLELISM}
70 | taskmanager.network.numberOfBuffers: ${FLINK_NETWORK_NUM_BUFFERS}
71 | env.java.opts: ${FLINK_JAVA_OPTS}
72 | taskmanager.tmp.dirs: ${FLINK_TASKMANAGER_TEMP_DIR}
73 | fs.hdfs.hadoopconf: ${HADOOP_CONF_DIR}
74 | EOF
75 |
76 | # Find the Hadoop lib dir and add its gcs-connector jar to the Flink lib dir
77 | set +o nounset
78 | if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then
79 | . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh"
80 | fi
81 | if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \
82 | [[ -n "${HADOOP_PREFIX}" ]]; then
83 | LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}"
84 | else
85 | LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib"
86 | fi
87 | set -o nounset
88 | # Get jar name and path
89 | GCS_JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR})
90 | LOCAL_GCS_JAR="${LIB_JARS_DIR}/${GCS_JARNAME}"
91 | # create link in Flink lib dir
92 | ln -s "${LOCAL_GCS_JAR}" "${FLINK_INSTALL_DIR}/lib/"
93 |
94 |
95 | # Assign ownership of everything to the 'hadoop' user.
96 | chown -R hadoop:hadoop /home/hadoop/
97 | # Make the Flink log directory writable
98 | chmod 777 ${FLINK_INSTALL_DIR}/log
99 |
--------------------------------------------------------------------------------
/extensions/flink/start_flink.sh:
--------------------------------------------------------------------------------
1 | # Licensed under the Apache License, Version 2.0 (the "License");
2 | # you may not use this file except in compliance with the License.
3 | # You may obtain a copy of the License at
4 | #
5 | # http://www.apache.org/licenses/LICENSE-2.0
6 | #
7 | # Unless required by applicable law or agreed to in writing, software
8 | # distributed under the License is distributed on an "AS-IS" BASIS,
9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | # See the License for the specific language governing permissions and
11 | # limitations under the License.
12 |
13 | set -o nounset
14 | set -o errexit
15 |
16 | if [[ ${FLINK_MODE} == 'standalone' ]]; then
17 | sudo -u hadoop ${FLINK_INSTALL_DIR}/bin/start-cluster.sh
18 | fi
--------------------------------------------------------------------------------
/extensions/google/experimental/resize_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Plugin which allows manually resizing bdutil-deployed clusters. To resize
16 | # upwards, set NEW_NUM_WORKERS to the new, larger value, keeping the old
17 | # NUM_WORKERS (or -n flag) at the existing cluster size. Then:
18 | #
19 | # Deploy only the new workers, e.g. {hadoop-w-2, hadoop-w-3, hadoop-w-4}:
20 | # ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh deploy
21 | #
22 | # Explicitly start the Hadoop daemons on just the new workers:
23 | # ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh run_command -t workers -- "service hadoop-hdfs-datanode start && service hadoop-mapreduce-tasktracker start"
24 | #
25 | # If using Spark as well, explicitly start the Spark daemons on the new workers:
26 | # ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh run_command -t workers -u extensions/spark/start_single_spark_worker.sh -- "./start_single_spark_worker.sh"
27 | #
28 | # Edit your base config to reflect your new cluster size:
29 | # echo NUM_WORKERS=5 >> my_base_env.sh
30 | #
31 | # When resizing down, simply set the base NUM_WORKERS to the desired smaller
32 | # size, and set NEW_NUM_WORKERS equal to the current cluster size; this can
33 | # be thought of as "undo-ing" a "resize upwards" command:
34 | # ./bdutil -e my_base_env.sh -n 2 -e extensions/google/experimental/resize_env.sh delete
35 | # echo NUM_WORKERS=2 >> my_base_env.sh
36 | #
37 | # TODO(user): Merge into bdutil as a core command.
38 | NEW_NUM_WORKERS=5
39 |
40 | # During resizes, make sure to avoid touching the master node.
41 | SKIP_MASTER=true
42 |
43 | # Save away the base evaluate_late_variable_bindings function so we can
44 | # override it and replace the WORKERS array.
45 | copy_func evaluate_late_variable_bindings old_evaluate_late_variable_bindings
46 |
47 | function evaluate_late_variable_bindings() {
48 | old_evaluate_late_variable_bindings
49 |
50 | WORKERS=()
51 | WORKER_ATTACHED_PDS=()
52 |
53 | local worker_suffix='w'
54 | local master_suffix='m'
55 | if (( ${OLD_HOSTNAME_SUFFIXES} )); then
56 | echo 'WARNING: Using deprecated -nn and -dn naming convention'
57 | worker_suffix='dn'
58 | master_suffix='nn'
59 | fi
60 | for ((i = ${NUM_WORKERS}; i < ${NEW_NUM_WORKERS}; i++)); do
61 | local shift_i=$((${i} - ${NUM_WORKERS}))
62 | WORKERS[${shift_i}]="${PREFIX}-${worker_suffix}-${i}"
63 | done
64 | for ((i = ${NUM_WORKERS}; i < ${NEW_NUM_WORKERS}; i++)); do
65 | local shift_i=$((${i} - ${NUM_WORKERS}))
66 | WORKER_ATTACHED_PDS[${shift_i}]="${WORKERS[${shift_i}]}-pd"
67 | done
68 |
69 | local num_workers_to_add=$((${NEW_NUM_WORKERS} - ${NUM_WORKERS}))
70 | NUM_WORKERS=${num_workers_to_add}
71 | }
72 |
--------------------------------------------------------------------------------
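Put together, the resize-up procedure described in the comments above (growing a 2-worker Hadoop 1 cluster to the `NEW_NUM_WORKERS=5` set in this file, with `my_base_env.sh` standing in for your own base config) amounts to:

    # 1. Create only the new workers (e.g. hadoop-w-2, hadoop-w-3, hadoop-w-4).
    ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh deploy

    # 2. Start the Hadoop daemons on just the new workers.
    ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh \
        run_command -t workers -- \
        "service hadoop-hdfs-datanode start && service hadoop-mapreduce-tasktracker start"

    # 3. Record the new size for future bdutil invocations.
    echo NUM_WORKERS=5 >> my_base_env.sh
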
/extensions/hama/README.md:
--------------------------------------------------------------------------------
1 | Deploying Hama on Google Compute Engine
2 | ===============================================
3 |
4 | Apache Hama
5 | -----------
6 | Apache Hama is a framework for Big Data analytics based on the Bulk Synchronous Parallel (BSP) computing model; it was established in 2012 as a Top-Level Project of The Apache Software Foundation.
7 |
8 | It provides not only the pure BSP programming model but also vertex- and neuron-centric programming models, inspired by Google's Pregel and DistBelief.
9 |
10 | Basic Usage
11 | -----------
12 |
13 | Basic installation of [Apache Hama](http://hama.apache.org/) alongside Hadoop on Google Cloud Platform.
14 |
15 | ./bdutil -e extensions/hama/hama_env.sh deploy
16 |
17 | Or alternatively, using shorthand syntax:
18 |
19 | ./bdutil -e hama deploy
20 |
21 | Status
22 | ------
23 |
24 | This plugin is currently considered experimental and not officially supported.
25 | Contributions are welcome.
26 |
--------------------------------------------------------------------------------
/extensions/hama/hama_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with Hama installed
17 | # and configured.
18 | # Usage: ./bdutil deploy extensions/hama/hama_env.sh.
19 |
20 | # URIs of tarball to install.
21 | HAMA_TARBALL_URI='gs://hama-dist/hama-dist-0.7.0.tar.gz'
22 |
23 | # Default Hama dist tarball requires Hadoop 2.
24 | import_env hadoop2_env.sh
25 |
26 | # Directory on each VM in which to install hama.
27 | HAMA_INSTALL_DIR='/home/hadoop/hama-install'
28 |
29 | COMMAND_GROUPS+=(
30 | "install_hama:
31 | extensions/hama/install_hama.sh
32 | "
33 | "start_hama:
34 | extensions/hama/start_hama.sh
35 | "
36 | )
37 |
38 | # Installation of hama on master and workers; then start_hama only on master.
39 | COMMAND_STEPS+=(
40 | 'install_hama,install_hama'
41 | 'start_hama,*'
42 | )
43 |
--------------------------------------------------------------------------------
/extensions/hama/install_hama.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | set -o nounset
16 | set -o errexit
17 |
18 | # Get the filename out of the full URI.
19 | HAMA_TARBALL=${HAMA_TARBALL_URI##*/}
20 |
21 | # Get the tarball, untar it.
22 | gsutil cp ${HAMA_TARBALL_URI} /home/hadoop/${HAMA_TARBALL}
23 | tar -C /home/hadoop -xzvf /home/hadoop/${HAMA_TARBALL}
24 | mv /home/hadoop/hama*/ ${HAMA_INSTALL_DIR}
25 |
26 | # Set up hama-site.xml to make sure it can access HDFS.
27 | cat << EOF > ${HAMA_INSTALL_DIR}/conf/hama-site.xml
28 | <?xml version="1.0"?>
29 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
30 | <configuration>
31 |   <property>
32 |     <name>bsp.master.address</name>
33 |     <value>${MASTER_HOSTNAME}:40000</value>
34 |   </property>
35 |   <property>
36 |     <name>hama.zookeeper.quorum</name>
37 |     <value>${MASTER_HOSTNAME}</value>
38 |   </property>
39 |   <property>
40 |     <name>fs.defaultFS</name>
41 |     <value>hdfs://${MASTER_HOSTNAME}:8020/</value>
42 |   </property>
43 | </configuration>
44 | EOF
45 |
46 | # Set up all workers to be groomservers.
47 | echo ${WORKERS[@]} | tr ' ' '\n' > ${HAMA_INSTALL_DIR}/conf/groomservers
48 |
49 | # Symlink the Hadoop hdfs-site.xml to hama's "copy" of it.
50 | ln -s ${HADOOP_CONF_DIR}/hdfs-site.xml ${HAMA_INSTALL_DIR}/conf/hdfs-site.xml
51 |
52 | # Explicitly set up JAVA_HOME for hama.
53 | JAVA_HOME=$(readlink -f $(which java) | sed 's|/bin/java$||')
54 | cat << EOF >> ${HAMA_INSTALL_DIR}/conf/hama-env.sh
55 | export JAVA_HOME=${JAVA_HOME}
56 | EOF
57 |
58 | # Add the hama 'bin' path to the .bashrc so that it's easy to call 'hama'
59 | # during interactive ssh session.
60 | add_to_path_at_login "${HAMA_INSTALL_DIR}/bin"
61 |
62 | # Assign ownership of everything to the 'hadoop' user.
63 | chown -R hadoop:hadoop /home/hadoop/ ${HAMA_INSTALL_DIR}
64 |
--------------------------------------------------------------------------------
/extensions/hama/start_hama.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | # Licensed under the Apache License, Version 2.0 (the "License");
3 | # you may not use this file except in compliance with the License.
4 | # You may obtain a copy of the License at
5 | #
6 | # http://www.apache.org/licenses/LICENSE-2.0
7 | #
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS-IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 |
14 | set -o nounset
15 | set -o errexit
16 |
17 | sudo -u hadoop ${HAMA_INSTALL_DIR}/bin/start-bspd.sh
18 |
--------------------------------------------------------------------------------
/extensions/hbase/README.md:
--------------------------------------------------------------------------------
1 | Deploying Apache HBase on Google Compute Engine
2 | ===============================================
3 |
4 | Basic Usage
5 | -----------
6 |
7 | Basic installation of [Apache HBase](http://hbase.apache.org/) alongside Hadoop on Google Cloud Platform.
8 |
9 | ./bdutil -e extensions/hbase/hbase_env.sh deploy
10 |
11 | Or alternatively, using shorthand syntax:
12 |
13 | ./bdutil -e hbase deploy
14 |
15 | Status
16 | ------
17 |
18 | This plugin is currently considered experimental and not officially supported.
19 | Contributions are welcome.
20 |
--------------------------------------------------------------------------------
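A quick, hedged smoke test for the HBase deployment described in the README above: push a one-line status query through `./bdutil shell`, which forwards a script to the master the same way the validate scripts elsewhere in this repo do. The path below assumes the default HBASE_INSTALL_DIR of /home/hadoop/hbase-install from hbase_env.sh.

    ./bdutil shell << 'HBASE_SMOKE_TEST'
    # 'status' reports live/dead regionservers; using the full path avoids
    # relying on the login-time PATH addition made by install_hbase.sh.
    echo "status" | /home/hadoop/hbase-install/bin/hbase shell
    HBASE_SMOKE_TEST
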
/extensions/hbase/hbase_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with HBase installed
17 | # and configured.
18 | # Usage: ./bdutil deploy extensions/hbase/hbase_env.sh.
19 |
20 | # URI of the tarball to install.
21 | HBASE_TARBALL_URI='gs://hbase-dist/hbase-0.94.19.tar.gz'
22 |
23 | # Directory on each VM in which to install hbase.
24 | HBASE_INSTALL_DIR='/home/hadoop/hbase-install'
25 |
26 | COMMAND_GROUPS+=(
27 | "install_hbase:
28 | extensions/hbase/install_hbase.sh
29 | "
30 | "start_hbase:
31 | extensions/hbase/start_hbase.sh
32 | "
33 | )
34 |
35 | # Installation of hbase on master and workers; then start_hbase only on master.
36 | COMMAND_STEPS+=(
37 | 'install_hbase,install_hbase'
38 | 'start_hbase,*'
39 | )
40 |
--------------------------------------------------------------------------------
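The COMMAND_GROUPS/COMMAND_STEPS stanza above follows the convention visible throughout these env files: each step lists two command groups, the first run on the master and the second on the workers, with '*' meaning "nothing on that side." As a hedged illustration only, a hypothetical extension reusing the same pattern (the name and script path are invented for the example) would look like:

    COMMAND_GROUPS+=(
      "install_widget:
         extensions/widget/install_widget.sh
      "
    )
    # Install on master and workers alike; this sketch has no separate start step.
    COMMAND_STEPS+=(
      'install_widget,install_widget'
    )
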
/extensions/hbase/install_hbase.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | set -o nounset
16 | set -o errexit
17 |
18 | # Get the filename out of the full URI.
19 | HBASE_TARBALL=${HBASE_TARBALL_URI##*/}
20 |
21 | # Get the tarball, untar it.
22 | gsutil cp ${HBASE_TARBALL_URI} /home/hadoop/${HBASE_TARBALL}
23 | tar -C /home/hadoop -xzvf /home/hadoop/${HBASE_TARBALL}
24 | mv /home/hadoop/hbase*/ ${HBASE_INSTALL_DIR}
25 |
26 | # Set up hbase-site.xml to make sure it can access HDFS.
27 | cat << EOF > ${HBASE_INSTALL_DIR}/conf/hbase-site.xml
28 | <?xml version="1.0"?>
29 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
30 | <configuration>
31 |   <property>
32 |     <name>hbase.rootdir</name>
33 |     <value>hdfs://${MASTER_HOSTNAME}:8020/hbase</value>
34 |   </property>
35 |   <property>
36 |     <name>hbase.zookeeper.quorum</name>
37 |     <value>${MASTER_HOSTNAME}</value>
38 |   </property>
39 |   <property>
40 |     <name>hbase.cluster.distributed</name>
41 |     <value>true</value>
42 |   </property>
43 | </configuration>
44 | EOF
45 |
46 | # Set up all workers to be regionservers.
47 | echo ${WORKERS[@]} | tr ' ' '\n' > ${HBASE_INSTALL_DIR}/conf/regionservers
48 |
49 | # Symlink the Hadoop hdfs-site.xml to hbase's "copy" of it.
50 | ln -s ${HADOOP_CONF_DIR}/hdfs-site.xml ${HBASE_INSTALL_DIR}/conf/hdfs-site.xml
51 |
52 | # Explicitly set up JAVA_HOME for hbase.
53 | JAVA_HOME=$(readlink -f $(which java) | sed 's|/bin/java$||')
54 | cat << EOF >> ${HBASE_INSTALL_DIR}/conf/hbase-env.sh
55 | export JAVA_HOME=${JAVA_HOME}
56 | EOF
57 |
58 | # Add the hbase 'bin' path to the .bashrc so that it's easy to call 'hbase'
59 | # during interactive ssh sessions.
60 | add_to_path_at_login "${HBASE_INSTALL_DIR}/bin"
61 |
62 | # Assign ownership of everything to the 'hadoop' user.
63 | chown -R hadoop:hadoop /home/hadoop/ ${HBASE_INSTALL_DIR}
64 |
--------------------------------------------------------------------------------
/extensions/hbase/start_hbase.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | # Licensed under the Apache License, Version 2.0 (the "License");
3 | # you may not use this file except in compliance with the License.
4 | # You may obtain a copy of the License at
5 | #
6 | # http://www.apache.org/licenses/LICENSE-2.0
7 | #
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS-IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 |
14 | set -o nounset
15 | set -o errexit
16 |
17 | sudo -u hadoop ${HBASE_INSTALL_DIR}/bin/start-hbase.sh
18 |
--------------------------------------------------------------------------------
/extensions/querytools/hive-validate-setup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Copyright 2014 Google Inc. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS-IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # Runs a basic Hive script.
18 | # Usage: ./bdutil shell < extensions/querytools/hive-validate-setup.sh
19 |
20 | # Find hadoop-config.sh.
21 | HADOOP_CONFIGURE_CMD=''
22 | HADOOP_CONFIGURE_CMD=$(find ${HADOOP_LIBEXEC_DIR} ${HADOOP_PREFIX} \
23 | /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -name hadoop-config.sh | head -n 1)
24 |
25 | # If hadoop-config.sh has been found source it
26 | if [[ -n "${HADOOP_CONFIGURE_CMD}" ]]; then
27 | echo "Sourcing '${HADOOP_CONFIGURE_CMD}'"
28 | . ${HADOOP_CONFIGURE_CMD}
29 | fi
30 |
31 | HADOOP_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -wholename '*/bin/hadoop' | head -n 1)
32 | HIVE_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/hive* /usr/*/current/hive* -wholename '*/bin/hive' | head -n 1)
33 |
34 | # If it is still empty then don't run the tests.
35 | if [[ "${HADOOP_CMD}" == '' ]]; then
36 | echo "Did not find hadoop"
37 | exit 1
38 | fi
39 |
40 | # If it is still empty then don't run the tests.
41 | if [[ "${HIVE_CMD}" == '' ]]; then
42 | echo "Did not find hive"
43 | exit 1
44 | fi
45 |
46 | # Upload sample data.
47 | PARENT_DIR="/tmp/validate_hive_$(date +%s)"
48 | ${HADOOP_CMD} fs -mkdir ${PARENT_DIR}
49 | ${HADOOP_CMD} fs -put /etc/passwd ${PARENT_DIR}
50 |
51 | # Create a basic Hive script.
52 | echo "Creating hivetest.hive..."
53 | cat << EOF > hivetest.hive
54 | DROP TABLE bdutil_validate_hive_tbl;
55 |
56 | CREATE TABLE bdutil_validate_hive_tbl (
57 | user STRING,
58 | dummy STRING,
59 | uid INT,
60 | gid INT,
61 | name STRING,
62 | home STRING,
63 | shell STRING
64 | )
65 | ROW FORMAT DELIMITED
66 | FIELDS TERMINATED BY ':'
67 | STORED AS TEXTFILE;
68 |
69 | LOAD DATA INPATH '${PARENT_DIR}/passwd'
70 | OVERWRITE INTO TABLE bdutil_validate_hive_tbl;
71 |
72 | SELECT shell, COUNT(*) shell_count
73 | FROM bdutil_validate_hive_tbl
74 | GROUP BY shell
75 | ORDER BY shell_count DESC, shell DESC;
76 | EOF
77 | cat hivetest.hive
78 |
79 | # Run the script.
80 | ${HIVE_CMD} -f hivetest.hive > /tmp/hiveoutput.txt
81 |
82 | echo "Hive output:"
83 | cat /tmp/hiveoutput.txt
84 |
85 | # Run an equivalent pipeline of command-line invocations which pull out the
86 | # 'shell' field, sort/uniq to get the counts of each occurrence, then finally
87 | # format to match Hive by printing tab-separated fields:
88 | # shell_count\tshell
89 | cat /etc/passwd | awk -F: '{print $7}' | sort | uniq -c | sort -nr | \
90 | awk '{print $2, $1}' | sed "s/ /\t/" > /tmp/goldenoutput.txt
91 |
92 | echo "Expected output:"
93 | cat /tmp/goldenoutput.txt
94 |
95 | EXIT_CODE=0
96 | if diff /tmp/hiveoutput.txt /tmp/goldenoutput.txt; then
97 | echo "Verified correct output."
98 | else
99 | echo "Hive output doesn't match expected output!"
100 | EXIT_CODE=1
101 | fi
102 |
103 | # Cleanup.
104 | echo "Cleaning up test data: ${PARENT_DIR}"
105 | ${HADOOP_CMD} fs -rmr -skipTrash ${PARENT_DIR}
106 |
107 | exit ${EXIT_CODE}
108 |
--------------------------------------------------------------------------------
/extensions/querytools/pig-mapred-template.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <configuration>
4 |   <property>
5 |     <name>mapred.working.dir</name>
6 |     <value>/user/</value>
7 |     <description>
8 |       The FileSystem working directory to use for relative paths.
9 |     </description>
10 |   </property>
11 | </configuration>
12 |
--------------------------------------------------------------------------------
/extensions/querytools/pig-validate-setup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Copyright 2014 Google Inc. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS-IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # Runs a basic Pig script.
18 | # Usage: ./bdutil shell < extensions/querytools/pig-validate-setup.sh
19 |
20 | # Find hadoop-config.sh.
21 | HADOOP_CONFIGURE_CMD=''
22 | HADOOP_CONFIGURE_CMD=$(find ${HADOOP_LIBEXEC_DIR} ${HADOOP_PREFIX} \
23 | /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -name hadoop-config.sh | head -n 1)
24 |
25 | # If hadoop-config.sh has been found source it
26 | if [[ -n "${HADOOP_CONFIGURE_CMD}" ]]; then
27 | echo "Sourcing '${HADOOP_CONFIGURE_CMD}'"
28 | . ${HADOOP_CONFIGURE_CMD}
29 | fi
30 |
31 | HADOOP_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -wholename '*/bin/hadoop' | head -n 1)
32 | PIG_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/pig* /usr/*/current/pig* -wholename '*/bin/pig' | head -n 1)
33 |
34 | # If it is still empty then don't run the tests.
35 | if [[ "${HADOOP_CMD}" == '' ]]; then
36 | echo "Did not find hadoop"
37 | exit 1
38 | fi
39 |
40 | # If it is still empty then don't run the tests.
41 | if [[ "${PIG_CMD}" == '' ]]; then
42 | echo "Did not find pig"
43 | exit 1
44 | fi
45 |
46 | # Upload sample data.
47 | PARENT_DIR="/tmp/validate_pig_$(date +%s)"
48 | ${HADOOP_CMD} fs -mkdir ${PARENT_DIR}
49 | ${HADOOP_CMD} fs -put /etc/passwd ${PARENT_DIR}
50 |
51 | # Create a basic Pig script.
52 | echo "Creating pigtest.pig..."
53 | cat << EOF > pigtest.pig
54 | SET job.name 'PigTest';
55 | data = LOAD '${PARENT_DIR}/passwd'
56 | USING PigStorage(':')
57 | AS (user:CHARARRAY, dummy:CHARARRAY, uid:INT, gid:INT,
58 | name:CHARARRAY, home:CHARARRAY, shell:CHARARRAY);
59 | grp = GROUP data BY (shell);
60 | counts = FOREACH grp GENERATE
61 | FLATTEN(group) AS shell:CHARARRAY, COUNT(data) AS shell_count:LONG;
62 | res = ORDER counts BY shell_count DESC, shell DESC;
63 | DUMP res;
64 | EOF
65 | cat pigtest.pig
66 |
67 | # Run the script.
68 | ${PIG_CMD} pigtest.pig > /tmp/pigoutput.txt
69 |
70 | echo "Pig output:"
71 | cat /tmp/pigoutput.txt
72 |
73 | # Run an equivalent pipeline of command-line invocations which pull out the
74 | # 'shell' field, sort/uniq to get the counts of each occurrence, then finally
75 | # format to match Pig by printing comma-separated fields in parens:
76 | # (shell_count,shell)
77 | cat /etc/passwd | awk -F: '{print $7}' | sort | uniq -c | sort -nr | \
78 | awk '{print $2, $1}' | sed "s/\(.*\) \(.*\)/(\1,\2)/" > /tmp/goldenoutput.txt
79 |
80 | echo "Expected output:"
81 | cat /tmp/goldenoutput.txt
82 |
83 | EXIT_CODE=0
84 | if diff /tmp/pigoutput.txt /tmp/goldenoutput.txt; then
85 | echo "Verified correct output."
86 | else
87 | echo "Pig output doesn't match expected output!"
88 | EXIT_CODE=1
89 | fi
90 |
91 | # Cleanup.
92 | echo "Cleaning up test data: ${PARENT_DIR}"
93 | ${HADOOP_CMD} fs -rmr -skipTrash ${PARENT_DIR}
94 |
95 | exit ${EXIT_CODE}
96 |
--------------------------------------------------------------------------------
/extensions/querytools/prepare_files.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Places uploaded files into their expected locations; generates a
16 | # project_properties.sh file which other scripts are designed to use.
17 |
18 | set -o nounset
19 | set -o errexit
20 |
21 | mkdir -p ${MASTER_PACKAGE_DIR}/conf/hive
22 | mv hive-site.xml ${MASTER_PACKAGE_DIR}/conf/hive/
23 |
24 | # Dynamically generate a project_properties.sh file which only contains the
25 | # environment variables which must be derived from existing hadoop deployment
26 | # variables.
27 | cat << EOF >> project_properties.sh
28 | SUPPORTED_HDPTOOLS='hive pig'
29 | ZONE=${GCE_ZONE}
30 | MASTER=${MASTER_HOSTNAME}
31 | HADOOP_HOME=${HADOOP_INSTALL_DIR}
32 | EOF
33 |
34 | # Explicitly set a schemeless working directory; otherwise, as of Pig 0.12.0,
35 | # PigInputFormat fails to use input paths which are not from the "default"
36 | # FileSystem. No need to clobber existing working-directory settings.
37 | bdconfig merge_configurations \
38 | --configuration_file ${HADOOP_CONF_DIR}/mapred-site.xml \
39 | --source_configuration_file pig-mapred-template.xml \
40 | --resolve_environment_variables \
41 | --create_if_absent \
42 | --noclobber
43 |
--------------------------------------------------------------------------------
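The bdconfig merge_configurations call above folds pig-mapred-template.xml into the cluster's mapred-site.xml without clobbering any value that is already set. A hedged way to confirm on the master that the working-directory property landed, using nothing beyond standard grep:

    # Print the name and value lines for mapred.working.dir from the merged
    # config; HADOOP_CONF_DIR is the same variable the merge command uses.
    grep -A 1 'mapred.working.dir' ${HADOOP_CONF_DIR}/mapred-site.xml
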
/extensions/querytools/querytools_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with Pig and Hive
17 | # installed, using the Cloud Solutions sampleapp.
18 | # Usage: ./bdutil deploy extensions/querytools/querytools_env.sh
19 |
20 | # Set the default filesystem to be 'hdfs' since Pig and Hive will tend to rely
21 | # on multi-stage pipelines more heavily than plain Hadoop MapReduce, and thus
22 | # be vulnerable to eventual list consistency. It is okay to read initially from GCS
23 | # using explicit gs:// URIs and likewise to write the final output to GCS,
24 | # letting any intermediate cross-stage items get stored in HDFS temporarily.
25 | DEFAULT_FS='hdfs'
26 |
27 | # URIs of tarballs to install.
28 | PIG_TARBALL_URI='gs://querytools-dist/pig-0.12.0.tar.gz'
29 | HIVE_TARBALL_URI='gs://querytools-dist/hive-0.12.0-bin.tar.gz'
30 |
31 | # Constants normally in project_properties.sh from the sampleapp, but which we
32 | # can propagate out here as shared environment variables instead.
33 | HADOOP_MAJOR_VERSION='1'
34 | HADOOP_USER='hadoop'
35 | HADOOP_GROUP='hadoop'
36 | HDP_USER='hadoop'
37 | HDP_USER_HOME='/home/hadoop'
38 | MASTER_INSTALL_DIR='/home/hadoop'
39 | PACKAGES_DIR='packages'
40 | SCRIPTS_DIR='scripts'
41 | MASTER_PACKAGE_DIR='/tmp/hdp_tools'
42 | HDFS_TMP_DIR='/tmp'
43 | HADOOP_TMP_DIR='/hadoop/tmp'
44 |
45 | # File dependencies to be used by the scripts.
46 | if [[ -n "${BDUTIL_DIR}" ]]; then
47 | UPLOAD_FILES+=(
48 | "${BDUTIL_DIR}/extensions/querytools/pig-mapred-template.xml"
49 | "${BDUTIL_DIR}/sampleapps/querytools/conf/hive/hive-site.xml"
50 | "${BDUTIL_DIR}/sampleapps/querytools/scripts/common_utils.sh"
51 | "${BDUTIL_DIR}/sampleapps/querytools/scripts/package_utils.sh"
52 | )
53 | fi
54 | COMMAND_GROUPS+=(
55 | "install_querytools:
56 | extensions/querytools/prepare_files.sh
57 | sampleapps/querytools/scripts/setup-packages__at__master.sh
58 | sampleapps/querytools/scripts/setup-hdfs-for-hdtools__at__master.sh
59 | extensions/querytools/setup_profiles.sh
60 | "
61 | )
62 |
63 | # Querytools installation only needs to run on master.
64 | COMMAND_STEPS+=(
65 | 'install_querytools,*'
66 | )
67 |
--------------------------------------------------------------------------------
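With DEFAULT_FS='hdfs' as set above, relative paths and temporary data resolve to HDFS while explicit gs:// URIs still go through the GCS connector. A hedged sketch of that usage pattern, in the same bash-plus-heredoc style as the validate scripts (the bucket name is hypothetical):

    # Read the initial input from GCS and write the final output back to GCS;
    # anything Pig spills between stages stays in HDFS because HDFS is the
    # default filesystem.
    pig << 'PIG_SKETCH'
    raw = LOAD 'gs://my-example-bucket/input/*.txt' AS (line:CHARARRAY);
    grp = GROUP raw ALL;
    cnt = FOREACH grp GENERATE COUNT(raw);
    STORE cnt INTO 'gs://my-example-bucket/output/line_count';
    PIG_SKETCH
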
/extensions/querytools/setup_profiles.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Sets up login shells to have the "hive" and "pig" binaries in the system PATH
16 | # environment variable.
17 |
18 | add_to_path_at_login "${MASTER_INSTALL_DIR}/pig/bin"
19 | add_to_path_at_login "${MASTER_INSTALL_DIR}/hive/bin"
20 |
--------------------------------------------------------------------------------
/extensions/spark/install_shark.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | set -o errexit
16 |
17 | # Figure out which tarball to use based on which Hadoop version is being used.
18 | set +o nounset
19 | HADOOP_BIN="sudo -u hadoop ${HADOOP_INSTALL_DIR}/bin/hadoop"
20 | HADOOP_VERSION=$(${HADOOP_BIN} version | tr -cd [:digit:] | head -c1)
21 | set -o nounset
22 | if [[ "${HADOOP_VERSION}" == '2' ]]; then
23 | SHARK_TARBALL_URI=${SHARK_HADOOP2_TARBALL_URI}
24 | else
25 | SHARK_TARBALL_URI=${SHARK_HADOOP1_TARBALL_URI}
26 | fi
27 |
28 | SHARK_TARBALL=${SHARK_TARBALL_URI##*/}
29 | gsutil cp ${SHARK_TARBALL_URI} /home/hadoop/${SHARK_TARBALL}
30 | tar -C /home/hadoop -xzvf /home/hadoop/${SHARK_TARBALL}
31 | mv /home/hadoop/shark*/ ${SHARK_INSTALL_DIR}
32 |
33 | # Find the Hadoop lib dir so that we can link its gcs-connector into the
34 | # Shark library path.
35 | set +o nounset
36 | if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then
37 | . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh"
38 | fi
39 | if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \
40 | [[ -n "${HADOOP_PREFIX}" ]]; then
41 | LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}"
42 | else
43 | LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib"
44 | fi
45 | set -o nounset
46 |
47 | GCS_JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR})
48 | LOCAL_GCS_JAR="${LIB_JARS_DIR}/${GCS_JARNAME}"
49 | ln -s ${LOCAL_GCS_JAR} ${SHARK_INSTALL_DIR}/lib/
50 |
51 | # Calculate the memory allocation in MB using 'free -m', flooring to the nearest MB.
52 | TOTAL_MEM=$(free -m | awk '/^Mem:/{print $2}')
53 | SHARK_MEM=$(python -c \
54 | "print int(${TOTAL_MEM} * ${SHARK_MEM_FRACTION})")
55 |
56 |
57 | # Point shark at scala, hadoop, hive, spark, and the spark master.
58 | cat << EOF >> ${SHARK_INSTALL_DIR}/conf/shark-env.sh
59 | export HADOOP_HOME=${HADOOP_INSTALL_DIR}
60 | export SCALA_HOME=${SCALA_INSTALL_DIR}
61 | export SPARK_HOME=${SPARK_INSTALL_DIR}
62 | export SPARK_MEM=${SHARK_MEM}m
63 |
64 | # Set spark master by copying from spark-env.sh
65 | $(grep 'MASTER=' ${SPARK_INSTALL_DIR}/conf/spark-env.sh)
66 | EOF
67 |
68 | # Add the shark 'bin' path to the .bashrc so that it's easy to call 'shark'
69 | # during interactive ssh sessions.
70 | add_to_path_at_login "${SHARK_INSTALL_DIR}/bin"
71 |
72 | # Assign ownership of everything to the 'hadoop' user.
73 | chown -R hadoop:hadoop /home/hadoop/
74 |
--------------------------------------------------------------------------------
/extensions/spark/spark_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop + Spark cluster.
17 | # Usage: ./bdutil deploy -e extensions/spark/spark_env.sh
18 |
19 | # An enum of [default|standalone|yarn-client|yarn-cluster].
20 | # In standalone mode, Spark runs its own daemons and job submissions are made
21 | # to the master daemon by default. yarn-client and yarn-cluster both run inside
22 | # YARN containers. default preserves Spark's default.
23 | SPARK_MODE="standalone"
24 |
25 | # URIs of tarballs to install.
26 | SCALA_TARBALL_URI='gs://spark-dist/scala-2.10.3.tgz'
27 | SPARK_HADOOP1_TARBALL_URI='gs://spark-dist/spark-1.5.0-bin-hadoop1.tgz'
28 | SPARK_HADOOP2_TARBALL_URI='gs://spark-dist/spark-1.5.0-bin-hadoop2.6.tgz'
29 |
30 | # Directory on each VM in which to install each package.
31 | SCALA_INSTALL_DIR='/home/hadoop/scala-install'
32 | SPARK_INSTALL_DIR='/home/hadoop/spark-install'
33 |
34 | # Worker memory to provide in spark-env.sh, as a fraction of total physical
35 | # memory. In the event of running Spark on YARN the NODEMANAGER_MEMORY_FRACTION
36 | # in hadoop2_env.sh replaces this.
37 | SPARK_WORKER_MEMORY_FRACTION='0.8'
38 |
39 | # Default memory per Spark executor, as a fraction of total physical memory;
40 | # used for default spark-shell if not overridden with a -D option. Can be used
41 | # to accommodate multiple spark-shells on a single cluster, e.g. if this value
42 | # is set to half the value of SPARK_WORKER_MEMORY_FRACTION then two sets of
43 | # executors can run simultaneously. However, in such a case, at the time
44 | # of starting 'spark-shell' you must specify fewer cores, e.g.:
45 | # SPARK_JAVA_OPTS="-Dspark.cores.max=4" spark-shell
46 | SPARK_EXECUTOR_MEMORY_FRACTION='0.8'
47 |
48 | # Max memory to be used by the single Spark daemon process on each node; may need to
49 | # increase when using larger clusters. Expressed as a fraction of total physical
50 | # memory.
51 | SPARK_DAEMON_MEMORY_FRACTION='0.15'
52 |
53 | # Install JDK because certain Spark commands assume jar is installed.
54 | INSTALL_JDK_DEVEL='true'
55 |
56 | # Spark-standalone master UI is on port 8080.
57 | MASTER_UI_PORTS=('8080' ${MASTER_UI_PORTS[@]})
58 |
59 | COMMAND_GROUPS+=(
60 | "install_spark:
61 | extensions/spark/install_spark.sh
62 | "
63 | "spark_configure_startup:
64 | extensions/spark/spark_configure_startup_processes.sh
65 | "
66 | "start_spark:
67 | extensions/spark/start_spark.sh
68 | "
69 | )
70 |
71 | # Installation of spark on master and workers; then start_spark only on master.
72 | COMMAND_STEPS+=(
73 | 'install_spark,install_spark'
74 | 'spark_configure_startup,spark_configure_startup'
75 | 'start_spark,*'
76 | )
77 |
--------------------------------------------------------------------------------
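The *_MEMORY_FRACTION settings above are resolved into absolute megabyte values on each VM at install time. Purely as an illustration of that arithmetic, mirroring the free -m plus python pattern the install scripts in this repo already use (the fraction is just the default from the file above):

    # Show what SPARK_WORKER_MEMORY_FRACTION=0.8 amounts to on this machine.
    TOTAL_MEM=$(free -m | awk '/^Mem:/{print $2}')
    SPARK_WORKER_MEMORY_FRACTION='0.8'
    SPARK_WORKER_MEM_MB=$(python -c \
        "print int(${TOTAL_MEM} * ${SPARK_WORKER_MEMORY_FRACTION})")
    echo "Worker memory: ${SPARK_WORKER_MEM_MB}m"
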
/extensions/spark/spark_on_yarn_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop 2 + Spark on YARN cluster.
17 | # Usage: ./bdutil deploy -e extensions/spark/spark_on_yarn_env.sh
18 |
19 | # Install YARN and Spark
20 | import_env hadoop2_env.sh
21 | import_env extensions/spark/spark_env.sh
22 |
23 | # Clusters must have at least 3 workers to run spark-validate-setup.sh
24 | # and many other Spark jobs.
25 | if [[ -z "${NUM_WORKERS}" ]] || (( ${NUM_WORKERS} < 3 )); then
26 | NUM_WORKERS=3
27 | fi
28 |
29 | # An enum of [default|standalone|yarn-client|yarn-cluster].
30 | # yarn-client and yarn-cluster both run Spark jobs inside YARN containers.
31 | # yarn-cluster also runs the spark-class or spark-submit process inside a
32 | # container, but it cannot support spark-shell without explicitly specifying
33 | # another master,
34 | # e.g. spark-shell --master yarn-client.
35 | SPARK_MODE='yarn-client'
36 |
--------------------------------------------------------------------------------
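With SPARK_MODE='yarn-client' there is no standalone Spark master to start; jobs are submitted through YARN instead. A hedged example submission using the SparkPi class that ships in the Spark 1.x tarballs referenced by spark_env.sh (the exact examples jar name depends on the version actually unpacked under SPARK_INSTALL_DIR):

    # Submit the bundled SparkPi example through YARN in client mode.
    /home/hadoop/spark-install/bin/spark-submit \
        --master yarn-client \
        --class org.apache.spark.examples.SparkPi \
        /home/hadoop/spark-install/lib/spark-examples-*.jar 100
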
/extensions/spark/spark_shark_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a Hadoop + Spark + Shark cluster.
17 | # Usage: ./bdutil deploy -e extensions/spark/spark_shark_env.sh
18 |
19 | import_env extensions/spark/spark_env.sh
20 |
21 | # URIs of tarballs to install.
22 | SHARK_HADOOP1_TARBALL_URI='gs://spark-dist/shark-0.9.1-bin-hadoop1.tgz'
23 | SHARK_HADOOP2_TARBALL_URI='gs://spark-dist/shark-0.9.1-bin-hadoop2.tgz'
24 | # Shark is not compatible with Spark 1.x
25 | SPARK_HADOOP1_TARBALL_URI='gs://spark-dist/spark-0.9.2-bin-hadoop1.tgz'
26 | SPARK_HADOOP2_TARBALL_URI='gs://spark-dist/spark-0.9.2-bin-hadoop2.tgz'
27 |
28 | # Directory on each VM in which to install shark
29 | SHARK_INSTALL_DIR='/home/hadoop/shark-install'
30 |
31 | # Value to give Shark indicating the amount of Spark worker memory
32 | # available/usable by Shark per worker. Expressed as a fraction of total
33 | # physical memory.
34 | SHARK_MEM_FRACTION='0.8'
35 |
36 | COMMAND_GROUPS+=(
37 | "install_shark:
38 | extensions/spark/install_shark.sh
39 | "
40 | )
41 |
42 | # Installation of shark
43 | COMMAND_STEPS+=(
44 | 'install_shark,install_shark'
45 | )
46 |
--------------------------------------------------------------------------------
/extensions/spark/start_single_spark_worker.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Can be used on an individual Spark worker when running Spark in "standalone"
16 | # mode. Requires all other setup of files, configuration, etc., to be complete
17 | # already.
18 |
19 | set -o errexit
20 |
21 | source hadoop-env-setup.sh
22 |
23 | SPARK_MASTER="spark://${MASTER_HOSTNAME}:7077"
24 | sudo -u hadoop ${SPARK_INSTALL_DIR}/sbin/spark-daemon.sh start \
25 | org.apache.spark.deploy.worker.Worker 0 ${SPARK_MASTER}
26 |
--------------------------------------------------------------------------------
/extensions/spark/start_spark.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | set -o nounset
16 | set -o errexit
17 |
18 | if [[ ${SPARK_MODE} == 'standalone' ]]; then
19 | sudo -u hadoop ${SPARK_INSTALL_DIR}/sbin/start-all.sh
20 | fi
21 |
--------------------------------------------------------------------------------
/extensions/storm/README.md:
--------------------------------------------------------------------------------
1 | Deploying Apache Storm on Google Compute Engine
2 | ===============================================
3 |
4 | Basic Usage
5 | -----------
6 |
7 | Basic installation of [Apache Storm](https://storm.apache.org/) alongside Hadoop on Google Cloud Platform.
8 |
9 | ./bdutil -e extensions/storm/storm_env.sh deploy
10 |
11 | Or alternatively, using shorthand syntax:
12 |
13 | ./bdutil -e storm deploy
14 |
15 | Status
16 | ------
17 |
18 | This plugin is currently considered experimental and not officially supported.
19 | Contributions are welcome.
20 |
--------------------------------------------------------------------------------
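As a hedged smoke test after the deployment above, asking Nimbus for the (initially empty) topology list confirms the master-side daemons are reachable. This assumes the /home/hadoop/storm-install path used by install_storm.sh and that ./bdutil shell forwards stdin to the master, as the validate scripts elsewhere in this repo do:

    ./bdutil shell << 'STORM_SMOKE_TEST'
    # 'storm list' contacts Nimbus; an empty table is still a successful check.
    /home/hadoop/storm-install/bin/storm list
    STORM_SMOKE_TEST
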
/extensions/storm/install_storm.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | set -o errexit
15 |
16 | # Set up Storm
17 | STORM_MASTER_INSTANCE="${MASTER_HOSTNAME}"
18 |
19 | STORM_INSTALL_TMP_DIR="/storm-$(date +%s)"
20 | mkdir -p ${STORM_INSTALL_TMP_DIR}
21 |
22 | STORM_TARBALL_BASENAME=$(grep -o '[^/]*\.tar.gz' <<< ${STORM_TARBALL_URI})
23 | STORM_LOCAL_TARBALL="${STORM_INSTALL_TMP_DIR}/${STORM_TARBALL_BASENAME}"
24 | download_bd_resource ${STORM_TARBALL_URI} ${STORM_LOCAL_TARBALL}
25 |
26 | tar -C ${STORM_INSTALL_TMP_DIR} -xvzf ${STORM_LOCAL_TARBALL}
27 | mkdir -p $(dirname ${STORM_INSTALL_DIR})
28 | mv ${STORM_INSTALL_TMP_DIR}/apache-storm*/ ${STORM_INSTALL_DIR}
29 |
30 | STORM_LIB_DIR="${STORM_INSTALL_DIR}/lib"
31 |
32 | if (( ${ENABLE_STORM_BIGTABLE} )); then
33 | GOOGLE_STORM_LIB_DIR="${STORM_INSTALL_DIR}/lib/google"
34 | mkdir -p "${GOOGLE_STORM_LIB_DIR}"
35 | # Download the ALPN jar. ALPN_REMOTE_JAR should be a fully qualified URL.
36 | # download_bd_resource needs a fully qualified file path and not just a
37 | # directory name to put the file in when the file to download starts with
38 | # http://.
39 | ALPN_JAR_NAME="${ALPN_REMOTE_JAR##*/}"
40 | ALPN_BOOT_JAR="${GOOGLE_STORM_LIB_DIR}/${ALPN_JAR_NAME}"
41 | download_bd_resource "${ALPN_REMOTE_JAR}" "${ALPN_BOOT_JAR}"
42 | fi
43 |
44 |
45 | mkdir -p ${STORM_VAR}
46 | cat << EOF | tee -a ${STORM_INSTALL_DIR}/conf/storm.yaml
47 | storm.zookeeper.servers:
48 | - "${STORM_MASTER_INSTANCE}"
49 | nimbus.host: "${STORM_MASTER_INSTANCE}"
50 | storm.local.dir: "${STORM_VAR}"
51 | supervisor.slots.ports:
52 | - 6700
53 | - 6701
54 | - 6702
55 | - 6703
56 | storm.messaging.transport: 'backtype.storm.messaging.netty.Context'
57 | storm.messaging.netty.server_worker_threads: 1
58 | storm.messaging.netty.client_worker_threads: 1
59 | storm.messaging.netty.buffer_size: 5242880
60 | storm.messaging.netty.max_retries: 100
61 | storm.messaging.netty.max_wait_ms: 1000
62 | storm.messaging.netty.min_wait_ms: 100
63 |
64 | EOF
65 |
66 | if (( ${ENABLE_STORM_BIGTABLE} )); then
67 | cat << EOF | tee -a "${STORM_INSTALL_DIR}/conf/storm.yaml"
68 | worker.childopts: "-Xbootclasspath/p:${ALPN_BOOT_JAR}"
69 | EOF
70 | fi
71 |
72 | # Add the storm 'bin' path to the .bashrc so that it's easy to call 'storm'
73 | # during interactive ssh sessions.
74 | add_to_path_at_login "${STORM_INSTALL_DIR}/bin"
75 |
76 | # TODO(user): Fix this in a better way.
77 | cp /home/hadoop/hadoop-install/lib/gcs-connector*.jar /home/hadoop/storm-install/lib/
78 | cp /home/hadoop/hadoop-install/hadoop-core*.jar /home/hadoop/storm-install/lib/
79 | cp /home/hadoop/hadoop-install/lib/commons-configuration*.jar /home/hadoop/storm-install/lib/
80 |
81 | # Assign ownership of everything to the 'hadoop' user.
82 | chown -R hadoop:hadoop /home/hadoop/ ${STORM_VAR}
83 |
--------------------------------------------------------------------------------
/extensions/storm/install_supervisor.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Installs Supervisor using apt-get.
16 |
17 | # Strip the debian mirrors to force only using the GCS mirrors. Not ideal for
18 | # production usage due to stripping security.debian.org, but reduces external
19 | # load for non-critical use cases.
20 |
21 | install_application 'supervisor'
22 |
23 | # No easy way to install supervisor on CentOS and have it configured
24 | if ! [[ -x $(which apt-get) ]] && [[ -x $(which yum) ]]; then
25 | # Install supervisor
26 | yum install -y python-setuptools
27 | easy_install supervisor
28 | mkdir -p /etc/supervisor/conf.d/
29 | mkdir -p /var/log/supervisor
30 |
31 | # Set up the supervisor configuration
32 | cat > supervisord.conf < ${ZOOKEEPER_INSTALL_DIR}/conf/zoo.cfg
34 |
35 | # Sets the dir locations for the log and tracelog and sets root.logger value to "INFO, ROLLINGFILE" instead of "INFO, CONSOLE"
36 | perl -pi -e 's|^(zookeeper.(?:trace)?log.dir=).*|$1'${ZOOKEEPER_VAR}'/log| ; s|(?<=zookeeper.root.logger=).*|INFO, ROLLINGFILE| ;' \
37 | ${ZOOKEEPER_INSTALL_DIR}/conf/log4j.properties
38 |
39 |
40 | # Add the zookeeper 'bin' path to the .bashrc so that it's easy to access
41 | # zookeeper files during interactive ssh sessions.
42 | add_to_path_at_login "${ZOOKEEPER_INSTALL_DIR}/bin"
43 |
44 | # Assign ownership of everything to the 'hadoop' user.
45 | chown -R hadoop:hadoop /home/hadoop/ ${ZOOKEEPER_VAR}
46 |
47 | # Define Supervisor Configuration for ZooKeeper
48 | cat > /etc/supervisor/conf.d/zookeeper.conf <
2 |
37 |
38 |
39 |
40 |
41 |
42 | %d{yyyy-MM-dd HH:mm:ss} %c{1} [%p] %m%n
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/extensions/storm/start_storm_master.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Define Supervisor Configurations for Storm
16 | cat > /etc/supervisor/conf.d/storm.conf < /etc/supervisor/conf.d/storm.conf <) share of the memory available to
33 | # the NodeManager for containers. Thus an n1-standard-4 with CORES_PER_MAP_TASK
34 | # set to 2 would be able to host 4 / 2 = 2 map containers (and no other
35 | # containers). For more details see the script 'libexec/configure-mrv2-mem.py'.
36 | CORES_PER_MAP_TASK=1.0
37 |
38 | # Decimal number controlling the size of reduce containers in memory and virtual
39 | # cores. See CORES_PER_MAP_TASK for more details.
40 | CORES_PER_REDUCE_TASK=2.0
41 |
42 | # Decimal number controlling the size of application master containers in memory
43 | # and virtual cores. See CORES_PER_MAP_TASK for more details.
44 | CORES_PER_APP_MASTER=2.0
45 |
46 | # Connector with a Hadoop AbstractFileSystem implementation for YARN.
47 | GCS_CONNECTOR_JAR='https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-1.6.2-hadoop2.jar'
48 |
49 | BIGQUERY_CONNECTOR_JAR='https://storage.googleapis.com/hadoop-lib/bigquery/bigquery-connector-0.10.3-hadoop2.jar'
50 |
51 |
52 | HDFS_DATA_DIRS_PERM='700'
53 |
54 | # 8088 for YARN, 50070 for HDFS.
55 | MASTER_UI_PORTS=('8088' '50070')
56 |
57 | # Allow tuning of the YARN capacity scheduler limits below.
58 | YARN_SCHEDULER_CAPACITY_MAXIMUM_APPLICATIONS=10000
59 | YARN_SCHEDULER_CAPACITY_MAX_AM_PERCENT=0.2
60 |
61 | # Use Hadoop 2 specific configuration templates.
62 | if [[ -n "${BDUTIL_DIR}" ]]; then
63 | UPLOAD_FILES=($(find ${BDUTIL_DIR}/conf/hadoop2 -name '*template.xml'))
64 | UPLOAD_FILES+=("${BDUTIL_DIR}/libexec/hadoop_helpers.sh")
65 | UPLOAD_FILES+=("${BDUTIL_DIR}/libexec/configure_mrv2_mem.py")
66 | fi
67 |
68 | # Use Hadoop 2 specific start scripts
69 | COMMAND_GROUPS+=(
70 | 'deploy_start2:
71 | libexec/start_hadoop2.sh'
72 | )
73 |
74 | COMMAND_STEPS=(
75 | "deploy-ssh-master-setup,*"
76 | 'deploy-core-setup,deploy-core-setup'
77 | "*,deploy-ssh-worker-setup"
78 | "deploy-master-nfs-setup,*"
79 | "deploy-client-nfs-setup,deploy-client-nfs-setup"
80 | 'deploy_start2,*'
81 | )
82 |
--------------------------------------------------------------------------------
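The CORES_PER_MAP_TASK and CORES_PER_REDUCE_TASK comments above reduce to a simple ratio: containers of a given type per node is roughly the node's vcores divided by the cores-per-task setting. A back-of-the-envelope sketch only; the real sizing is done by libexec/configure_mrv2_mem.py, and the machine shape here is hypothetical:

    # An n1-standard-4 with CORES_PER_MAP_TASK=2.0 fits 4 / 2 = 2 map containers.
    VCORES=4
    CORES_PER_MAP_TASK=2.0
    python -c "print 'map containers per node:', int(${VCORES} / ${CORES_PER_MAP_TASK})"
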
/libexec/configure_hdfs.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Configures HDFS
16 |
17 | set -e
18 |
19 | source hadoop_helpers.sh
20 |
21 | if (( ${ENABLE_HDFS} )); then
22 |
23 | HDFS_ADMIN=$(get_hdfs_superuser)
24 |
25 | # Location of HDFS metadata on namenode
26 | export HDFS_NAME_DIR=/hadoop/dfs/name
27 |
28 | # If disks are mounted use all of them for HDFS data
29 | MOUNTED_DISKS=($(find /mnt -maxdepth 1 -mindepth 1))
30 | if [[ ${#MOUNTED_DISKS[@]} -eq 0 ]]; then
31 | MOUNTED_DISKS=('')
32 | fi
33 |
34 | # Location of HDFS data blocks on datanodes; for each mounted disk, add the
35 | # path /mnt/diskname/hadoop/dfs/data as a data directory, or if no mounted
36 | # disks exist, just go with the absolute path /hadoop/dfs/data.
37 | HDFS_DATA_DIRS="${MOUNTED_DISKS[@]/%//hadoop/dfs/data}"
38 |
39 | # Do not create HDFS_NAME_DIR, or Hadoop will think it is already formatted
40 | mkdir -p /hadoop/dfs ${HDFS_DATA_DIRS}
41 |
42 | chown ${HDFS_ADMIN}:hadoop -L -R /hadoop/dfs ${HDFS_DATA_DIRS}
43 |
44 | # Make sure the data dirs have the expected permissions.
45 | chmod ${HDFS_DATA_DIRS_PERM} ${HDFS_DATA_DIRS}
46 |
47 | # Set general Hadoop environment variables
48 |
49 | # Calculate the memory allocation in MB using 'free -m', flooring to the nearest MB.
50 | TOTAL_MEM=$(free -m | awk '/^Mem:/{print $2}')
51 | NAMENODE_MEM_MB=$(python -c "print int(${TOTAL_MEM} * \
52 | ${HDFS_MASTER_MEMORY_FRACTION} / 2)")
53 | SECONDARYNAMENODE_MEM_MB=${NAMENODE_MEM_MB}
54 |
55 | cat << EOF >> ${HADOOP_CONF_DIR}/hadoop-env.sh
56 |
57 | # Increase the maximum NameNode / SecondaryNameNode heap.
58 | HADOOP_NAMENODE_OPTS="-Xmx${NAMENODE_MEM_MB}m \${HADOOP_NAMENODE_OPTS}"
59 | HADOOP_SECONDARYNAMENODE_OPTS="-Xmx${SECONDARYNAMENODE_MEM_MB}m \${HADOOP_SECONDARYNAMENODE_OPTS}"
60 | EOF
61 |
62 | # Increase maximum number of files for HDFS
63 | MAX_FILES=16384
64 | ulimit -n ${MAX_FILES}
65 | cat << EOF > /etc/security/limits.d/hadoop.conf
66 | ${HDFS_ADMIN} hard nofile ${MAX_FILES}
67 | ${HDFS_ADMIN} soft nofile ${MAX_FILES}
68 | EOF
69 |
70 | export HDFS_DATA_DIRS="${HDFS_DATA_DIRS// /,}"
71 |
72 | bdconfig merge_configurations \
73 | --configuration_file ${HADOOP_CONF_DIR}/hdfs-site.xml \
74 | --source_configuration_file hdfs-template.xml \
75 | --resolve_environment_variables \
76 | --create_if_absent \
77 | --clobber
78 | fi
79 |
--------------------------------------------------------------------------------
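The expression ${MOUNTED_DISKS[@]/%//hadoop/dfs/data} in the script above uses bash's suffix-append form of pattern substitution: the % anchors an empty pattern at the end of each array element, so the replacement string is simply appended. A standalone illustration:

    MOUNTED_DISKS=('/mnt/pd1' '/mnt/pd2')
    echo "${MOUNTED_DISKS[@]/%//hadoop/dfs/data}"
    # -> /mnt/pd1/hadoop/dfs/data /mnt/pd2/hadoop/dfs/data

    # With the single empty element used as the no-disk fallback, the result
    # collapses to the absolute path /hadoop/dfs/data.
    MOUNTED_DISKS=('')
    echo "${MOUNTED_DISKS[@]/%//hadoop/dfs/data}"
    # -> /hadoop/dfs/data
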
/libexec/install_and_configure_bigquery_connector.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Downloads and installs the relevant bigquery-connector-<version>.jar.
16 | # Also configures it for use with hadoop.
17 |
18 | set -e
19 |
20 | if (( ${INSTALL_BIGQUERY_CONNECTOR} )); then
21 | if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then
22 | . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh"
23 | fi
24 | if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \
25 | [[ -n "${HADOOP_PREFIX}" ]]; then
26 | LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}"
27 | else
28 | LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib"
29 | fi
30 |
31 |
32 | # Grab the connector jarfile, add it to installation /lib directory.
33 | JARNAME=$(grep -o '[^/]*\.jar' <<< ${BIGQUERY_CONNECTOR_JAR})
34 | LOCAL_JAR="${LIB_JARS_DIR}/${JARNAME}"
35 |
36 | download_bd_resource "${BIGQUERY_CONNECTOR_JAR}" "${LOCAL_JAR}"
37 |
38 | chown hadoop:hadoop ${LOCAL_JAR}
39 |
40 | echo "export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:${LOCAL_JAR}" \
41 | >> ${HADOOP_CONF_DIR}/hadoop-env.sh
42 |
43 | bdconfig merge_configurations \
44 | --configuration_file ${HADOOP_CONF_DIR}/mapred-site.xml \
45 | --source_configuration_file bq-mapred-template.xml \
46 | --resolve_environment_variables \
47 | --create_if_absent \
48 | --noclobber
49 |
50 | chown -R hadoop:hadoop ${HADOOP_CONF_DIR}
51 | fi
52 |
--------------------------------------------------------------------------------
/libexec/install_and_configure_gcs_connector.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Downloads and installs the relevant gcs-connector-<version>.jar.
16 | # Also configures it for use with hadoop.
17 |
18 | if (( ${INSTALL_GCS_CONNECTOR} )) ; then
19 |
20 | if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then
21 | . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh"
22 | fi
23 | if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \
24 | [[ -n "${HADOOP_PREFIX}" ]]; then
25 | LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}"
26 | else
27 | LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib"
28 | fi
29 |
30 | # Grab the connector jarfile, add it to installation /lib directory.
31 | JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR})
32 | LOCAL_JAR="${LIB_JARS_DIR}/${JARNAME}"
33 |
34 | download_bd_resource "${GCS_CONNECTOR_JAR}" "${LOCAL_JAR}"
35 |
36 | echo "export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:${LOCAL_JAR}" \
37 | >> ${HADOOP_CONF_DIR}/hadoop-env.sh
38 |
39 | if (( ${ENABLE_NFS_GCS_FILE_CACHE} )); then
40 | export GCS_METADATA_CACHE_TYPE='FILESYSTEM_BACKED'
41 | export GCS_FILE_CACHE_DIRECTORY="$(get_nfs_mount_point)"
42 | else
43 | export GCS_METADATA_CACHE_TYPE='IN_MEMORY'
44 | # For IN_MEMORY cache, this directory won't actually be used, but we set
45 | # it to a sane default for easy manual experimentation with file caching.
46 | export GCS_FILE_CACHE_DIRECTORY='/tmp/gcs_connector_metadata_cache'
47 | fi
48 | bdconfig merge_configurations \
49 | --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \
50 | --source_configuration_file gcs-core-template.xml \
51 | --resolve_environment_variables \
52 | --create_if_absent \
53 | --noclobber
54 |
55 | # Install a script that can be used to cleanup filesystem-based GCS caches.
56 | if [[ "$(hostname -s)" == "${MASTER_HOSTNAME}" \
57 | && "${ENABLE_NFS_GCS_FILE_CACHE}" -ne 0 ]] ; then
58 | setup_cache_cleaner
59 | fi
60 | fi
61 |
--------------------------------------------------------------------------------
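Once the script above has put the connector jar on HADOOP_CLASSPATH and merged gcs-core-template.xml into core-site.xml, gs:// paths should be usable directly. A hedged smoke test for the master; the bucket name is hypothetical and should be one the cluster's service account can read:

    # The jar should appear on the Hadoop classpath...
    hadoop classpath | tr ':' '\n' | grep gcs-connector
    # ...and a gs:// listing should go through the connector.
    hadoop fs -ls gs://my-example-bucket/
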
/libexec/install_bdconfig.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Downloads and installs bdconfig and the xml templates
16 |
17 | set -e
18 |
19 | # Download and use bdconfig for xml configuration.
20 | if [[ ! -f "$(which bdconfig)" ]]; then
21 | download_bd_resource "${BDCONFIG}" /tmp/bdconfig.tar.gz
22 | mkdir -p /usr/local/share/google
23 | tar -C /usr/local/share/google -xzf /tmp/bdconfig.tar.gz
24 | ln -s /usr/local/share/google/bdconfig*/bdconfig /usr/local/bin
25 | fi
26 |
--------------------------------------------------------------------------------
/libexec/install_java.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Installs the OpenJDK 7 JRE (or full JDK) using the system package manager.
16 |
17 | # Strip the debian mirrors to force only using the GCS mirrors. Not ideal for
18 | # production usage due to stripping security.debian.org, but reduces external
19 | # load for non-critical use cases.
20 |
21 | if (( ${INSTALL_JDK_DEVEL} )); then
22 | echo 'Installing JDK with compiler and tools'
23 | install_application "openjdk-7-jdk" "java-1.7.0-openjdk-devel"
24 | else
25 | echo 'Installing minimal JRE'
26 | install_application "openjdk-7-jre-headless" "java-1.7.0-openjdk"
27 | fi
28 |
--------------------------------------------------------------------------------
/libexec/mount_disks.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Mounts any attached non-boot persistent and ephemeral disks.
16 |
17 | set -e
18 |
19 | # Get a list of disks from the metadata server.
20 | BASE_DISK_URL='http://metadata.google.internal/computeMetadata/v1/instance/disks/'
21 | MOUNT_TOOL_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/compute-image-packages/legacy/google-startup-scripts/usr/share/google/safe_format_and_mount'
22 | DISK_PATHS=$(curl_v1_metadata "${BASE_DISK_URL}")
23 | MOUNTED_DISKS=()
24 |
25 | MOUNT_TOOL=/tmp/${MOUNT_TOOL_URL##*/}
26 | download_bd_resource ${MOUNT_TOOL_URL} ${MOUNT_TOOL}
27 | chmod a+x ${MOUNT_TOOL}
28 |
29 | for DISK_PATH in ${DISK_PATHS}; do
30 | # Use the metadata server to determine the official index/name of each disk.
31 | DISK_NAME=$(curl_v1_metadata "${BASE_DISK_URL}${DISK_PATH}device-name")
32 | DISK_INDEX=$(curl_v1_metadata "${BASE_DISK_URL}${DISK_PATH}index")
33 | DISK_TYPE=$(curl_v1_metadata "${BASE_DISK_URL}${DISK_PATH}type")
34 |
35 | # Index '0' is the boot disk and is thus already mounted.
36 | if [[ "${DISK_INDEX}" == '0' ]]; then
37 | echo "Boot disk is ${DISK_NAME}; will not attempt to mount it."
38 | continue
39 | fi
40 |
41 | if [[ "${DISK_TYPE}" == 'EPHEMERAL' ]]; then
42 | DISK_PREFIX='ed'
43 | elif [[ "${DISK_TYPE}" == 'PERSISTENT' ]]; then
44 | DISK_PREFIX='pd'
45 | fi
46 |
47 | # The metadata-specified 'name' can be converted to a disk 'id' by prepending
48 | # 'google-' and finding it under /dev/disk/by-id.
49 | DISK_ID="/dev/disk/by-id/google-${DISK_NAME}"
50 | echo "Resolved disk name '${DISK_NAME}' to expected path '${DISK_ID}'."
51 |
52 | # We will name the mount-point after the official 'disk index'; this means
53 | # there will be no mounted disk with suffix '0' since '0' is the boot disk.
54 | DATAMOUNT="/mnt/${DISK_PREFIX}${DISK_INDEX}"
55 | mkdir -p ${DATAMOUNT}
56 | MOUNTED_DISKS+=(${DATAMOUNT})
57 | echo "Mounting '${DISK_ID}' under mount point '${DATAMOUNT}'..."
58 |
59 | ${MOUNT_TOOL} -m 'mkfs.ext4 -F' ${DISK_ID} ${DATAMOUNT}
60 |
61 | # Idempotently update /etc/fstab
62 | if ! cut -d '#' -f 1 /etc/fstab | grep -qw ${DATAMOUNT}; then
63 | DISK_UUID=$(blkid ${DISK_ID} -s UUID -o value)
64 | MOUNT_ENTRY=($(grep -w ${DATAMOUNT} /proc/mounts))
65 | # Taken from /usr/share/google/safe_format_and_mount
66 | MOUNT_OPTIONS='defaults,discard'
67 | echo "UUID=${DISK_UUID} ${MOUNT_ENTRY[@]:1:2} ${MOUNT_OPTIONS} 0 2 \
68 | # added by bdutil" >> /etc/fstab
69 | fi
70 | done
71 |
72 | # If disks are mounted use the first one to hold target of symlink /hadoop
73 | if (( ${#MOUNTED_DISKS[@]} )); then
74 | MOUNTED_HADOOP_DIR=${MOUNTED_DISKS[0]}/hadoop
75 | mkdir -p ${MOUNTED_HADOOP_DIR}
76 | if [[ ! -d /hadoop ]]; then
77 | ln -s ${MOUNTED_HADOOP_DIR} /hadoop
78 | fi
79 | fi
80 |
--------------------------------------------------------------------------------
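The loop above leans on the v1 metadata server to enumerate attached disks via the curl_v1_metadata helper (defined in libexec/bdutil_helpers.sh, which is not shown in this section). A hedged sketch of the kind of request that helper presumably wraps; the Metadata-Flavor header is required by the v1 endpoint:

    # List the instance's disk entries; each entry exposes device-name, index
    # and type, which the script above reads one attribute at a time.
    curl -s -H 'Metadata-Flavor: Google' \
        'http://metadata.google.internal/computeMetadata/v1/instance/disks/?recursive=true'
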
/libexec/set_default_fs.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Sets the default file system for Hadoop.
16 |
17 | set -e
18 |
19 | # Set FS specific config variables
20 | if [[ "${DEFAULT_FS}" == 'gs' ]]; then
21 | DEFAULT_FS_NAME="gs://${CONFIGBUCKET}/"
22 | elif [[ "${DEFAULT_FS}" == 'hdfs' ]]; then
23 | DEFAULT_FS_NAME="${NAMENODE_URI}"
24 | fi
25 |
26 | bdconfig set_property \
27 | --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \
28 | --name 'fs.default.name' \
29 | --value ${DEFAULT_FS_NAME} \
30 | --clobber
31 |
32 | bdconfig set_property \
33 | --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \
34 | --name 'fs.defaultFS' \
35 | --value ${DEFAULT_FS_NAME} \
36 | --clobber
37 |
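As a quick sanity check, assuming a GCS-backed deployment with an illustrative bucket name, the two properties written above should both resolve to the same URI:

```bash
grep -A 1 -E 'fs\.default\.name|fs\.defaultFS' ${HADOOP_CONF_DIR}/core-site.xml
#   <name>fs.defaultFS</name>
#   <value>gs://my-config-bucket/</value>   # or ${NAMENODE_URI} when DEFAULT_FS='hdfs'
```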
--------------------------------------------------------------------------------
/libexec/setup_client_nfs.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | if (( ${INSTALL_GCS_CONNECTOR} )) && \
16 | (( ${ENABLE_NFS_GCS_FILE_CACHE} )) ; then
17 | # Set up the GCS_ADMIN user.
18 | setup_gcs_admin
19 |
20 | install_application "nfs-common" "nfs-utils"
21 | install_application "autofs"
22 |
23 | NFS_MOUNT_POINT="$(get_nfs_mount_point)"
24 | NFS_EXPORT_POINT="$(get_nfs_export_point)"
25 |
26 | mkdir -p "${NFS_MOUNT_POINT}"
27 | chown ${GCS_ADMIN}:${GCS_ADMIN} "${NFS_MOUNT_POINT}"
28 | if ! grep -e "auto.hadoop_gcs_metadata_cache" /etc/auto.master ; then
29 | echo "/- /etc/auto.hadoop_gcs_metadata_cache nobind" >> /etc/auto.master
30 | fi
31 |
32 | MOUNT_STRING="/${NFS_MOUNT_POINT} -fstype=nfs,defaults,rw,hard,intr"
33 | MOUNT_STRING="${MOUNT_STRING} ${GCS_CACHE_MASTER_HOSTNAME}:${NFS_EXPORT_POINT}"
34 | echo "${MOUNT_STRING}" > /etc/auto.hadoop_gcs_metadata_cache
35 |
36 | if [[ -f /usr/lib/systemd/system/autofs.service ]] \
37 | && which systemctl ; then
38 | systemctl enable autofs
39 | fi
40 |
41 | service autofs restart
42 | fi
43 |
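For orientation, the generated direct map contains a single entry of the following shape; the real mount and export paths come from get_nfs_mount_point and get_nfs_export_point, so the placeholders below are purely illustrative:

```bash
cat /etc/auto.hadoop_gcs_metadata_cache
# /<nfs-mount-point> -fstype=nfs,defaults,rw,hard,intr <gcs-cache-master>:<nfs-export-point>
```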
--------------------------------------------------------------------------------
/libexec/setup_hadoop_user.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Adds a new 'hadoop' user which will be used to run the hadoop servers.
16 |
17 | set -e
18 |
19 | mkdir -p /home/hadoop
20 | mkdir -p /home/hadoop/.ssh
21 |
22 | if ! (id -u hadoop >& /dev/null); then
23 | useradd --system --shell /bin/bash -M --home /home/hadoop --user-group hadoop
24 | fi
25 |
26 | if skeleton_files=$(find /etc/skel/ -maxdepth 1 -type f); then
27 | cp ${skeleton_files} /home/hadoop
28 | fi
29 |
30 | chown -R hadoop:hadoop /home/hadoop
31 |
32 | mkdir -p ~hadoop/.ssh
33 | chown -R hadoop:hadoop ~hadoop/.ssh/
34 |
35 | if [[ -x $(which restorecon) ]]; then
36 | restorecon -Rv /home
37 | fi
38 |
39 | mkdir -p /var/log/hadoop
40 | chown hadoop:hadoop /var/log/hadoop
41 |
--------------------------------------------------------------------------------
/libexec/setup_master_ssh.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Sets up ssh keys on the master and then uploads the public key to the GCS
16 | # CONFIGBUCKET for workers to later download.
17 |
18 | set -e
19 |
20 | mkdir -p /home/hadoop/.ssh/
21 | chmod 700 /home/hadoop/.ssh
22 |
23 | PRIVATE_KEY_NAME='hadoop_master_id_rsa'
24 | PUBLIC_KEY_NAME="${PRIVATE_KEY_NAME}.pub"
25 | LOCAL_PUBLIC_KEY="/home/hadoop/.ssh/${PUBLIC_KEY_NAME}"
26 | REMOTE_PUBLIC_KEY="${BDUTIL_GCS_STAGING_DIR}/${PUBLIC_KEY_NAME}"
27 | LOCAL_PRIVATE_KEY="/home/hadoop/.ssh/${PRIVATE_KEY_NAME}"
28 |
29 | ssh-keygen -N "" -f ${LOCAL_PRIVATE_KEY}
30 |
31 | # Authorize ssh into self as well, in case the master is also a worker node.
32 | cat ${LOCAL_PUBLIC_KEY} >> /home/hadoop/.ssh/authorized_keys
33 |
34 | echo "Host ${PREFIX}*" >> /home/hadoop/.ssh/config
35 | echo " IdentityFile ${LOCAL_PRIVATE_KEY}" >> /home/hadoop/.ssh/config
36 | echo ' UserKnownHostsFile /dev/null' >> /home/hadoop/.ssh/config
37 | echo ' CheckHostIP no' >> /home/hadoop/.ssh/config
38 | echo ' StrictHostKeyChecking no' >> /home/hadoop/.ssh/config
39 |
40 | gsutil cp ${LOCAL_PUBLIC_KEY} ${REMOTE_PUBLIC_KEY}
41 |
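The resulting /home/hadoop/.ssh/config ends up looking like this (PREFIX shown as 'hadoop' purely for illustration):

```bash
cat /home/hadoop/.ssh/config
# Host hadoop-*
#   IdentityFile /home/hadoop/.ssh/hadoop_master_id_rsa
#   UserKnownHostsFile /dev/null
#   CheckHostIP no
#   StrictHostKeyChecking no
```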
--------------------------------------------------------------------------------
/libexec/setup_worker_ssh.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Downloads the shared public ssh key previously generated by the hadoop
16 | # master and uses it to configure intra-cluster ssh access.
17 |
18 | set -e
19 |
20 | mkdir -p ~hadoop/.ssh/
21 |
22 | PRIVATE_KEY_NAME='hadoop_master_id_rsa'
23 | PUBLIC_KEY_NAME="${PRIVATE_KEY_NAME}.pub"
24 | LOCAL_PUBLIC_KEY="/home/hadoop/.ssh/${PUBLIC_KEY_NAME}"
25 | REMOTE_PUBLIC_KEY="${BDUTIL_GCS_STAGING_DIR}/${PUBLIC_KEY_NAME}"
26 |
27 | gsutil cp ${REMOTE_PUBLIC_KEY} ${LOCAL_PUBLIC_KEY}
28 | cat ${LOCAL_PUBLIC_KEY} >> ~hadoop/.ssh/authorized_keys
29 |
30 | echo "Host ${PREFIX}*" >> ~hadoop/.ssh/config
31 | echo ' UserKnownHostsFile /dev/null' >> ~hadoop/.ssh/config
32 | echo ' CheckHostIP no' >> ~hadoop/.ssh/config
33 | echo ' StrictHostKeyChecking no' >> ~hadoop/.ssh/config
34 |
35 | chown -R hadoop:hadoop ~hadoop/.ssh/
36 | chmod 700 ~hadoop/.ssh
37 |
--------------------------------------------------------------------------------
/libexec/start_hadoop.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Starts relevant hadoop daemon servers as the 'hadoop' user.
16 | set -e
17 |
18 | source hadoop_helpers.sh
19 |
20 | HADOOP_PORTS=(50010 50020 50030 50060 50070 50075 50090)
21 |
22 | cd ${HADOOP_INSTALL_DIR}
23 |
24 | # Test for sshability to workers.
25 | for NODE in ${WORKERS[@]}; do
26 | sudo -u hadoop ssh ${NODE} "exit 0"
27 | done
28 |
29 | # Wait for our ports to be free, but keep running even if not.
30 | wait_until_ports_free_and_report "${HADOOP_PORTS[@]}" || true
31 |
32 | # Start namenode and jobtracker
33 | if (( ${ENABLE_HDFS} )); then
34 | start_with_retry_namenode start_dfs_hadoop_1 &
35 | fi
36 | start_with_retry_jobtracker &
37 | for SUBPROC in $(jobs -p); do
38 | wait ${SUBPROC}
39 | done
40 |
41 | check_filesystem_accessibility
42 |
--------------------------------------------------------------------------------
/libexec/start_hadoop2.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Starts relevant hadoop daemon servers as the 'hadoop' user.
16 |
17 | set -e
18 |
19 | source hadoop_helpers.sh
20 |
21 | HADOOP_PORTS=(8088 50010 50020 50070 50090)
22 |
23 | cd ${HADOOP_INSTALL_DIR}
24 |
25 | # Test for sshability to workers.
26 | for NODE in ${WORKERS[@]}; do
27 | sudo -u hadoop ssh ${NODE} "exit 0"
28 | done
29 |
30 | # Wait for our ports to be free, but keep running even if not.
31 | wait_until_ports_free_and_report "${HADOOP_PORTS[@]}" || true
32 |
33 | if (( ${ENABLE_HDFS} )); then
34 | # Start HDFS daemons
35 | start_with_retry_namenode start_dfs_hadoop_2
36 |
37 | if [[ "${DEFAULT_FS}" == 'hdfs' ]]; then
38 | # Set up HDFS /tmp and /user dirs
39 | initialize_hdfs_dirs
40 | fi
41 | fi
42 |
43 | # Start up resource and node managers
44 | sudo -u hadoop ./sbin/start-yarn.sh
45 | service hadoop-mapreduce-historyserver start
46 |
47 | check_filesystem_accessibility
48 |
--------------------------------------------------------------------------------
/platforms/cdh/README.md:
--------------------------------------------------------------------------------
1 | Deploying Cloudera Data Hub (CDH) on Google Compute Engine
2 | ==========================================================
3 |
4 | Basic Usage
5 | -----------
6 |
7 | This plugin replaces the vanilla Apache binary tarballs with [Cloudera Data Hub](http://www.cloudera.com/content/cloudera/en/products-and-services/cdh.html) packages. Cluster configuration is the same as in core bdutil.
8 |
9 | ./bdutil -e platforms/cdh/cdh_env.sh deploy
10 |
11 | Or alternatively, using shorthand syntax:
12 |
13 | ./bdutil -e cdh deploy
14 |
15 | Status
16 | ------
17 |
18 | This plugin is currently considered experimental and not officially supported.
19 | Contributions are welcome.
20 |
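For example, a complete create-and-teardown cycle, passing the bucket and project explicitly (both names are placeholders):

    ./bdutil -b my-config-bucket -p my-project -e cdh deploy
    ./bdutil -b my-config-bucket -p my-project -e cdh delete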
--------------------------------------------------------------------------------
/platforms/cdh/cdh-core-template.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 |
3 | <configuration>
4 | <property>
5 | <name>hadoop.proxyuser.hue.hosts</name>
6 | <value>*</value>
7 | </property>
8 | <property>
9 | <name>hadoop.proxyuser.hue.groups</name>
10 | <value>*</value>
11 | </property>
12 | <property>
13 | <name>hadoop.proxyuser.oozie.hosts</name>
14 | <value>*</value>
15 | </property>
16 | <property>
17 | <name>hadoop.proxyuser.oozie.groups</name>
18 | <value>*</value>
19 | </property>
20 | </configuration>
21 |
--------------------------------------------------------------------------------
/platforms/cdh/cdh_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Extension file for deploying CDH with bdutil
16 |
17 | # Requires Hadoop 2 libraries (for recent versions at least).
18 | import_env hadoop2_env.sh
19 |
20 | # Change these.
21 | CDH_VERSION=5
22 | # Components are installed / started in the order they are listed.
23 | MASTER_COMPONENTS="hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode
24 | hadoop-yarn-resourcemanager hadoop-mapreduce-historyserver
25 | hive-metastore hive pig oozie hue"
26 | DATANODE_COMPONENTS="hadoop-hdfs-datanode hadoop-yarn-nodemanager
27 | hadoop-mapreduce"
28 |
29 | # Install JDK with compiler/tools instead of just the minimal JRE.
30 | INSTALL_JDK_DEVEL=true
31 |
32 | # Hardware configuration.
33 | NUM_WORKERS=4
34 | WORKER_ATTACHED_PDS_SIZE_GB=1500
35 | MASTER_ATTACHED_PD_SIZE_GB=1500
36 |
37 | # Don't change these.
38 | HADOOP_CONF_DIR='/etc/hadoop/conf'
39 | HADOOP_INSTALL_DIR='/usr/lib/hadoop'
40 | DEFAULT_FS='hdfs'
41 | UPLOAD_FILES+=('platforms/cdh/cdh-core-template.xml')
42 | USE_ATTACHED_PDS=true
43 |
44 | COMMAND_GROUPS+=(
45 | "deploy-cdh:
46 | libexec/mount_disks.sh
47 | libexec/install_java.sh
48 | platforms/cdh/install_cdh.sh
49 | libexec/install_bdconfig.sh
50 | libexec/configure_hadoop.sh
51 | libexec/install_and_configure_gcs_connector.sh
52 | libexec/configure_hdfs.sh
53 | libexec/set_default_fs.sh
54 | platforms/cdh/configure_cdh.sh"
55 |
56 | "restart_services:
57 | platforms/restart_services.sh"
58 | )
59 |
60 | COMMAND_STEPS=(
61 | 'deploy-cdh,deploy-cdh'
62 | 'deploy-master-nfs-setup,*'
63 | 'deploy-client-nfs-setup,deploy-client-nfs-setup'
64 | 'restart_services,restart_services'
65 | )
66 |
--------------------------------------------------------------------------------
/platforms/cdh/configure_cdh.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2014 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS-IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Misc configurations for components not installed elsewhere.
17 | # Not necessarily CDH specific.
18 |
19 | # Use FQDNs
20 | grep ${HOSTNAME} -lR ${HADOOP_CONF_DIR} \
21 | | xargs -r sed -i "s/${HOSTNAME}/$(hostname --fqdn)/g"
22 |
23 | # Configure Hive Metastore
24 | if dpkg -s hive-metastore > /dev/null; then
25 | # Configure Hive metastore
26 | bdconfig set_property \
27 | --configuration_file /etc/hive/conf/hive-site.xml \
28 | --name 'hive.metastore.uris' \
29 | --value "thrift://$(hostname --fqdn):9083" \
30 | --clobber
31 | fi
32 |
33 | # Configure Hue
34 | if dpkg -s hue > /dev/null; then
35 | # Replace localhost with hostname.
36 | sed -i "s/#*\([^#]*=.*\)localhost/\1$(hostname --fqdn)/" /etc/hue/conf/hue.ini
37 | fi
38 |
39 | # Configure Oozie
40 | if dpkg -s oozie > /dev/null; then
41 | sudo -u oozie /usr/lib/oozie/bin/ooziedb.sh create -run
42 |
43 | # Try to enable gs:// paths
44 | bdconfig set_property \
45 | --configuration_file /etc/oozie/conf/oozie-site.xml \
46 | --name 'oozie.service.HadoopAccessorService.supported.filesystems' \
47 | --value 'hdfs,gs,webhdfs,hftp' \
48 | --clobber
49 | fi
50 |
51 | # Enable WebHDFS
52 | bdconfig set_property \
53 | --configuration_file ${HADOOP_CONF_DIR}/hdfs-site.xml \
54 | --name 'dfs.webhdfs.enabled' \
55 | --value true \
56 | --clobber
57 |
58 | # Enable Hue / Oozie impersonation
59 | bdconfig merge_configurations \
60 | --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \
61 | --source_configuration_file cdh-core-template.xml \
62 | --resolve_environment_variables \
63 | --clobber
64 |
--------------------------------------------------------------------------------
/platforms/cdh/install_cdh.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2014 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS-IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | #TODO(user) support other Linux distributions.
17 | ARCHIVE_URL="http://archive.cloudera.com/cdh${CDH_VERSION}/debian/jessie/amd64/cdh"
18 | cat << EOF > /etc/apt/sources.list.d/cloudera.list
19 | deb ${ARCHIVE_URL} jessie-cdh${CDH_VERSION} contrib
20 | deb-src ${ARCHIVE_URL} jessie-cdh${CDH_VERSION} contrib
21 | EOF
22 | # TODO(user): fix insecure download of apt-key.
23 | download_bd_resource ${ARCHIVE_URL}/archive.key /tmp/cloudera.key
24 | apt-key add /tmp/cloudera.key
25 |
26 | apt-get update
27 |
28 | if [[ $(hostname -s) == ${MASTER_HOSTNAME} ]]; then
29 | COMPONENTS="${MASTER_COMPONENTS}"
30 | else
31 | COMPONENTS="${DATANODE_COMPONENTS}"
32 | fi
33 |
34 | for COMPONENT in ${COMPONENTS}; do
35 | if ! install_application ${COMPONENT}; then
36 | # Check that it was actually installed, as services often fail to start.
37 | dpkg -s ${COMPONENT}
38 | fi
39 | # Stop installed services:
40 | if [[ -x "/etc/init.d/${COMPONENT}" ]]; then
41 | service ${COMPONENT} stop
42 | fi
43 | done
44 |
--------------------------------------------------------------------------------
/platforms/hdp/TEST.md:
--------------------------------------------------------------------------------
1 | ## Prep
2 |
3 | ```
4 | CONFIGBUCKET=hdp-00
5 | PROJECT=hdp-00
6 | switches="-b ${CONFIGBUCKET} -p ${PROJECT}"
7 |
8 | # add this to make it a smaller test than the defaults
9 | switches+="
10 | --master_attached_pd_size_gb 100
11 | --worker_attached_pds_size_gb 100
12 | -n 1
13 | -m n1-standard-2"
14 |
15 |
16 | bdutil="./bdutil ${switches}"
17 | ```
18 |
19 | ## Test ambari_env.sh
20 |
21 | ```
22 | environment=platforms/hdp/ambari_env.sh
23 | bdutil="${bdutil} -e ${environment}"
24 |
25 | ## deploy
26 | ${bdutil} deploy
27 |
28 | ## test
29 | ${bdutil} shell < ./hadoop-validate-setup.sh
30 | ${bdutil} shell < ./hadoop-validate-gcs.sh
31 | ${bdutil} shell < ./extensions/querytools/hive-validate-setup.sh
32 | ${bdutil} shell < ./extensions/querytools/pig-validate-setup.sh
33 | #${bdutil} shell < ./extensions/spark/spark-validate-setup.sh
34 |
35 | ## delete
36 | ${bdutil} delete
37 | ```
38 |
39 |
40 | ## Test ambari_manual_env.sh
41 |
42 | ```
43 | environment=platforms/hdp/ambari_manual_env.sh
44 | bdutil="${bdutil} -e ${environment}"
45 |
46 | ## deploy
47 | ${bdutil} deploy
48 |
49 | ## test
50 | # need to add an automated test here:
51 | ${bdutil} shell # do something here like check the appropriate number of hosts in /api/v1/hosts
52 |
53 | ## delete
54 | ${bdutil} delete
55 |
56 | ```
57 |
58 | ## Test re-using disks across multiple deployments of same instance count
59 |
60 | ```
61 | environment=platforms/hdp/ambari_env.sh
62 | bdutil="${bdutil} -e ${environment}"
63 | unset CREATE_ATTACHED_PDS_ON_DEPLOY
64 | unset DELETE_ATTACHED_PDS_ON_DELETE
65 |
66 | ## create
67 | export CREATE_ATTACHED_PDS_ON_DEPLOY=true
68 | ${bdutil} deploy
69 |
70 | ## generate some data onto HDFS, and don't delete it
71 | echo "hadoop fs -mkdir redeploy-validation.tmp" | ${bdutil} shell
72 | ## if you want more data than that:
73 | #${bdutil} -u hadoop-validate-setup.sh run_command -- \
74 | # sudo -u "$(whoami)" TERA_CLEANUP_SKIP=true TERA_GEN_NUM_RECORDS=100000 ./hadoop-validate-setup.sh
75 |
76 | ## check that the ‘validate_...’ dir is there
77 | echo "hadoop fs -ls" | ${bdutil} shell
78 |
79 | ## delete the cluster but keep disks
80 | export DELETE_ATTACHED_PDS_ON_DELETE=false
81 | ${bdutil} delete
82 |
83 | ## create with existing disks
84 | export CREATE_ATTACHED_PDS_ON_DEPLOY=false
85 | ${bdutil} deploy
86 |
87 | ## check that the ‘validate_...’ dir is there
88 | echo "hadoop fs -ls" | ${bdutil} shell
89 |
90 | ## delete everything to cleanup this testing
91 | export DELETE_ATTACHED_PDS_ON_DELETE=true
92 | ${bdutil} delete
93 | ```
94 |
--------------------------------------------------------------------------------
/platforms/hdp/ambari.conf:
--------------------------------------------------------------------------------
1 | ########################################################################
2 | ########################################################################
3 | ## This is the base configuration file for the ##
4 | ## Hortonworks Data Platform (HDP) extension to Google's `bdutil` ##
5 | ## ##
6 | ## Most of the values are commented out and just shown here for ##
7 | ## completeness, together with their default value. ##
8 | ########################################################################
9 | ########################################################################
10 |
11 | ## ambari.conf
12 | ## Provides configuration for 'bdutil' installations of Ambari
13 |
14 |
15 | ## bdutil setting overrides
16 | ## For further details see:
17 | ## `bdutil_env.sh`
18 | ## https://cloud.google.com/hadoop/setting-up-a-hadoop-cluster
19 |
20 | ## Your Google Cloud Platform configbucket & project
21 | ## Must be set here,
22 | ## or in `bdutil_env.sh`
23 | ## or with the -b & -p switches to `bdutil`
24 | #CONFIGBUCKET=""
25 | #PROJECT=""
26 |
27 | ## the region/zone to deploy into
28 | #GCE_ZONE='us-central1-a'
29 |
30 | ## Number of worker nodes. Total nodes will be NUM_WORKERS+1
31 | #NUM_WORKERS=4
32 |
33 | ## Google Compute Engine machine type
34 | #GCE_MACHINE_TYPE='n1-standard-4'
35 |
36 | ## Amount of storage to attach
37 | #WORKER_ATTACHED_PDS_SIZE_GB=1500
38 | #MASTER_ATTACHED_PD_SIZE_GB=1500
39 |
40 | ## Amount of storage to give the boot disk.
41 | ## A full HDP stack starts to fill up 10 GB.
42 | #MASTER_BOOT_DISK_SIZE_GB=50
43 | #WORKER_BOOT_DISK_SIZE_GB=50
44 |
45 | ## Storage types (pd-standard or pd-ssd)
46 | #WORKER_ATTACHED_PDS_TYPE='pd-standard'
47 | #MASTER_ATTACHED_PD_TYPE='pd-standard'
48 |
49 |
50 | ## HDP settings
51 | ## ============
52 |
53 | ## If 'true', URLs for web interfaces, such as the jobtracker will be
54 | ## linked from Ambari with the public IP.
55 | ## Default is false. You will need to SSH to reach the host in this case.
56 | #AMBARI_PUBLIC=false
57 |
58 | #AMBARI_VERSION='2.2.1.0'
59 | #AMBARI_REPO=http://public-repo-1.hortonworks.com/ambari/centos6/${AMBARI_VERSION:0:1}.x/updates/${AMBARI_VERSION}/ambari.repo
60 |
61 | ## The distribution to install on your cluster.
62 | #AMBARI_STACK='HDP'
63 | #AMBARI_STACK_VERSION='2.4'
64 |
65 | ## The components of that distribution to install on the cluster.
66 | ## Default is all but Kerberos, Apache Knox, Apache Ranger, and Hortonworks
67 | # SmartSense.
68 | #AMBARI_SERVICES="ACCUMULO AMBARI_METRICS ATLAS FALCON FLUME GANGLIA HBASE HDFS
69 | # HIVE KAFKA MAHOUT MAPREDUCE2 OOZIE PIG SLIDER SPARK SQOOP STORM TEZ YARN
70 | # ZOOKEEPER"
71 |
72 | ## You can run with as little as:
73 | #AMBARI_SERVICES='HDFS MAPREDUCE2 YARN'
74 |
75 | ## If using HDP 2.2, these are the supported services:
76 | #AMBARI_SERVICES="AMBARI_METRICS FALCON FLUME GANGLIA HBASE HDFS HIVE KAFKA
77 | # MAPREDUCE2 OOZIE PIG SLIDER SPARK SQOOP STORM TEZ YARN ZOOKEEPER"
78 |
79 | ## If you want to use a different JAVA
80 | ## Default is set by alternatives to 'openjdk-7-devel'
81 | #JAVA_HOME="/etc/alternatives/java_sdk"
82 |
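As a sketch, a minimal ambari.conf for a small test cluster might uncomment only a handful of these values (bucket and project names are placeholders; everything else keeps the defaults above):

```bash
CONFIGBUCKET='my-config-bucket'
PROJECT='my-project'
NUM_WORKERS=2
AMBARI_SERVICES='HDFS MAPREDUCE2 YARN'
```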
--------------------------------------------------------------------------------
/platforms/hdp/ambari_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | # ambari_env.sh
17 | #
18 | # Extension that provides a cluster with Apache Ambari installed and that
19 | # automatically provisions and configures the cluster's software. This also
20 | # installs and configures the GCS connector.
21 |
22 | ########################################################################
23 | ## There should be nothing to edit here, use ambari.conf ##
24 | ########################################################################
25 |
26 | # Import the base Ambari installation
27 | import_env platforms/hdp/ambari_manual_env.sh
28 |
29 | # The distribution to install on your cluster.
30 | AMBARI_STACK="${AMBARI_STACK:-HDP}"
31 | AMBARI_STACK_VERSION="${AMBARI_STACK_VERSION:-2.4}"
32 |
33 | ## The components of that distribution to install on the cluster.
34 | # Default is all but Kerberos, Apache Knox, Apache Ranger, and Hortonworks
35 | # SmartSense.
36 | AMBARI_SERVICES="${AMBARI_SERVICES:-ACCUMULO AMBARI_METRICS ATLAS FALCON FLUME
37 | GANGLIA HBASE HDFS HIVE KAFKA MAHOUT MAPREDUCE2 OOZIE PIG SLIDER SPARK SQOOP
38 | STORM TEZ YARN ZOOKEEPER}"
39 |
40 |
41 | if [[ -n "${BDUTIL_DIR}" ]]; then
42 | UPLOAD_FILES+=(
43 | "${BDUTIL_DIR}/platforms/hdp/create_blueprint.py"
44 | )
45 | fi
46 |
47 | COMMAND_GROUPS+=(
48 | "install-ambari-components:
49 | platforms/hdp/install_ambari_components.sh
50 | "
51 | )
52 |
53 | COMMAND_STEPS+=(
54 | 'install-ambari-components,*'
55 | 'install-gcs-connector-on-ambari,install-gcs-connector-on-ambari'
56 | 'update-ambari-config,*'
57 | )
58 |
--------------------------------------------------------------------------------
/platforms/hdp/ambari_functions.sh:
--------------------------------------------------------------------------------
1 | ## Tools for interacting with Ambari SERVER
2 |
3 | AMBARI_TIMEOUT=${AMBARI_TIMEOUT:-3600}
4 | POLLING_INTERVAL=${POLLING_INTERVAL:-10}
5 |
6 |
7 | function ambari_wait() {
8 | local condition="$1"
9 | local goal="$2"
10 | local failed="FAILED"
11 | local limit=$(( ${AMBARI_TIMEOUT} / ${POLLING_INTERVAL} + 1 ))
12 |
13 | for (( i=0; i<${limit}; i++ )); do
14 | local status=$(bash -c "${condition}")
15 | echo "ambari_wait status: ${status}" >&2
16 | if [[ "${status}" == "${goal}" ]]; then
17 | break
18 | elif [[ "${status}" =~ "${failed}" ]]; then
19 | echo "Ambari operiation failed with status: ${status}" >&2
20 | return 1
21 | fi
22 | sleep ${POLLING_INTERVAL}
23 | done
24 |
25 | if [[ ${i} == ${limit} ]]; then
26 | echo "ambari_wait did not finish within" \
27 | "'${AMBARI_TIMEOUT}' seconds. Exiting." >&2
28 | return 1
29 | fi
30 | }
31 |
32 | # Only useful during a fresh install, where we expect no failures.
33 | # Will not work if any requests TIMEDOUT/ABORTED.
34 | function ambari_wait_requests_completed() {
35 | # Avoid race conditions with requests.
36 | sleep 10
37 | AMBARI_CLUSTER=$(get_ambari_cluster_name)
38 | # Poll for completion
39 | ambari_wait "${AMBARI_CURL} ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/requests \
40 | | grep -Eo 'http://.*/requests/[0-9]+' \
41 | | xargs ${AMBARI_CURL} \
42 | | grep request_status \
43 | | grep -Eo '\"[A-Z_]+\"' \
44 | | sort | uniq | paste -sd'+'" \
45 | '"COMPLETED"'
46 | }
47 |
48 | function ambari_service_stop() {
49 | AMBARI_CLUSTER=$(get_ambari_cluster_name)
50 | if [[ -z "${SERVICE}" ]]; then
51 | echo "Taking no action as no SERVICE was defined. You may specific ALL to stop all Services."
52 | else
53 | AMBARI_REQUEST='{"RequestInfo": {"context" :"Stop '${SERVICE}' via REST"}, "Body": {"ServiceInfo": {"state": "INSTALLED"}}}'
54 | if [[ "${SERVICE}" == "ALL" ]]; then
55 | ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/
56 | else
57 | ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/${SERVICE}
58 | fi
59 | fi
60 | }
61 |
62 | function ambari_service_start() {
63 | AMBARI_CLUSTER=$(get_ambari_cluster_name)
64 | if [[ -z "${SERVICE}" ]]; then
65 | echo "Taking no action as no SERVICE was defined"
66 | else
67 | AMBARI_REQUEST='{"RequestInfo": {"context" :"Start '${SERVICE}' via REST"}, "Body": {"ServiceInfo": {"state": "STARTED"}}}'
68 | if [[ "${SERVICE}" == 'ALL' ]]; then
69 | ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/
70 | else
71 | ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/${SERVICE}
72 | fi
73 | fi
74 | }
75 |
76 | # set SERVICE=ALL to restart all services
77 | function ambari_service_restart() {
78 | ambari_service_stop
79 | ambari_wait_requests_completed
80 | ambari_service_start
81 | ambari_wait_requests_completed
82 | }
83 |
84 | function ambari_restart_all_services() {
85 | AMBARI_CLUSTER=$(get_ambari_cluster_name)
86 | SERVICES=($(${AMBARI_CURL} ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services \
87 | | grep -Eo 'http://.*/services/[^\"]+'))
88 |
89 | for STATE in 'INSTALLED' 'STARTED'; do
90 | ${AMBARI_CURL} -X PUT -d "{\"ServiceInfo\":{\"state\":\"${STATE}\"}}" "${SERVICES[@]}"
91 | ambari_wait_requests_completed
92 | done
93 | }
94 |
95 | # Make variable substitutions in a json file.
96 | function subsitute_bash_in_json() {
97 | local custom_configuration_file="$1"
98 | loginfo "Replacing variables in ${custom_configuration_file}."
99 | perl -pi -e 's/\$\{([^\}]*)\}/$ENV{$1}/e' ${custom_configuration_file}
100 | }
101 |
102 | # Print out name of first (and presumably only) cluster in Ambari.
103 | function get_ambari_cluster_name() {
104 | ${AMBARI_CURL} ${AMBARI_API}/clusters \
105 | | sed -n 's/.*cluster_name" : "\(\S*\)".*/\1/p' \
106 | | head -1
107 | }
108 |
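Usage sketch: these helpers are driven by the SERVICE environment variable, for example:

```bash
# Restart a single service:
SERVICE='HDFS'
ambari_service_restart

# Or restart everything, as update_ambari_config.sh does:
SERVICE='ALL'
ambari_service_restart
```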
--------------------------------------------------------------------------------
/platforms/hdp/ambari_manual_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | # ambari_manual_env.sh
17 | #
18 | # Extension that installs Apache Ambari on the cluster, allowing the user to
19 | # manually log in and provision and configure the cluster's software.
20 | # This installs but does not configure the GCS connector.
21 |
22 | ########################################################################
23 | ## There should be nothing to edit here, use ambari.conf ##
24 | ########################################################################
25 |
26 | # Remove core bdutil upload files.
27 | UPLOAD_FILES=()
28 |
29 | # Import hadoop2_env.sh just for the GCS_CONNECTOR_JAR.
30 | import_env hadoop2_env.sh
31 |
32 | # Default to 4 workers plus master for good spreading of master daemons.
33 | NUM_WORKERS=4
34 | # Use CentOS instead of Debian.
35 | GCE_IMAGE=''
36 | GCE_IMAGE_FAMILY='centos-6'
37 | GCE_IMAGE_PROJECT='centos-cloud'
38 |
39 | # Create attached storage
40 | USE_ATTACHED_PDS=true
41 | # Since we'll be using HDFS as the default file system, size disks to grant
42 | # maximum I/O per VM.
43 | WORKER_ATTACHED_PDS_SIZE_GB=1500
44 | MASTER_ATTACHED_PD_SIZE_GB=1500
45 |
46 | ## Amount of storage to give the boot disk.
47 | ## A full HDP stack starts to fill up 10 GB.
48 | MASTER_BOOT_DISK_SIZE_GB=${MASTER_BOOT_DISK_SIZE_GB:-50}
49 | WORKER_BOOT_DISK_SIZE_GB=${WORKER_BOOT_DISK_SIZE_GB:-50}
50 |
51 | # Install the full Java JDK. Most services need it
52 | INSTALL_JDK_DEVEL=true
53 | JAVA_HOME=/etc/alternatives/java_sdk
54 |
55 | ## import configuration overrides
56 | import_env platforms/hdp/ambari.conf
57 |
58 | ## Version of Ambari and location of YUM package repository
59 | AMBARI_VERSION="${AMBARI_VERSION:-2.2.1.0}"
60 | AMBARI_REPO=${AMBARI_REPO:-http://public-repo-1.hortonworks.com/ambari/centos6/${AMBARI_VERSION:0:1}.x/updates/${AMBARI_VERSION}/ambari.repo}
61 |
62 | ## If 'true', URLs for web interfaces, such as the jobtracker, will be
63 | ## linked from Ambari with the public IP.
64 | ## Default is false. You will need to SSH to reach the host in this case.
65 | AMBARI_PUBLIC=${AMBARI_PUBLIC:-false}
66 | normalize_boolean 'AMBARI_PUBLIC'
67 |
68 | # HDFS will always be the default file system (even if changed here), because
69 | # many services require it to be. This is purely advisory.
70 | DEFAULT_FS='hdfs'
71 |
72 | GCS_CACHE_CLEANER_LOG_DIRECTORY="/var/log/hadoop/${GCS_CACHE_CLEANER_USER}"
73 | GCS_CACHE_CLEANER_LOGGER='INFO,RFA'
74 | HADOOP_CONF_DIR="/etc/hadoop/conf"
75 | HADOOP_INSTALL_DIR="/usr/local/lib/hadoop"
76 |
77 | # For interacting with Ambari Server API
78 | AMBARI_API="http://localhost:8080/api/v1"
79 | AMBARI_CURL='curl -fsSu admin:admin -H X-Requested-By:ambari'
80 | MASTER_UI_PORTS=('8080')
81 |
82 | import_env platforms/hdp/ambari_functions.sh
83 |
84 | if [[ -n "${BDUTIL_DIR}" ]]; then
85 | UPLOAD_FILES+=(
86 | "${BDUTIL_DIR}/libexec/hadoop_helpers.sh"
87 | "${BDUTIL_DIR}/platforms/hdp/configuration.json"
88 | "${BDUTIL_DIR}/platforms/hdp/resources/public-hostname-gcloud.sh"
89 | "${BDUTIL_DIR}/platforms/hdp/resources/thp-disable.sh"
90 | )
91 | fi
92 |
93 | COMMAND_GROUPS+=(
94 | "ambari-setup:
95 | libexec/mount_disks.sh
96 | libexec/install_java.sh
97 | libexec/setup_hadoop_user.sh
98 | platforms/hdp/install_ambari.sh
99 | "
100 |
101 | "install-gcs-connector-on-ambari:
102 | platforms/hdp/install_gcs_connector_on_ambari.sh
103 | "
104 |
105 | "update-ambari-config:
106 | platforms/hdp/update_ambari_config.sh
107 | "
108 | )
109 |
110 | COMMAND_STEPS=(
111 | 'ambari-setup,ambari-setup'
112 | 'deploy-master-nfs-setup,*'
113 | 'deploy-client-nfs-setup,deploy-client-nfs-setup'
114 | )
115 |
--------------------------------------------------------------------------------
/platforms/hdp/ambari_manual_post_deploy_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 |
16 | # ambari_manual_post_deploy_env.sh
17 | #
18 | # Extension meant to be run against a cluster deployed with
19 | # ambari_manual_env.sh after its software has been provisioned through Ambari.
20 | # This installs and configures the GCS connector.
21 |
22 | ########################################################################
23 | ## There should be nothing to edit here, use ambari.conf ##
24 | ########################################################################
25 |
26 | # Import the base Ambari installation
27 | import_env platforms/hdp/ambari_manual_env.sh
28 |
29 | COMMAND_STEPS=(
30 | 'install-gcs-connector-on-ambari,install-gcs-connector-on-ambari'
31 | 'update-ambari-config,*'
32 | )
33 |
--------------------------------------------------------------------------------
/platforms/hdp/configuration.json:
--------------------------------------------------------------------------------
1 | {
2 | "configurations" : {
3 | "core-site" : {
4 | "fs.gs.project.id": "${PROJECT}",
5 | "fs.gs.system.bucket": "${CONFIGBUCKET}",
6 | "fs.gs.working.dir": "/",
7 | "fs.gs.impl" : "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
8 | "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
9 | "fs.gs.metadata.cache.enable": "true",
10 | "fs.gs.metadata.cache.type": "${GCS_METADATA_CACHE_TYPE}",
11 | "fs.gs.metadata.cache.directory": "${GCS_FILE_CACHE_DIRECTORY}",
12 |
13 | "hadoop.proxyuser.root.hosts": "*",
14 | "hadoop.proxyuser.root.groups": "*",
15 | "hadoop.proxyuser.root.users": "*"
16 | },
17 | "hdfs-site" : {
18 | "dfs.replication" : "2"
19 | },
20 | "mapred-site" : {
21 | "mapreduce.job.working.dir" : "/user/${user.name}"
22 | }
23 | }
24 | }
25 |
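The ${...} placeholders are expanded from the environment before the fs.gs.* properties are pushed into Ambari; a sketch of what update_ambari_config.sh effectively does (values illustrative):

```bash
export PROJECT='my-project' CONFIGBUCKET='my-config-bucket'
export GCS_METADATA_CACHE_TYPE='IN_MEMORY'
export GCS_FILE_CACHE_DIRECTORY='/tmp/gcs_connector_metadata_cache'
# subsitute_bash_in_json (ambari_functions.sh) rewrites the file in place:
perl -pi -e 's/\$\{([^\}]*)\}/$ENV{$1}/e' configuration.json
```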
--------------------------------------------------------------------------------
/platforms/hdp/install_gcs_connector_on_ambari.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Copyright 2014 Google Inc. All Rights Reserved.
4 |
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS-IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 |
18 | ## install_gcs_connector_on_ambari.sh
19 | ## This file:
20 | ## * downloads the relevant gcs-connector jar
21 | ## * installs it into a local lib dir
22 | ## * adds that lib dir to relevant classpaths
23 |
24 | if (( ${INSTALL_GCS_CONNECTOR} )) ; then
25 | loginfo "installing GCS_CONNECTOR_JAR on each node"
26 | LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib"
27 | mkdir -p ${LIB_JARS_DIR}
28 |
29 | # Grab the connector jarfile, add it to installation /lib directory.
30 | JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR})
31 | LOCAL_JAR="${LIB_JARS_DIR}/${JARNAME}"
32 |
33 | download_bd_resource "${GCS_CONNECTOR_JAR}" "${LOCAL_JAR}"
34 |
35 | # link gcs connector into main hadoop lib dir
36 | source <(grep "^export HADOOP_HOME=" /etc/hadoop/conf/hadoop-env.sh) || true
37 | if [[ -d "${HADOOP_HOME}/lib/" ]]; then
38 | ln -sv "${LOCAL_JAR}" "${HADOOP_HOME}/lib/"
39 | fi
40 | fi
41 |
--------------------------------------------------------------------------------
/platforms/hdp/resources/public-hostname-gcloud.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | curl -Ls -m 5 http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/access-configs/0/external-ip -H "Metadata-Flavor: Google"
3 |
4 |
--------------------------------------------------------------------------------
/platforms/hdp/resources/thp-disable.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # disable transparent huge pages: for Hadoop
3 | thp_disable=true
4 | if [ "${thp_disable}" = true ]; then
5 | for path in redhat_transparent_hugepage transparent_hugepage; do
6 | if test -f /sys/kernel/mm/${path}/enabled; then
7 | echo never > /sys/kernel/mm/${path}/enabled
8 | fi
9 | if test -f /sys/kernel/mm/${path}/defrag; then
10 | echo never > /sys/kernel/mm/${path}/defrag
11 | fi
12 | done
13 | fi
14 | exit 0
15 |
--------------------------------------------------------------------------------
/platforms/hdp/update_ambari_config.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # finalize the cluster configuration
16 |
17 | source hadoop_helpers.sh
18 |
19 | # initialize hdfs dirs
20 | loginfo "Set up HDFS /tmp and /user dirs"
21 | initialize_hdfs_dirs admin
22 |
23 |
24 | AMBARI_CLUSTER=$(get_ambari_cluster_name)
25 |
26 | # update hadoop configuration to include the gcs connector
27 | if (( ${INSTALL_GCS_CONNECTOR} )) ; then
28 | loginfo "Setting up GCS connector cache cleaner and configuration."
29 | if (( ${ENABLE_NFS_GCS_FILE_CACHE} )); then
30 | export GCS_METADATA_CACHE_TYPE='FILESYSTEM_BACKED'
31 | export GCS_FILE_CACHE_DIRECTORY="$(get_nfs_mount_point)"
32 |
33 | setup_cache_cleaner
34 | else
35 | export GCS_METADATA_CACHE_TYPE='IN_MEMORY'
36 | # For IN_MEMORY cache, this directory won't actually be used, but we set
37 | # it to a sane default for easy manual experimentation of file caching.
38 | export GCS_FILE_CACHE_DIRECTORY='/tmp/gcs_connector_metadata_cache'
39 | fi
40 |
41 | # If it wasn't set at cluster creation, configure the GCS connector.
42 | if ! /var/lib/ambari-server/resources/scripts/configs.sh \
43 | get localhost ${AMBARI_CLUSTER} core-site \
44 | | grep -q '^"fs.gs'; then
45 | subsitute_bash_in_json configuration.json
46 | sed -n < configuration.json \
47 | 's/.*"\(fs\.\S*gs\.\S*\)"\s*:\s*"\([^"]*\)".*/\1 \2/p' \
48 | | xargs -n 2 /var/lib/ambari-server/resources/scripts/configs.sh \
49 | set localhost ${AMBARI_CLUSTER} core-site
50 | # Will reload core-site.xml
51 | SERVICES_TO_UPDATE+=" HDFS"
52 | fi
53 |
54 | loginfo "Adding /usr/local/lib/hadoop/lib to " \
55 | "mapreduce.application.classpath."
56 | NEW_CLASSPATH=$(/var/lib/ambari-server/resources/scripts/configs.sh \
57 | get localhost ${AMBARI_CLUSTER} mapred-site \
58 | | grep -E '^"mapreduce.application.classpath"' \
59 | | tr -d \" \
60 | | awk '{print "/usr/local/lib/hadoop/lib/*,"$3}' | sed 's/,$//')
61 | /var/lib/ambari-server/resources/scripts/configs.sh \
62 | set localhost ${AMBARI_CLUSTER} \
63 | mapred-site mapreduce.application.classpath ${NEW_CLASSPATH}
64 | sleep 10
65 | fi
66 |
67 | loginfo "Restarting services, because Ambari usually requires it."
68 | SERVICE='ALL'
69 | ambari_service_stop
70 | ambari_wait_requests_completed
71 | ambari_service_start
72 | ambari_wait_requests_completed
73 |
74 | # Check GCS connectivity
75 | check_filesystem_accessibility
76 |
77 | # Set up the Files and Pig views, which were added in Ambari 2.1.
78 | #
79 | if version_at_least "${AMBARI_VERSION}" '2.1'; then
80 | # This should be done automatically but it wasn't as of 2016-03-16.
81 | for view in FILES PIG; do
82 | # Both of these views are currently 1.0.0
83 | VIEW="${AMBARI_API}/views/${view}/versions/1.0.0/instances/AUTO_${view}_INSTANCE"
84 | if ${AMBARI_CURL} ${VIEW} |& grep -q '404 Not Found'; then
85 | ${AMBARI_CURL} -X POST ${VIEW} \
86 | -d "{\"ViewInstanceInfo\": {\"cluster_handle\": \"${AMBARI_CLUSTER}\"}}"
87 | fi
88 | done
89 | fi
90 |
--------------------------------------------------------------------------------
/platforms/mapr/README.md:
--------------------------------------------------------------------------------
1 | MapR Cluster on Google Compute Engine
2 | -------------------------------------
3 |
4 | The [MapR distribution](https://www.mapr.com/products/mapr-distribution-including-apache-hadoop) for Hadoop adds enterprise-grade features to the Hadoop platform that make Hadoop easier to use and more dependable. The MapR distribution for Hadoop is fully integrated with the [Google Compute Engine (GCE)](https://cloud.google.com/compute/) framework, allowing customers to deploy a MapR cluster with ready access to Google's cloud infrastructure. MapR provides network file system (NFS) and open database connectivity (ODBC) interfaces, a comprehensive management suite, and automatic compression. MapR provides high availability with a no-NameNode architecture and data protection with snapshots, disaster recovery, and cross-cluster mirroring.
5 |
6 | ### Make sure you have...
7 | * an active [Google Cloud Platform](https://console.developers.google.com/) account.
8 | * a client machine with [Google Cloud SDK](https://cloud.google.com/sdk/) and [bdutil](https://cloud.google.com/hadoop/downloads) installed.
9 | * access to a GCE project where you can add instances, buckets and disks.
10 | * a valid MapR license (optional).
11 |
12 | ### Now, to launch a MapR Cluster on GCE using `bdutil`...
13 |
14 | 1. Set the project and bucket in `mapr_env.sh` (located under `bdutil/platforms/mapr/`).
15 | 2. Update `node.lst` to determine the [allocation of cluster roles](http://doc.mapr.com/display/MapR/MapR+Cluster+on+the+Google+Compute+Engine#MapRClusterontheGoogleComputeEngine-gce-config) for the nodes in the cluster. For reference, the config file contains a simple 4-node [M7](https://www.mapr.com/products/hadoop-download) cluster allocation.
16 | * Node names must have the PREFIX mentioned in `mapr_env.sh`
17 | * Node names must have suffixes: -m, -w-0, -w-1, -w-2 ...
18 | For example, if the PREFIX is 'mapr', node names must be 'mapr-m', 'mapr-w-0', 'mapr-w-1', ...
19 | * NUM_WORKERS in `mapr_env.sh` must equal one less than the number of nodes in `node.lst`
20 | 3. (Optional) Copy a valid license into `mapr_license.txt`
21 | 4. Deploy the cluster by invoking in the bdutil root directory:
22 | ```
23 | ./bdutil -e mapr deploy
24 | ```
25 |
26 | 5. Access the cluster by invoking:
27 | ```
28 | gcloud compute config-ssh
29 | ```
30 |
31 | The output shows how to ssh into a node. Login as the `MAPR_USER` mentioned in `mapr_env.sh` (for example, `ssh mapr@node1.us-central1-f.t-diplomatic-962`).
32 | 6. Test an example application by running:
33 | ```
34 | yarn jar $MAPR_HOME/hadoop/hadoop-2.5.1/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.1-mapr-1501.jar pi 16 100
35 | ```
36 |
37 |
38 | ### At the end...
39 | To delete the cluster, ensure `mapr_env.sh` is the same as when the cluster was deployed. Then, in the bdutil root directory, invoke:
40 | ```
41 | ./bdutil -e mapr delete
42 | ```
43 |
44 | ### Additional Resources
45 | * [Free Hadoop On-Demand Training](https://www.mapr.com/services/mapr-academy/big-data-hadoop-online-training)
46 | * [Why MapR](https://www.mapr.com/why-hadoop/why-mapr)
47 | * [MapR Development Guide](http://doc.mapr.com/display/MapR/Development+Guide)
48 | * [MapR Documentation](http://doc.mapr.com/)
49 | * [MapR Support](https://www.mapr.com/support/overview)
50 | * [Another way](http://doc.mapr.com/display/MapR/MapR+Cluster+on+the+Google+Compute+Engine) to deploy
51 | * [MapR-on-GCE](https://github.com/mapr/gce)
52 |
53 | **LICENSE:** [Apache License, Version 2.0](https://github.com/GoogleCloudPlatform/bdutil/blob/master/LICENSE)
--------------------------------------------------------------------------------
/platforms/mapr/mapr_license.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudDataproc/bdutil/967fd15b1f690e961f7d61809e4976aaa4ade90f/platforms/mapr/mapr_license.txt
--------------------------------------------------------------------------------
/platforms/mapr/node.lst:
--------------------------------------------------------------------------------
1 | # Simple 4-node M7 cluster
2 | # NOTE:
3 | # (1) Node names MUST have the PREFIX mentioned in 'mapr_env.sh'
4 | # (2) Node names MUST have suffixes: -m, -w-0, -w-1, -w-2 ...
5 | # For example, if the PREFIX is 'mapr',
6 | # node names MUST be 'mapr-m', 'mapr-w-0', 'mapr-w-1', ...
7 | # (3) Do not forget to update NUM_WORKERS variable
8 | # Refer to MapR documentation for other values
9 | mapr-m:zookeeper,cldb,fileserver,nodemanager,nfs,webserver,hbase
10 | mapr-w-0:zookeeper,cldb,fileserver,nodemanager,nfs,hbase
11 | mapr-w-1:zookeeper,resourcemanager,historyserver,fileserver,nodemanager,nfs,hbase
12 | mapr-w-2:resourcemanager,fileserver,nodemanager,nfs,hbase
13 |
--------------------------------------------------------------------------------
/platforms/restart_services.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Copyright 2014 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS-IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Restarts services corresponding to installed packages.
17 | # Performs last minute initialization as needed.
18 |
19 | set -e
20 |
21 | source hadoop_helpers.sh
22 |
23 | if [[ $(hostname -s) == ${MASTER_HOSTNAME} ]]; then
24 | COMPONENTS=${MASTER_COMPONENTS}
25 | else
26 | COMPONENTS=${DATANODE_COMPONENTS}
27 | fi
28 |
29 | # Component ordering is sensitive. hive-metastore must come before hive-server2
30 | # and hdfs must be up before oozie.
31 | for COMPONENT in ${COMPONENTS}; do
32 | if [[ -x /etc/init.d/${COMPONENT} ]]; then
33 | # Initialize HDFS
34 | if [[ ${COMPONENT} == 'hadoop-hdfs-namenode' ]]; then
35 | service hadoop-hdfs-namenode stop
36 | # Do not reformat if already formatted.
37 | yes n | service hadoop-hdfs-namenode init
38 | service hadoop-hdfs-namenode start
39 |
40 | # Setup /tmp and /user directories.
41 | if [[ "${DEFAULT_FS}" == 'hdfs' ]]; then
42 | initialize_hdfs_dirs
43 | fi
44 | # Initialize Oozie. Requires Namenode to be up.
45 | elif [[ ${COMPONENT} == 'oozie' ]]; then
46 | # Requires HDFS to be up and running.
47 | # Might be CDH specific.
48 | oozie-setup sharelib create -fs ${NAMENODE_URI} \
49 | -locallib /usr/lib/oozie/oozie-sharelib-yarn*
50 | service oozie restart
51 | else
52 | service ${COMPONENT} restart
53 | fi
54 | fi
55 | done
56 |
--------------------------------------------------------------------------------
/sampleapps/querytools/conf/hive/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
3 | <!--
19 | -->
20 |
21 | <configuration>
22 | <property>
23 | <name>hive.metastore.warehouse.dir</name>
24 | <value>/user/${user.name}/warehouse</value>
25 | <description>location of default database for the warehouse</description>
26 | </property>
27 |
28 | </configuration>
29 |
30 |
--------------------------------------------------------------------------------
/sampleapps/querytools/examples/ngrams/hive_query_ngrams.q:
--------------------------------------------------------------------------------
1 | --
2 | -- Copyright 2013 Google Inc. All Rights Reserved.
3 | --
4 | -- Licensed under the Apache License, Version 2.0 (the "License");
5 | -- you may not use this file except in compliance with the License.
6 | -- You may obtain a copy of the License at
7 | --
8 | -- http://www.apache.org/licenses/LICENSE-2.0
9 | --
10 | -- Unless required by applicable law or agreed to in writing, software
11 | -- distributed under the License is distributed on an "AS IS" BASIS,
12 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | -- See the License for the specific language governing permissions and
14 | -- limitations under the License.
15 |
16 | --
17 | -- This script is intended to be run from the Hive shell:
18 | --
19 | -- hive> source hive_query_ngrams.q;
20 | --
21 | -- or from the operating system shell:
22 | --
23 | -- $ hive -f hive_query_ngrams.q
24 | --
25 | -- The result of this query is a table of records indicating the count
26 | -- of occurrences of the words "radio" and "television" in the Google
27 | -- ngrams corpora for each year since 1920.
28 | --
29 | -- This query ensures that a record exists in the result for every year
30 | -- since 1920, even if there were no instances of a given word.
31 | -- In practice this is unnecessary as radio and television both occur
32 | -- more than once in the data set for every year since 1920.
33 | --
34 | -- The structure of this query is to join three distinct subqueries (on year):
35 | -- y: list of years since 1920 (implicitly ordered by the DISTINCT operation)
36 | -- r: sum of instances of the word "radio" for each year since 1920
37 | -- t: sum of instances of the word "television" for each year since 1920
38 | --
39 |
40 | SELECT y.year AS year,
41 | r.instance_count AS radio, t.instance_count AS television,
42 | CAST(r.instance_count AS DOUBLE)/(r.instance_count + t.instance_count)
43 | AS pct
44 | FROM
45 | (SELECT DISTINCT year AS year FROM
46 | (SELECT distinct year from 1gram where prefix = 'r' and year >= 1920
47 | UNION ALL
48 | SELECT distinct year from 1gram where prefix = 't' and year >= 1920) y_all)
49 | y
50 | JOIN
51 | (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count
52 | FROM 1gram
53 | WHERE LOWER(word) = 'radio' AND prefix='r' AND (year >= 1920)
54 | GROUP BY LOWER(word), year) r
55 | ON y.year = r.year
56 | JOIN
57 | (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count
58 | FROM 1gram
59 | WHERE LOWER(word) = 'television' AND prefix='t' AND (year >= 1920)
60 | GROUP BY LOWER(word), year) t
61 | ON y.year = t.year
62 | ORDER BY year;
63 |
64 | EXIT;
65 |
66 | --
67 | -- This is a simplified version of the above which eliminates the explicit
68 | -- generation of the "year" list. It assumes (correctly) that the word
69 | -- "television" appears every year that "radio" does.
70 | -- This query is listed here for reference and educational purposes only.
71 | --
72 | -- SELECT a.year, a.instance_count, b.instance_count,
73 | -- CAST(a.instance_count AS DOUBLE)/(a.instance_count + b.instance_count)
74 | -- FROM
75 | -- (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count
76 | -- FROM 1gram
77 | -- WHERE LOWER(word) = 'radio' AND prefix='r' AND (year >= 1920)
78 | -- GROUP BY LOWER(word), year) a
79 | -- JOIN
80 | -- (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count
81 | -- FROM 1gram
82 | -- WHERE LOWER(word) = 'television' AND prefix='t' AND (year >= 1920)
83 | -- GROUP BY LOWER(word), year) b
84 | -- ON a.year = b.year
85 | -- ORDER BY year;
86 | --
87 |
--------------------------------------------------------------------------------
/sampleapps/querytools/examples/ngrams/hive_table_create.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2013 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | #
17 | # This script is intended to be run from the unix command line
18 | # on an instance with hive installed (and the hive executable
19 | # available in the user PATH).
20 | #
21 | # It is assumed that one has already run the shell script
22 | # ngram_hdfs_load.sh, which downloads the associated ngram data and
23 | # deposits it into HDFS under ngrams/ in the running user's home directory.
24 | #
25 | # This script will create a table ("1gram") and then load each
26 | # file into a separate partition within the table.
27 | #
28 |
29 | set -o errexit
30 | set -o nounset
31 |
32 | # Select what to install
33 | readonly SCRIPT_DIR=$(dirname $0)
34 | source $SCRIPT_DIR/ngram_setup.sh
35 |
36 | # Create the table if it does not already exist
37 | hive << END_CREATE
38 | CREATE TABLE IF NOT EXISTS $NGRAMS (
39 | word STRING,
40 | year INT,
41 | instance_count INT,
42 | book_count INT
43 | )
44 | PARTITIONED BY (prefix STRING)
45 | ROW FORMAT DELIMITED
46 | FIELDS TERMINATED BY '\t'
47 | STORED AS TEXTFILE
48 | ;
49 | EXIT
50 | ;
51 | END_CREATE
52 |
53 | # Get the list of files to put into the table
54 | FILE_PATTERN=$(printf $SOURCE_FORMAT $NGRAMS "" "")
55 | FILE_LIST=$($HDFS_CMD -ls $HDFS_DIR | grep $FILE_PATTERN | awk '{ print $8 }')
56 | for filepath in $FILE_LIST; do
57 | filename=$(basename $filepath)
58 | prefix=${filename##$FILE_PATTERN}
59 |
60 | hive --silent << END_LOAD
61 | LOAD DATA INPATH '$HDFS_DIR/$filename'
62 | OVERWRITE INTO TABLE $NGRAMS
63 | PARTITION (prefix='$prefix')
64 | ;
65 | EXIT
66 | ;
67 | END_LOAD
68 | done
69 |
70 | echo "Data loaded into hive table $NGRAMS"
71 |
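A minimal way to spot-check the load afterwards, assuming the default --N=1 so the table is named 1gram:

    # List the partitions created by the load loop (one per file prefix).
    hive -e "SHOW PARTITIONS 1gram;"

    # Count rows in a single partition as a sanity check.
    hive -e "SELECT COUNT(*) FROM 1gram WHERE prefix = 'r';"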
--------------------------------------------------------------------------------
/sampleapps/querytools/examples/ngrams/ngram_setup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2013 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Utility script, sourced by both ngram_hdfs_load.sh and hive_table_create.sh
17 | # This script will set a series of constants, some based on the choice
18 | # of the command line "N" value (defaults to 1). N indicates the ngram
19 | # dataset to download and copy into HDFS.
20 |
21 | readonly SOURCE_FORMAT="googlebooks-eng-all-%s-20120701-%s%s"
22 | readonly SOURCE_LOCATION="gs://books/ngrams/books"
23 |
24 | # The "hadoop" executable should be in the user path
25 | readonly HDFS_CMD="hadoop fs"
26 |
27 | # What to install: 1gram by default
28 | N=1
29 |
30 | # Now parse command line arguments
31 | while [[ $# -ne 0 ]]; do
32 | case "$1" in
33 | --N=*)
34 | N=${1#--N=}
35 | shift
36 | ;;
37 | --help)
38 | N=
39 | shift
40 | ;;
41 |     *) shift ;;  # skip unrecognized arguments (avoids an infinite loop)
42 | esac
43 | done
44 |
45 | if [[ ! $N -ge 1 ]]; then
46 |   echo "usage: $(basename $0) --N=<n>"
47 | exit 1
48 | fi
49 |
50 | # Now set constants based on the selection of N
51 | readonly NGRAMS="${N}gram"
52 | readonly HDFS_DIR="ngrams/$NGRAMS"
53 | readonly STAGE_DIR="/hadoop/tmp/$USER/ngrams/$NGRAMS"
54 |
55 |
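For reference, this is how the SOURCE_FORMAT template expands. hive_table_create.sh fills only the first slot to build a file-name prefix; the loader presumably supplies a letter shard and a suffix for the remaining two slots.

    source ngram_setup.sh --N=1

    # Prefix used to match files already in HDFS (last two slots left empty):
    printf "$SOURCE_FORMAT\n" "$NGRAMS" "" ""
    # -> googlebooks-eng-all-1gram-20120701-

    # A full object name, assuming a letter shard and a .gz suffix:
    printf "$SOURCE_FORMAT\n" "$NGRAMS" "a" ".gz"
    # -> googlebooks-eng-all-1gram-20120701-a.gz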
--------------------------------------------------------------------------------
/sampleapps/querytools/project_properties.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2013 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Begin: edit these values to set up your cluster
17 | # GCS bucket for packages
18 | readonly GCS_PACKAGE_BUCKET={{{{ bucket_name }}}}
19 | # Zone of the Hadoop master instance
20 | readonly ZONE={{{{ zone_id }}}}
21 | # Hadoop master instance name
22 | readonly MASTER={{{{ master_hostname }}}}
23 |
24 | # Subdirectory in cloud storage where packages are pushed at initial setup
25 | readonly GCS_PACKAGE_DIR=hdp_tools
26 |
27 | # Full GCS URIs of the Pig and Hive tarballs, if packages-to-gcs__at__host.sh
28 | # is used; alternatively, these can be set to other pre-existing GCS paths
29 | readonly SUPPORTED_HDPTOOLS="hive pig"
30 | readonly TARBALL_BASE="gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR/packages"
31 | readonly HIVE_TARBALL_URI="$TARBALL_BASE/hive/hive-*.tar.gz"
32 | readonly PIG_TARBALL_URI="$TARBALL_BASE/pig/pig-*.tar.gz"
33 |
34 | # Directory on master where hadoop is installed
35 | readonly HADOOP_HOME=/home/hadoop/hadoop
36 |
37 | # Set to the major version of hadoop ("1" or "2")
38 | readonly HADOOP_MAJOR_VERSION="1"
39 |
40 | # Hadoop username and group on Compute Engine Cluster
41 | readonly HADOOP_USER=hadoop
42 | readonly HADOOP_GROUP=hadoop
43 |
44 | # Hadoop client username on Compute Engine Cluster
45 | readonly HDP_USER=hdpuser
46 |
47 | # Directory on master where packages are installed
48 | readonly HDP_USER_HOME=/home/hdpuser
49 | readonly MASTER_INSTALL_DIR=/home/hdpuser
50 |
51 | # End: edit these values to set up your cluster
52 |
53 |
54 | # Begin: constants used throughout the solution
55 |
56 | # Subdirectory where packages files (tar.gz) are stored
57 | readonly PACKAGES_DIR=packages
58 |
59 | # Subdirectory where scripts are stored
60 | readonly SCRIPTS_DIR=scripts
61 |
62 | # Subdirectory on master where we pull down package files
63 | readonly MASTER_PACKAGE_DIR=/tmp/hdp_tools
64 |
65 | # User tmp dir in HDFS
66 | readonly HDFS_TMP_DIR="/tmp"
67 |
68 | # Hadoop temp dir (hadoop.tmp.dir)
69 | readonly HADOOP_TMP_DIR="/hadoop/tmp"
70 |
71 | # End: constants used throughout the solution
72 |
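One way to fill in the three {{{{ ... }}}} placeholders above is a simple sed pass; the bucket, zone, and hostname values below are purely illustrative.

    sed -i \
      -e 's/{{{{ bucket_name }}}}/my-gcs-bucket/' \
      -e 's/{{{{ zone_id }}}}/us-central1-a/' \
      -e 's/{{{{ master_hostname }}}}/hadoop-m/' \
      project_properties.sh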
--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/common_utils.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2013 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | set -o nounset
17 | set -o errexit
18 |
19 | function emit() {
20 | echo -e "$@"
21 | }
22 | readonly -f emit
23 |
24 | function die() {
25 | echo -e "$@" >&2
26 | exit 1
27 | }
28 | readonly -f die
29 |
30 |
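Typical usage of these two helpers, mirroring how the later scripts call them:

    source scripts/common_utils.sh

    emit "Checking for a local packages/ directory..."
    [[ -d packages ]] || die "packages/ directory not found; nothing to upload"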
--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/install-packages-on-master__at__host.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2013 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | set -o nounset
17 | set -o errexit
18 |
19 | readonly SCRIPTDIR=$(dirname $0)
20 |
21 | # Pull in global properties
22 | source project_properties.sh
23 |
24 | # Pull in common functions
25 | source $SCRIPTDIR/common_utils.sh
26 |
27 | # Files to push to master; place project_properties.sh in the same directory
28 | # as the other scripts
29 | readonly SCRIPT_FILES_TO_PUSH="\
30 | project_properties.sh \
31 | $SCRIPTS_DIR/common_utils.sh \
32 | $SCRIPTS_DIR/package_utils.sh \
33 | $SCRIPTS_DIR/setup-hdfs-for-hdtools__at__master.sh \
34 | $SCRIPTS_DIR/setup-packages__at__master.sh \
35 | $SCRIPTS_DIR/setup-ssh-keys__at__master.sh \
36 | "
37 | readonly MASTER_PACKAGE_SUBDIRS="\
38 | $MASTER_PACKAGE_DIR/$SCRIPTS_DIR \
39 | $MASTER_PACKAGE_DIR/conf/hive \
40 | $MASTER_PACKAGE_DIR/ssh-key
41 | "
42 |
43 | # Ensure permissions on the script files before we push them
44 | chmod 755 $SCRIPT_FILES_TO_PUSH
45 |
46 | # Create the destination directory on the master
47 | emit ""
48 | emit "Ensuring setup directories exist on master:"
49 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER sudo -i \
50 | "rm -rf $MASTER_PACKAGE_DIR && \
51 | mkdir -p $MASTER_PACKAGE_SUBDIRS"
52 |
53 | # Push the setup script to the master
54 | emit ""
55 | emit "Pushing the setup scripts to the master:"
56 | gcutil push --zone=$ZONE $MASTER \
57 | $SCRIPT_FILES_TO_PUSH $MASTER_PACKAGE_DIR/$SCRIPTS_DIR
58 |
59 | # Push configuration to the master
60 | emit ""
61 | emit "Pushing configuration to the master:"
62 | gcutil push --zone=$ZONE $MASTER \
63 | conf/hive/* $MASTER_PACKAGE_DIR/conf/hive
64 |
65 | # Execute the setup script on the master
66 | emit ""
67 | emit "Launching the user and package setup script on the master:"
68 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER \
69 | sudo $MASTER_PACKAGE_DIR/$SCRIPTS_DIR/setup-packages__at__master.sh
70 |
71 | # Execute the HDFS setup script on the master
72 | emit ""
73 | emit "Launching the HDFS setup script on the master:"
74 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER \
75 | sudo \
76 | $MASTER_PACKAGE_DIR/$SCRIPTS_DIR/setup-hdfs-for-hdtools__at__master.sh
77 |
78 | # Set up SSH keys for the user
79 | emit ""
80 | emit "Generating SSH keys for user $HDP_USER"
81 |
82 | readonly KEY_DIR=./ssh-key
83 | mkdir -p $KEY_DIR
84 | rm -f $KEY_DIR/$HDP_USER $KEY_DIR/${HDP_USER}.pub
85 |
86 | ssh-keygen -t rsa -P '' -f $KEY_DIR/$HDP_USER
87 | chmod o+r $KEY_DIR/${HDP_USER}.pub
88 | emit "Pushing SSH keys for user $HDP_USER to $MASTER"
89 | gcutil push --zone=$ZONE $MASTER \
90 | $KEY_DIR/${HDP_USER}.pub $MASTER_PACKAGE_DIR/ssh-key/
91 | emit "Adding SSH public key for user $HDP_USER to authorized_keys"
92 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER \
93 | sudo sudo -u $HDP_USER -i \
94 | $MASTER_PACKAGE_DIR/$SCRIPTS_DIR/setup-ssh-keys__at__master.sh \
95 | $MASTER_PACKAGE_DIR/ssh-key
96 |
97 | MASTER_IP=$(gcutil getinstance --zone=$ZONE $MASTER | \
98 | awk -F '|' \
99 | '$2 ~ / *external-ip */ { gsub(/[ ]*/, "", $3); print $3 }')
100 |
101 | emit ""
102 | emit "***"
103 | emit "SSH keys generated locally to:"
104 | emit " Public key: $KEY_DIR/$HDP_USER.pub"
105 | emit " Private key: $KEY_DIR/$HDP_USER"
106 | emit ""
107 | emit "Public key installed on $MASTER to ~$HDP_USER/.ssh/authorized_keys"
108 | emit ""
109 | emit "You may now ssh to user $HDP_USER@$MASTER with:"
110 | emit " ssh -i $KEY_DIR/$HDP_USER $HDP_USER@$MASTER_IP"
111 | emit "***"
112 |
113 | emit ""
114 | emit "Installation complete"
115 |
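Putting the host-side scripts together, a plausible end-to-end run (after editing project_properties.sh) looks like the sketch below. It is run from the querytools directory so that the relative `source project_properties.sh` resolves.

    cd sampleapps/querytools

    # 1. Stage the hive/pig tarballs into GCS.
    ./scripts/packages-to-gcs__at__host.sh

    # 2. Push scripts/config to the master and install the packages there.
    ./scripts/install-packages-on-master__at__host.sh

    # 3. Optionally clean the staged tarballs out of GCS afterwards.
    ./scripts/packages-delete-from-gcs__at__host.sh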
--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/package_utils.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2013 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | set -o nounset
17 | set -o errexit
18 |
19 | function pkgutil_get_list() {
20 | local pkg_dir="$1"
21 |
22 | find $pkg_dir -mindepth 2 -maxdepth 2 | sort
23 | }
24 | readonly -f pkgutil_get_list
25 |
26 | function pkgutil_pkg_name() {
27 | local pkg_dir="$1"
28 | local pkg="$2"
29 |
30 | # Strip the "package" directory
31 | local pkg_stripped=${pkg#$pkg_dir/}
32 |
33 | # Get the query-tool specific directory name
34 | echo ${pkg_stripped%/*}
35 | }
36 | readonly -f pkgutil_pkg_name
37 |
38 | function pkgutil_pkg_file() {
39 | local pkg_dir="$1"
40 | local pkg="$2"
41 |
42 | # Return just the filename
43 | echo ${pkg##*/}
44 | }
45 | readonly -f pkgutil_pkg_file
46 |
47 | function pkgutil_emit_list() {
48 | local pkg_dir="$1"
49 | local pkg_list="$2"
50 |
51 | emit ""
52 | emit "Discovered packages:"
53 | for pkg in $pkg_list; do
54 | # Get the query-tool specific directory name
55 | local pkg_name=$(pkgutil_pkg_name $pkg_dir $pkg)
56 |
57 |     # Get the name of the tarball file
58 | local pkg_file=$(pkgutil_pkg_file $pkg_dir $pkg)
59 |
60 | emit " $pkg_name ($pkg_file)"
61 | done
62 | }
63 | readonly -f pkgutil_emit_list
64 |
65 |
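Given the packages/ layout described in packages-to-gcs__at__host.sh (for example packages/hive/hive-0.10.0.tar.gz), the helpers behave roughly as follows:

    source scripts/common_utils.sh
    source scripts/package_utils.sh

    pkg=$(pkgutil_get_list packages | head -n 1)  # e.g. packages/hive/hive-0.10.0.tar.gz
    pkgutil_pkg_name packages "$pkg"              # -> hive
    pkgutil_pkg_file packages "$pkg"              # -> hive-0.10.0.tar.gz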
--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/packages-delete-from-gcs__at__host.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2013 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # packages-delete-from-gcs
17 | # This script removes the Hadoop query tool packages from Google Cloud
18 | # Storage which were uploaded by packages-to-gcs__at__host.sh
19 |
20 | set -o nounset
21 | set -o errexit
22 |
23 | readonly SCRIPTDIR=$(dirname $0)
24 |
25 | # Pull in global properties
26 | source project_properties.sh
27 |
28 | # Pull in common functions
29 | source $SCRIPTDIR/common_utils.sh
30 |
31 | # Remove packages from GCS
32 | emit ""
33 | emit "Removing packages:"
34 | gsutil rm -R -f gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR
35 |
36 | emit ""
37 | emit "Package removal complete"
38 |
39 |
--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/packages-to-gcs__at__host.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2013 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # packages-to-gcs
17 | # This script examines the Hadoop tools packages directory for a list
18 | # of packages to push to Google Cloud Storage.
19 | #
20 | # All packages should be found in the "packages" subdirectory.
21 | # The required format is for the package name to be a subdirectory
22 | # and the associated TAR.GZ file to be inside the package subdirectory:
23 | # packages/
24 | # hive/
25 | # hive-0.10.0.tar.gz
26 | # pig/
27 | # pig-0.11.1.tar.gz
28 |
29 | set -o nounset
30 | set -o errexit
31 |
32 | readonly SCRIPTDIR=$(dirname $0)
33 |
34 | # Pull in global properties
35 | source project_properties.sh
36 |
37 | # Pull in common functions
38 | source $SCRIPTDIR/common_utils.sh
39 | source $SCRIPTDIR/package_utils.sh
40 |
41 | # The resulting PACKAGE_LIST will contain one entry per package, where
42 | # each entry is of the form "package_dir/package/tarball"
43 | # (for example packages/hive/hive-0.10.0.tar.gz)
44 | PACKAGE_LIST=$(pkgutil_get_list $PACKAGES_DIR)
45 | if [[ -z $PACKAGE_LIST ]]; then
46 | die "No package found in $PACKAGES_DIR subdirectory"
47 | fi
48 |
49 | # Emit package list
50 | pkgutil_emit_list "$PACKAGES_DIR" "$PACKAGE_LIST"
51 |
52 | # Push packages to GCS
53 | emit ""
54 | emit "Pushing packages to gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR/:"
55 | gsutil -m cp -R $PACKAGES_DIR gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR/
56 |
57 | emit ""
58 | emit "Package upload complete"
59 |
60 |
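A short sketch of preparing the expected packages/ layout before running this script; the tarball source paths are placeholders.

    mkdir -p packages/hive packages/pig
    cp /path/to/hive-0.10.0.tar.gz packages/hive/
    cp /path/to/pig-0.11.1.tar.gz  packages/pig/
    ./scripts/packages-to-gcs__at__host.sh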
--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/setup-hdfs-for-hdtools__at__master.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2013 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | set -o nounset
17 | set -o errexit
18 |
19 | SCRIPT=$(basename $0)
20 | SCRIPTDIR=$(dirname $0)
21 |
22 | source $SCRIPTDIR/project_properties.sh
23 | source $SCRIPTDIR/common_utils.sh
24 |
25 | readonly HDFS_CMD="sudo -u $HADOOP_USER -i $HADOOP_HOME/bin/hadoop fs"
26 | readonly HDFS_ROOT_USER="$HADOOP_USER"
27 |
28 | function hdfs_mkdir () {
29 | local dir=$1
30 | local owner=${2:-}
31 | local permissions=${3:-}
32 |
33 | emit " Checking directory $dir"
34 | if ! $HDFS_CMD -test -d $dir 2> /dev/null; then
35 | emit " Creating directory $dir"
36 | $HDFS_CMD -mkdir $dir
37 | fi
38 |
39 | if [[ -n "$owner" ]]; then
40 | emit " Ensuring owner $owner"
41 | $HDFS_CMD -chown $owner $dir
42 | fi
43 |
44 | if [[ -n "$permissions" ]]; then
45 | emit " Ensuring permissions $permissions"
46 | $HDFS_CMD -chmod $permissions $dir
47 | fi
48 | }
49 | readonly -f hdfs_mkdir
50 |
51 | emit ""
52 | emit "*** Begin: $SCRIPT running on master $(hostname) ***"
53 |
54 | # Ensure that /tmp exists (it should) and is fully accessible
55 | hdfs_mkdir "$HDFS_TMP_DIR" "$HDFS_ROOT_USER" "777"
56 |
57 | # Create a hive-specific scratch space in /tmp for the hdpuser
58 | hdfs_mkdir "$HDFS_TMP_DIR/hive-$HDP_USER" "$HDP_USER"
59 |
60 | # Create a warehouse directory (hive) for the hdpuser
61 | hdfs_mkdir "/user" "$HDFS_ROOT_USER"
62 | hdfs_mkdir "/user/$HDP_USER" "$HDP_USER"
63 | hdfs_mkdir "/user/$HDP_USER/warehouse" "$HDP_USER"
64 |
65 | # Create a mapreduce staging directory for the hdpuser
66 | if [[ "${HADOOP_MAJOR_VERSION}" == "2" ]]; then
67 | hdfs_mkdir "/hadoop/mapreduce" "$HADOOP_USER" "o+rw"
68 | hdfs_mkdir "/hadoop/mapreduce/staging" "$HADOOP_USER" "o+rw"
69 | hdfs_mkdir "/hadoop/mapreduce/staging/history" "$HADOOP_USER" "777"
70 | hdfs_mkdir "/hadoop/mapreduce/staging/$HDP_USER" "$HDP_USER"
71 | else
72 | hdfs_mkdir "$HADOOP_TMP_DIR/mapred/staging/$HDP_USER" "$HDP_USER"
73 | fi
74 |
75 | emit ""
76 | emit "*** End: $SCRIPT running on master $(hostname) ***"
77 | emit ""
78 |
79 |
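After this script runs, the resulting HDFS layout can be spot-checked from the master; the paths below assume the defaults from project_properties.sh (HADOOP_USER=hadoop, HDP_USER=hdpuser, HADOOP_HOME=/home/hadoop/hadoop).

    sudo -u hadoop -i /home/hadoop/hadoop/bin/hadoop fs -ls /tmp
    sudo -u hadoop -i /home/hadoop/hadoop/bin/hadoop fs -ls /user/hdpuser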
--------------------------------------------------------------------------------
/sampleapps/querytools/scripts/setup-ssh-keys__at__master.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2013 Google Inc. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # This script runs on the Hadoop master node as the target user ($HDP_USER).
17 | # It is assumed that a public key file for the user has been pushed
18 | # onto the master node and the location of that file is the first argument
19 | # to the script.
20 |
21 | set -o nounset
22 | set -o errexit
23 |
24 | readonly SCRIPT=$(basename $0)
25 | readonly SCRIPTDIR=$(dirname $0)
26 |
27 | # Pull in global properties
28 | source $SCRIPTDIR/project_properties.sh
29 | source $SCRIPTDIR/common_utils.sh
30 |
31 | if [[ $# -lt 1 ]]; then
32 |   die "usage: $0 <keys-dir>"
33 | fi
34 |
35 | KEY_DIR=$1; shift
36 | KEY_FILE=$KEY_DIR/${USER}.pub
37 |
38 | if [[ ! -e $KEY_FILE ]]; then
39 | die "Public key file not found: $KEY_FILE"
40 | fi
41 |
42 | # Ensure that the .ssh directory and authorized_keys files exist
43 | if [[ ! -e $HOME/.ssh/authorized_keys ]]; then
44 | mkdir -p $HOME/.ssh
45 | chmod 700 $HOME/.ssh
46 |
47 | touch $HOME/.ssh/authorized_keys
48 | chmod 600 $HOME/.ssh/authorized_keys
49 | fi
50 |
51 | # Add the public key file for the user to authorized_keys
52 | emit "Updating $HOME/.ssh/authorized_keys"
53 | (echo "# Added $(date)" && cat $KEY_FILE) >> $HOME/.ssh/authorized_keys
54 |
55 |
--------------------------------------------------------------------------------
/samples/bigquery_wordcount.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GoogleCloudDataproc/bdutil/967fd15b1f690e961f7d61809e4976aaa4ade90f/samples/bigquery_wordcount.jar
--------------------------------------------------------------------------------
/samples/test-mr-bigquery.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # Copyright 2013 Google Inc. All Rights Reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS-IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | ###############################################################################
18 | # Sets up and runs WordCount job to verify BigQuery setup.
19 | # Usage:
20 | # Specify fully-qualified outputTable, e.g. "[datasetId].[tableId]":
21 | # ./bdutil -v -u "samples/*" run_command ./test-mr-bigquery.sh [outputTable]
22 | # Auto-generate/create a datasetId, and use that (provide no args)
23 | # ./bdutil -v -u "samples/*" run_command ./test-mr-bigquery.sh
24 | ################################################################################
25 |
26 | set -e
27 |
28 | source hadoop-env-setup.sh
29 |
30 | OUTPUT_TABLE=$1
31 |
32 | CREATED_DATASET=0
33 | if [[ -z "${OUTPUT_TABLE}" ]]; then
34 | OUTPUT_DATASET="validate_bigquery_dataset_$(date +%s)"
35 | OUTPUT_TABLE="${OUTPUT_DATASET}.wordcount_output"
36 | echo "No OUTPUT_TABLE provided; using ${OUTPUT_TABLE}"
37 | bq mk "${PROJECT}:${OUTPUT_DATASET}"
38 | CREATED_DATASET=1
39 | fi
40 |
41 | INPUT_TABLE='publicdata:samples.shakespeare'
42 | INPUT_TABLE_FIELD='word'
43 | JAR='bigquery_wordcount.jar'
44 |
45 | # Check for existence of jar
46 | if ! [[ -r ${JAR} ]]; then
47 | echo "Error. Could not find jar: ${JAR}" >&2
48 | exit 1
49 | fi
50 |
51 | # Run the word count MapReduce job over the BigQuery input table
52 | hadoop jar ${JAR} ${PROJECT} ${INPUT_TABLE} ${INPUT_TABLE_FIELD} ${OUTPUT_TABLE}
53 |
54 | echo 'Word count finished successfully.' \
55 | "Manually clean up with 'bq rm ${OUTPUT_TABLE}'"
56 | if (( ${CREATED_DATASET} )); then
57 | echo "To delete entire dataset: 'bq rm -r ${OUTPUT_DATASET}'"
58 | fi
59 |
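Once the job completes, the output table can be inspected and cleaned up with the bq CLI, for example:

    # Show a few rows of the word-count output table.
    bq head -n 10 "${OUTPUT_TABLE}"

    # Remove the table (or the whole auto-created dataset) when done.
    bq rm -f "${OUTPUT_TABLE}"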
--------------------------------------------------------------------------------
/samples/word_count_mapper.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Mapper for use with hadoop-streaming bigquery word-count example.
15 |
16 | Reads each line of input and writes out lines each containing
17 | a single word and the number 1.
18 | The input lines consist of two tab-separated fields:
19 | 1. the record number
20 | 2. JSON data
21 | We pick one field of the JSON and use its value as the word to output.
22 | """
23 |
24 | import re
25 | import sys
26 |
27 |
28 | def main(args):
29 | # Set up the pattern that we use to extract our field
30 | field_name = args[1]
31 | field_pattern = '\\{.*"(' + field_name + ')":"([^"]*)".*\\}'
32 | field_extractor = re.compile(field_pattern)
33 |
34 | for line in sys.stdin:
35 | line = line.strip()
36 | key_and_json = line.split('\t', 1)
37 | json = key_and_json[1]
38 | matches = field_extractor.match(json)
39 | if matches:
40 | word = matches.group(2)
41 | if word:
42 | print '%s\t%s' % (word, 1)
43 |
44 |
45 | if __name__ == '__main__':
46 | main(sys.argv)
47 |
--------------------------------------------------------------------------------
/samples/word_count_reducer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Reducer for use with hadoop-streaming word-count example.
15 |
16 | Reads each line of input, sums the counts for each word,
17 | outputs a line with word and total count for each word.
18 | The input is assumed to be sorted by word.
19 | """
20 |
21 | from __future__ import print_function
22 |
23 | import re
24 | import sys
25 |
26 | current_word = None
27 | current_count = 0
28 | output_json = False
29 |
30 |
31 | def print_word_and_count(word, count):
32 | word = re.sub('"', "'", word) # replace double-quotes with single-quotes
33 | if output_json:
34 | print('0\t{"Word": "%s", "Count": %d}' % (word, count))
35 | # When streaming out to BigQuery, this key (0 here) is ignored.
36 | else:
37 | print('%s\t%s' % (word, count))
38 |
39 |
40 | def next_word(word, count):
41 | global current_word, current_count
42 | if current_word:
43 | print_word_and_count(current_word, current_count)
44 | current_word = word
45 | current_count = count
46 |
47 |
48 | def main(args):
49 | global current_count
50 | global output_json
51 |
52 | if len(args) > 1:
53 | if args[1] == '--output_json':
54 | output_json = True
55 | else:
56 | print("Unknown command line option '%s'" % args[1], file=sys.stderr)
57 | sys.exit(2)
58 |
59 | for line in sys.stdin:
60 | line = line.strip()
61 | word, count_string = line.split('\t', 1)
62 |
63 | try:
64 | count = int(count_string)
65 | except ValueError:
66 | continue # ignore lines that are not formatted correctly
67 |
68 | if word == current_word:
69 | current_count += count
70 | else:
71 | next_word(word, count)
72 |
73 | next_word(None, 0)
74 |
75 | if __name__ == '__main__':
76 | main(sys.argv)
77 |
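The mapper/reducer pair can be exercised locally, outside Hadoop streaming, with a plain shell pipeline. The sample JSON lines below are made up, and the scripts are written for Python 2.

    printf '0\t{"word":"hamlet"}\n1\t{"word":"hamlet"}\n2\t{"word":"yorick"}\n' |
      python2 word_count_mapper.py word |
      sort |
      python2 word_count_reducer.py
    # Expected output (tab-separated):
    # hamlet  2
    # yorick  1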
--------------------------------------------------------------------------------
/single_node_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2013 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # This file contains environment-variable overrides to be used in conjunction
16 | # with bdutil_env.sh in order to deploy a single-node Hadoop cluster.
17 | # Usage: ./bdutil deploy -e single_node_env.sh
18 |
19 | NUM_WORKERS=1
20 |
21 | # A single-node setup is much more likely to be used for development, so install
22 | # JDK with compiler/tools instead of just the minimal JRE.
23 | INSTALL_JDK_DEVEL=true
24 |
25 | # Save away the base evaluate_late_variable_bindings function so we can
26 | # override it.
27 | copy_func evaluate_late_variable_bindings old_evaluate_late_variable_bindings
28 |
29 | function evaluate_late_variable_bindings() {
30 | # Stash away the old value here so we can differentiate between whether the
31 | # user overrides set it or we just resolved it in the base implementation
32 | # of evaluate_late_variable_bindings.
33 | local old_nfs_master_hostname="${GCS_CACHE_MASTER_HOSTNAME}"
34 |
35 | old_evaluate_late_variable_bindings
36 |
37 | # In the case of the single-node cluster, we'll just use the whole PREFIX
38 | # as the name of the master and worker.
39 | WORKERS[0]=${PREFIX}
40 | MASTER_HOSTNAME=${PREFIX}
41 | WORKER_ATTACHED_PDS[0]="${PREFIX}-pd"
42 | MASTER_ATTACHED_PD="${PREFIX}-pd"
43 |
44 | # Fully qualified HDFS URI of namenode
45 | NAMENODE_URI="hdfs://${MASTER_HOSTNAME}:8020/"
46 |
47 | # Host and port of jobtracker
48 | JOB_TRACKER_URI="${MASTER_HOSTNAME}:9101"
49 |
50 | # GCS directory for deployment-related temporary files.
51 | local staging_dir_base="gs://${CONFIGBUCKET}/bdutil-staging"
52 | BDUTIL_GCS_STAGING_DIR="${staging_dir_base}/${MASTER_HOSTNAME}"
53 |
54 | # Default NFS cache host is the master node, but it can be overridden to point
55 | # at an NFS server off-cluster.
56 | if [[ -z "${old_nfs_master_hostname}" ]]; then
57 | GCS_CACHE_MASTER_HOSTNAME="${MASTER_HOSTNAME}"
58 | fi
59 |
60 | # Since $WORKERS and $MASTER_HOSTNAME both refer to the same single-node
61 | # VM, we must override COMMAND_STEPS to prevent duplicating steps. We also
62 | # omit deploy-ssh-worker-setup because there is no need to copy SSH keys to
63 | # the localhost.
64 | COMMAND_STEPS=(${COMMAND_STEPS[@]/,*/,*})
65 | }
66 |
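The COMMAND_STEPS rewrite at the end of the function is easiest to see with a toy example. The two sample entries are illustrative stand-ins for the real defaults in bdutil_env.sh; bdutil appears to treat '*' in a slot as "run nothing" for that half, which is what prevents each step from executing twice on the single VM.

    steps=("deploy-core-setup,deploy-core-setup" "*,deploy-ssh-worker-setup")
    # Quoted here to avoid accidental globbing; the env file relies on no
    # matching filenames being present when it does the same expansion unquoted.
    steps=("${steps[@]/,*/,*}")
    printf '%s\n' "${steps[@]}"
    # deploy-core-setup,*   <- worker half blanked out, so the step runs once
    # *,*                   <- deploy-ssh-worker-setup dropped entirely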
--------------------------------------------------------------------------------
/standalone_nfs_cache_env.sh:
--------------------------------------------------------------------------------
1 | # Copyright 2015 Google Inc. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS-IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Handy wrapper around single_node_env.sh to turn up just a single server
16 | # capable of acting as the NFS-based GCS consistency cache for multiple
17 | # other clusters.
18 | #
19 | # Usage:
20 | #   ./bdutil -P my-nfs-server -p <project> -z <zone> -b <bucket> generate_config my-nfs-server_env.sh
21 | # ./bdutil -e my-nfs-server_env.sh deploy
22 | #
23 | #   ./bdutil -P cluster1 -p <project> -z <zone> -b <bucket> generate_config cluster1_env.sh
24 | # echo GCS_CACHE_MASTER_HOSTNAME=my-nfs-server >> cluster1_env.sh
25 | # ./bdutil -e cluster1_env.sh deploy
26 | #
27 | #   ./bdutil -P cluster2 -p <project> -z <zone> -b <bucket> generate_config cluster2_env.sh
28 | # echo GCS_CACHE_MASTER_HOSTNAME=my-nfs-server >> cluster2_env.sh
29 | # ./bdutil -e cluster2_env.sh deploy
30 | #
31 | # ./bdutil -e cluster2_env.sh delete
32 | # ./bdutil -e cluster1_env.sh delete
33 | # ./bdutil -e my-nfs-server_env.sh delete
34 |
35 | # Start with single_node_env.sh to get all the MASTER_HOSTNAME, etc.,
36 | # resolution.
37 | import_env single_node_env.sh
38 |
39 | # This server would be somewhat pointless without the GCS connector and the
40 | # NFS cache enabled.
41 | INSTALL_GCS_CONNECTOR=true
42 | DEFAULT_FS='gs'
43 | ENABLE_NFS_GCS_FILE_CACHE=true
44 |
45 | # We'll set up Hadoop as normal since it'll be handy to have "hadoop fs -ls"
46 | # on the cache server, but we just won't configure the hadoop daemons to start
47 | # on boot, and won't start them explicitly during deployment. That means
48 | # no jobracker or resourcemanager or namenode, but we should still be able to
49 | # use "hadoop fs" against GCS just fine.
50 | COMMAND_GROUPS+=(
51 | "deploy-standalone-nfs-cache:
52 | libexec/install_java.sh
53 | libexec/mount_disks.sh
54 | libexec/setup_hadoop_user.sh
55 | libexec/install_hadoop.sh
56 | libexec/install_bdconfig.sh
57 | libexec/configure_hadoop.sh
58 | libexec/install_and_configure_gcs_connector.sh
59 | libexec/configure_hdfs.sh
60 | libexec/set_default_fs.sh
61 | libexec/setup_master_nfs.sh
62 | "
63 | )
64 |
65 | COMMAND_STEPS=(
66 | "deploy-standalone-nfs-cache,*"
67 | "deploy-client-nfs-setup,*"
68 | )
69 |
--------------------------------------------------------------------------------