├── .dockerignore ├── CHANGES.txt ├── CONTRIBUTING ├── Dockerfile ├── LICENSE ├── README.md ├── bdutil ├── bdutil_env.sh ├── bigquery_env.sh ├── conf ├── hadoop1 │ ├── bq-mapred-template.xml │ ├── core-template.xml │ ├── gcs-core-template.xml │ ├── hdfs-template.xml │ ├── mapred-health-check.sh │ └── mapred-template.xml └── hadoop2 │ ├── bigtable-hbase-site-template.xml │ ├── bq-mapred-template.xml │ ├── capacity-scheduler-template.xml │ ├── core-template.xml │ ├── gcs-core-template.xml │ ├── hdfs-template.xml │ ├── mapred-template.xml │ └── yarn-template.xml ├── docs ├── JOBS.md ├── MONITORING.md ├── QUICKSTART.md └── SHUTDOWN.md ├── extensions ├── bigtable │ ├── bigtable_env.sh │ └── install_hbase_bigtable.sh ├── flink │ ├── README.md │ ├── flink_env.sh │ ├── install_flink.sh │ └── start_flink.sh ├── google │ ├── experimental │ │ └── resize_env.sh │ └── gcs-validate-setup.sh ├── hama │ ├── README.md │ ├── hama_env.sh │ ├── install_hama.sh │ └── start_hama.sh ├── hbase │ ├── README.md │ ├── hbase_env.sh │ ├── install_hbase.sh │ └── start_hbase.sh ├── querytools │ ├── hive-validate-setup.sh │ ├── pig-mapred-template.xml │ ├── pig-validate-setup.sh │ ├── prepare_files.sh │ ├── querytools_env.sh │ └── setup_profiles.sh ├── spark │ ├── install_shark.sh │ ├── install_spark.sh │ ├── spark-validate-setup.sh │ ├── spark_configure_startup_processes.sh │ ├── spark_env.sh │ ├── spark_on_yarn_env.sh │ ├── spark_shark_env.sh │ ├── start_single_spark_worker.sh │ └── start_spark.sh ├── storm │ ├── README.md │ ├── install_storm.sh │ ├── install_supervisor.sh │ ├── install_zookeeper.sh │ ├── jar.xml │ ├── start_storm_master.sh │ ├── start_storm_worker.sh │ └── storm_env.sh └── tajo │ ├── README.md │ ├── configure_tajo.sh │ ├── install_tajo.sh │ ├── start_tajo.sh │ └── tajo_env.sh ├── hadoop-validate-setup.sh ├── hadoop2_env.sh ├── libexec ├── bdutil_helpers.sh ├── configure_hadoop.sh ├── configure_hdfs.sh ├── configure_mrv2_mem.py ├── configure_startup_processes.sh ├── hadoop_helpers.sh ├── install_and_configure_bigquery_connector.sh ├── install_and_configure_gcs_connector.sh ├── install_bdconfig.sh ├── install_hadoop.sh ├── install_java.sh ├── mount_disks.sh ├── set_default_fs.sh ├── setup_client_nfs.sh ├── setup_hadoop_user.sh ├── setup_master_nfs.sh ├── setup_master_ssh.sh ├── setup_worker_ssh.sh ├── start_hadoop.sh └── start_hadoop2.sh ├── platforms ├── cdh │ ├── README.md │ ├── cdh-core-template.xml │ ├── cdh_env.sh │ ├── configure_cdh.sh │ └── install_cdh.sh ├── hdp │ ├── README.md │ ├── TEST.md │ ├── ambari.conf │ ├── ambari_env.sh │ ├── ambari_functions.sh │ ├── ambari_manual_env.sh │ ├── ambari_manual_post_deploy_env.sh │ ├── configuration.json │ ├── create_blueprint.py │ ├── install_ambari.sh │ ├── install_ambari_components.sh │ ├── install_gcs_connector_on_ambari.sh │ ├── resources │ │ ├── public-hostname-gcloud.sh │ │ └── thp-disable.sh │ └── update_ambari_config.sh ├── mapr │ ├── README.md │ ├── configure_mapr_instance.sh │ ├── mapr_env.sh │ ├── mapr_license.txt │ ├── node.lst │ └── prepare_mapr_image.sh └── restart_services.sh ├── sampleapps └── querytools │ ├── COPYING │ ├── README.md │ ├── conf │ └── hive │ │ └── hive-site.xml │ ├── examples │ └── ngrams │ │ ├── hive_query_ngrams.q │ │ ├── hive_table_create.sh │ │ ├── ngram_hdfs_load.sh │ │ ├── ngram_setup.sh │ │ └── pig_query_ngrams.pig │ ├── project_properties.sh │ └── scripts │ ├── common_utils.sh │ ├── install-packages-on-master__at__host.sh │ ├── package_utils.sh │ ├── packages-delete-from-gcs__at__host.sh │ ├── packages-to-gcs__at__host.sh │ ├── setup-hdfs-for-hdtools__at__master.sh │ ├── setup-packages__at__master.sh │ └── setup-ssh-keys__at__master.sh ├── samples ├── bigquery_wordcount.jar ├── streaming_word_count.sh ├── test-mr-bigquery.sh ├── word_count_mapper.py └── word_count_reducer.py ├── single_node_env.sh └── standalone_nfs_cache_env.sh /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .gitignore 3 | 4 | *.swp 5 | */*.swp 6 | */*/*.swp 7 | */*/*/*.swp 8 | */*/*/*/*.swp 9 | -------------------------------------------------------------------------------- /CONTRIBUTING: -------------------------------------------------------------------------------- 1 | Want to contribute? Great! First, read this page (including the small print at the end). 2 | 3 | ### Before you contribute 4 | Before we can use your code, you must sign the 5 | [Google Individual Contributor License Agreement](https://developers.google.com/open-source/cla/individual?csw=1) 6 | (CLA), which you can do online. The CLA is necessary mainly because you own the 7 | copyright to your changes, even after your contribution becomes part of our 8 | codebase, so we need your permission to use and distribute your code. We also 9 | need to be sure of various other things—for instance that you'll tell us if you 10 | know that your code infringes on other people's patents. You don't have to sign 11 | the CLA until after you've submitted your code for review and a member has 12 | approved it, but you must do it before we can put your code into our codebase. 13 | Before you start working on a larger contribution, you should get in touch with 14 | us first through the issue tracker with your idea so that we can help out and 15 | possibly guide you. Coordinating up front makes it much easier to avoid 16 | frustration later on. 17 | 18 | ### Code reviews 19 | All submissions, including submissions by project members, require review. We 20 | use Github pull requests for this purpose. 21 | 22 | ### The small print 23 | Contributions made by corporations are covered by a different agreement than 24 | the one above, the Software Grant and Corporate Contributor License Agreement. 25 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM google/cloud-sdk 2 | 3 | ADD . /bdutil/ 4 | 5 | ENTRYPOINT ["/bdutil/bdutil"] 6 | CMD ["--help"] 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # This project has been deprecated. Please use [Google Cloud Dataproc](https://cloud.google.com/dataproc) to create managed Apache Hadoop and Apache Spark instances on [Google Compute Engine](https://cloud.google.com/compute). 2 | 3 | # bdutil 4 | 5 | bdutil is a command-line script used to manage Apache Hadoop and Apache Spark instances on [Google Compute Engine](https://cloud.google.com/compute). bdutil manages deployment, configuration, and shutdown of your Hadoop instances. 6 | 7 | ## Requirements 8 | 9 | bdutil depends on the [Google Cloud SDK](https://cloud.google.com/sdk). bdutil is supported in any posix-compliant Bash v3 or greater shell. 10 | 11 | ## Usage 12 | 13 | See the [QUICKSTART](/docs/QUICKSTART.md) file in the `docs` directory to learn how to set up your Hadoop instances using bdutil. 14 | 15 | 1. Install and configure the [Google Cloud SDK](https://cloud.google.com/sdk) if you have already not done so 16 | 1. Clone this repository with `git clone https://github.com/GoogleCloudPlatform/bdutil.git` 17 | 1. Modify the following variables in the bdutil_env.sh file: 18 | 1. `PROJECT` - Set to the project ID for all bdutil commands. The project value will be overridden in the following order (where 1 overrides 2, and 2 overrides 3): 19 | * -p flag value, or if not specified then 20 | * PROJECT value in bdutil_env.sh, or if not specified then 21 | * gcloud default project value 22 | 1. `CONFIGBUCKET` - Set to a Google Compute Storage bucket that your project has read/write access to. 23 | 1. Run `bdutil --help` for a list of commands. 24 | 25 | The script implements the following commands, which are very similar: 26 | 27 | * `bdutil create` creates and starts instances, but will not apply most configuration settings. You can call `bdutil run_command_steps` on instances afterward to apply configuration settings to them. Typically you wouldn't use this, but would use `bdutil deploy` instead. 28 | * `bdutil deploy` creates and starts instances with all the configuration options specified in the command line and any included configuration scripts. 29 | 30 | ## Components installed 31 | 32 | The latest release of bdutil is `1.3.5`. This bdutil release installs the following versions of open source components: 33 | 34 | * Apache Hadoop - 1.2.1 (2.7.1 if you use the `-e` argument) 35 | * Apache Spark - 1.5.0 36 | * Apache Pig - 0.12 37 | * Apache Hive - 1.2.1 38 | 39 | ## Documentation 40 | 41 | The following documentation is useful for bdutil. 42 | 43 | * **[Quickstart](/docs/QUICKSTART.md)** - A guide on how to get started with bdutil quickly. 44 | * **[Jobs](/docs/JOBS.md)** - How to submit jobs (work) to a bdutil cluster. 45 | * **[Monitoring](/docs/MONITORING.md)** - How to monitor bdutil cluster. 46 | * **[Shutdown](/docs/SHUTDOWN.md)** - How shutdown a bdutil cluster. 47 | -------------------------------------------------------------------------------- /bigquery_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This file contains environment-variable overrides to be used in conjunction 16 | # with bdutil_env.sh in order to deploy a bigquery-enabled Hadoop cluster. 17 | # Usage: ./bdutil deploy bigquery_env.sh 18 | 19 | GCE_SERVICE_ACCOUNT_SCOPES+=('bigquery') 20 | 21 | # Whether or not to install and configure the BigQuery connector. 22 | INSTALL_BIGQUERY_CONNECTOR=true 23 | 24 | -------------------------------------------------------------------------------- /conf/hadoop1/bq-mapred-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mapred.bq.project.id 6 | 7 | 8 | Google Cloud Project ID to use for BigQuery operations. 9 | 10 | 11 | 12 | mapred.bq.gcs.bucket 13 | 14 | 15 | The GCS bucket holding temporary BigQuery data for the input connector. 16 | 17 | 18 | 19 | mapred.bq.output.buffer.size 20 | 67108864 21 | 22 | The size in bytes of the output buffer to use when writing to BigQuery. 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /conf/hadoop1/core-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hadoop.tmp.dir 6 | 7 | A base for other temporary directories. 8 | 9 | 10 | -------------------------------------------------------------------------------- /conf/hadoop1/gcs-core-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.gs.project.id 6 | 7 | 8 | Google Cloud Project ID with access to configured GCS buckets. 9 | 10 | 11 | 12 | fs.gs.system.bucket 13 | 14 | 15 | GCS bucket to use as a default bucket if fs.default.name is not a gs: uri. 16 | 17 | 18 | 19 | fs.gs.working.dir 20 | / 21 | 22 | The directory relative gs: uris resolve in inside of the default bucket. 23 | 24 | 25 | 26 | fs.gs.impl 27 | com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem 28 | The FileSystem for gs: (GCS) uris. 29 | 30 | 31 | fs.gs.metadata.cache.enable 32 | true 33 | 34 | If true, a DirectoryListCache will be used to supplement "list" requests 35 | to GCS to fill in any missing items caused by eventual list consistency, 36 | intercepting create/delete/copy calls to create cache entries. The 37 | concrete type is determined with fs.gs.metadata.cache.type. 38 | 39 | 40 | 41 | fs.gs.metadata.cache.type 42 | 43 | 44 | Specifies which implementation of DirectoryListCache to use for 45 | supplementing GCS API "list" requests. Supported implementations: 46 | IN_MEMORY: Enforces immediate consistency within same Java process. 47 | FILESYSTEM_BACKED: Enforces consistency across all cooperating processes 48 | pointed at the same local mirror directory, which may be an NFS directory 49 | for massively-distributed coordination. 50 | 51 | 52 | 53 | fs.gs.metadata.cache.directory 54 | 55 | 56 | Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies 57 | the local path to use as the base path for storing mirrored GCS metadata. 58 | Must be an absolute path, must be a directory, and must be fully 59 | readable/writable/executable by any user running processes which use the 60 | GCS connector. 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /conf/hadoop1/hdfs-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.namenode.rpc-address 6 | 7 | 8 | RPC address that handles all clients requests. If empty then we'll get 9 | thevalue from fs.default.name.The value of this property will take the 10 | form of hdfs://nn-host1:rpc-port. 11 | 12 | 13 | 14 | dfs.name.dir 15 | 16 | 17 | Determines where on the local filesystem the DFS namenode should store the 18 | name table(fsimage). If this is a comma-delimited list of directories then 19 | the name table is replicated in all of thedirectories, for redundancy. 20 | 21 | 22 | 23 | dfs.data.dir 24 | 25 | 26 | Determines where on the local filesystem an DFS datanode should store its 27 | blocks. If this is a comma-delimited list of directories, then data will 28 | be stored in all named directories, typically on different 29 | devices.Directories that do not exist are ignored. 30 | 31 | 32 | 33 | dfs.datanode.data.dir.perm 34 | 35 | 36 | Permissions for the directories on on the local filesystem where the DFS 37 | data node store its blocks. The permissions can either be octal or 38 | symbolic. 39 | 40 | 41 | 42 | dfs.permissions 43 | 44 | 45 | If "true", enable permission checking in HDFS. If "false", permission 46 | checking is turned off, but all other behavior is unchanged. Switching 47 | from one parameter value to the other does not change the mode, owner or 48 | group of files or directories. 49 | 50 | 51 | 52 | dfs.replication 53 | 2 54 | 55 | Default block replication. The actual number of replications can be 56 | specified when the file is created. The default is used if replication 57 | is not specified in create time. 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /conf/hadoop1/mapred-health-check.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | 19 | # Check to see if the TaskTracker is healthy by checking it's http address. 20 | # Necessary to avoid [MAPREDUCE-4668]. 21 | 22 | # Redirect stderr to stdout. 23 | # Necessary to see problems with health check script in log. 24 | # Will only show stdout if ERROR is present at the beginning of a line. 25 | exec 2>&1 26 | 27 | BIN=$(dirname "$0") 28 | BIN=$(cd "${BIN}"; pwd) 29 | HADOOP_CMD="${BIN}/hadoop" 30 | 31 | TASK_TRACKER_HTTP_ADDRESS=$(${HADOOP_CMD} jobtracker -dumpConfiguration 2>/dev/null \ 32 | | sed -n 's/.*task\.tracker\.http\.address","value":"\([.:0-9]*\)".*/\1/p') 33 | 34 | if [[ -n "${TASK_TRACKER_HTTP_ADDRESS}" ]]; then 35 | curl -sm 10 -o /dev/null ${TASK_TRACKER_HTTP_ADDRESS} 36 | ERROR_CODE=$? 37 | if (( ${ERROR_CODE} == 28 )); then 38 | echo "ERROR curl timed out trying to reach the TaskTracker web server." \ 39 | "Assuming the TaskTracker is unhealthy." 40 | elif (( ${ERROR_CODE} )); then 41 | echo "WARN curl failed to reach the TaskTracker, but did not time out." 42 | else 43 | echo "DEBUG Successfully curled TaskTracker." 44 | fi 45 | else 46 | echo "WARN Failed to determine TaskTracker http address." \ 47 | "Not checking health." 48 | fi 49 | 50 | # TaskTracker disregards ERRORs with non-zero exit code. 51 | exit 0 52 | -------------------------------------------------------------------------------- /conf/hadoop1/mapred-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mapred.job.tracker 6 | 7 | 8 | The host and port that the MapReduce job tracker runsat. If "local", 9 | then jobs are run in-process as a single mapand reduce task. 10 | 11 | 12 | 13 | mapred.map.tasks 14 | 15 | 16 | The default number of map tasks per job.Ignored when mapred.job.tracker is 17 | "local". 18 | 19 | 20 | 21 | mapred.reduce.tasks 22 | 23 | 24 | The default number of reduce tasks per job. Typically set to 99%of the 25 | cluster's reduce capacity, so that if a node fails the reduces canstill be 26 | executed in a single wave.Ignored when mapred.job.tracker is 27 | "local". 28 | 29 | 30 | 31 | mapred.tasktracker.map.tasks.maximum 32 | 33 | 34 | The maximum number of map tasks that will be runsimultaneously by a task 35 | tracker. 36 | 37 | 38 | 39 | mapred.tasktracker.reduce.tasks.maximum 40 | 41 | 42 | The maximum number of reduce tasks that will be runsimultaneously by a 43 | task tracker. 44 | 45 | 46 | 47 | mapred.child.java.opts 48 | 49 | 50 | Java opts for the task tracker child processes.The following symbol, if 51 | present, will be interpolated: @taskid@ is replacedby current TaskID. Any 52 | other occurrences of '@' will go unchanged.For example, to enable verbose 53 | gc logging to a file named for the taskid in/tmp and to set the heap 54 | maximum to be a gigabyte, pass a 'value' of:-Xmx1024m -verbose:gc 55 | -Xloggc:/tmp/@taskid@.gcThe configuration variable mapred.child.ulimit can 56 | be used to control themaximum virtual memory of the child processes. 57 | 58 | 59 | 60 | mapred.jobtracker.restart.recover 61 | true 62 | 63 | Whether or not to enable (job) recovery upon restart. 64 | 65 | 66 | 67 | mapreduce.jobtracker.expire.trackers.interval 68 | 60000 69 | 70 | The time-interval, in milliseconds, after which a tasktracker is 71 | declared 'lost' if it doesn't send heartbeats. The Hadoop 72 | distribution default is 600000 (10 minutes), we set this to 73 | 60000 (1 minute) to quickly reassign work. 74 | 75 | 76 | 77 | mapred.local.dir 78 | 79 | 80 | Directories on the local machine in which to store mapreduce temp files. 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /conf/hadoop2/bigtable-hbase-site-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | hbase.client.connection.impl 7 | 8 | 9 | 10 | google.bigtable.endpoint.host 11 | 12 | 13 | 14 | google.bigtable.admin.endpoint.host 15 | 16 | 17 | 18 | google.bigtable.project.id 19 | 20 | 21 | 22 | google.bigtable.zone.name 23 | 24 | 25 | 26 | google.bigtable.cluster.name 27 | 28 | 29 | 30 | yarn.app.mapreduce.am.command-opts 31 | 32 | 33 | 34 | mapreduce.map.java.opts 35 | 36 | 37 | 38 | mapreduce.reduce.java.opts 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /conf/hadoop2/bq-mapred-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mapred.bq.project.id 6 | 7 | 8 | Google Cloud Project ID to use for BigQuery operations. 9 | 10 | 11 | 12 | mapred.bq.gcs.bucket 13 | 14 | 15 | The GCS bucket holding temporary BigQuery data for the input connector. 16 | 17 | 18 | 19 | mapred.bq.output.buffer.size 20 | 67108864 21 | 22 | The size in bytes of the output buffer to use when writing to BigQuery. 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /conf/hadoop2/core-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hadoop.tmp.dir 6 | 7 | A base for other temporary directories. 8 | 9 | 10 | fs.defaultFS 11 | file:/// 12 | 13 | The name of the default file system. A URI whose scheme and authority 14 | determine the FileSystem implementation. The uri's scheme determines 15 | the config property (fs.SCHEME.impl) naming the FileSystem 16 | implementation class. The uri's authority is used to determine the 17 | host, port, etc. for a filesystem. 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /conf/hadoop2/gcs-core-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.gs.project.id 6 | 7 | 8 | Google Cloud Project ID with access to configured GCS buckets. 9 | 10 | 11 | 12 | fs.gs.system.bucket 13 | 14 | 15 | GCS bucket to use as a default bucket if fs.default.name is not a gs: uri. 16 | 17 | 18 | 19 | fs.gs.working.dir 20 | / 21 | 22 | The directory relative gs: uris resolve in inside of the default bucket. 23 | 24 | 25 | 26 | fs.gs.impl 27 | com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem 28 | The FileSystem for gs: (GCS) uris. 29 | 30 | 31 | fs.AbstractFileSystem.gs.impl 32 | com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS 33 | The AbstractFileSystem for gs: (GCS) uris. 34 | 35 | 36 | fs.gs.metadata.cache.enable 37 | true 38 | 39 | If true, a DirectoryListCache will be used to supplement "list" requests 40 | to GCS to fill in any missing items caused by eventual list consistency, 41 | intercepting create/delete/copy calls to create cache entries. The 42 | concrete type is determined with fs.gs.metadata.cache.type. 43 | 44 | 45 | 46 | fs.gs.metadata.cache.type 47 | 48 | 49 | Specifies which implementation of DirectoryListCache to use for 50 | supplementing GCS API "list" requests. Supported implementations: 51 | IN_MEMORY: Enforces immediate consistency within same Java process. 52 | FILESYSTEM_BACKED: Enforces consistency across all cooperating processes 53 | pointed at the same local mirror directory, which may be an NFS directory 54 | for massively-distributed coordination. 55 | 56 | 57 | 58 | fs.gs.metadata.cache.directory 59 | 60 | 61 | Only used if fs.gs.metadata.cache.type is FILESYSTEM_BACKED, specifies 62 | the local path to use as the base path for storing mirrored GCS metadata. 63 | Must be an absolute path, must be a directory, and must be fully 64 | readable/writable/executable by any user running processes which use the 65 | GCS connector. 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /conf/hadoop2/hdfs-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.namenode.secondary.http-address 6 | :50090 7 | 8 | The secondary namenode http server address and port. 9 | 10 | 11 | 12 | dfs.namenode.rpc-address 13 | :8020 14 | 15 | RPC address that handles all clients requests. If empty then we'll get 16 | thevalue from fs.default.name.The value of this property will take the 17 | form of hdfs://nn-host1:rpc-port. 18 | 19 | 20 | 21 | dfs.namenode.name.dir 22 | 23 | 24 | Determines where on the local filesystem the DFS namenode should store the 25 | name table(fsimage). If this is a comma-delimited list of directories then 26 | the name table is replicated in all of thedirectories, for redundancy. 27 | 28 | 29 | 30 | dfs.datanode.data.dir 31 | 32 | 33 | Determines where on the local filesystem an DFS datanode should store its 34 | blocks. If this is a comma-delimited list of directories, then data will 35 | be stored in all named directories, typically on different 36 | devices.Directories that do not exist are ignored. 37 | 38 | 39 | 40 | dfs.datanode.data.dir.perm 41 | 42 | 43 | Permissions for the directories on on the local filesystem where the DFS 44 | data node store its blocks. The permissions can either be octal or 45 | symbolic. 46 | 47 | 48 | 49 | dfs.permissions.enabled 50 | 51 | 52 | If "true", enable permission checking in HDFS. If "false", permission 53 | checking is turned off, but all other behavior is unchanged. Switching 54 | from one parameter value to the other does not change the mode, owner or 55 | group of files or directories. 56 | 57 | 58 | 59 | dfs.permissions.supergroup 60 | hadoop 61 | 62 | The name of the group of super-users. 63 | 64 | 65 | 66 | dfs.replication 67 | 2 68 | 69 | Default block replication. The actual number of replications can be 70 | specified when the file is created. The default is used if replication 71 | is not specified in create time. 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /conf/hadoop2/yarn-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | yarn.resourcemanager.hostname 6 | 7 | 8 | 9 | yarn.nodemanager.aux-services 10 | mapreduce_shuffle 11 | 12 | 13 | yarn.nodemanager.resource.memory-mb 14 | 15 | 16 | Amount of physical memory, in MB, that can be allocated for containers. 17 | 18 | 19 | 20 | yarn.scheduler.maximum-allocation-mb 21 | 22 | 23 | The maximum allocation for every container request at the RM, in MBs. 24 | Memory requests higher than this won't take effect, and will get capped 25 | to this value. 26 | 27 | 28 | 29 | yarn.scheduler.minimum-allocation-mb 30 | 31 | 32 | The minimum allocation for every container request at the RM, in MBs. 33 | Memory requests lower than this won't take effect, and the specified 34 | value will get allocated at minimum. 35 | 36 | 37 | 38 | yarn.nodemanager.resource.cpu-vcores 39 | 40 | 41 | Number of vcores that can be allocated for containers. This is used by 42 | the RM scheduler when allocating resources for containers. This is not 43 | used to limit the number of physical cores used by YARN containers. 44 | 45 | 46 | 47 | yarn.log-aggregation-enable 48 | false 49 | 50 | Enable remote logs aggregation to the default FS. 51 | 52 | 53 | 54 | yarn.nodemanager.remote-app-log-dir 55 | /yarn-logs/ 56 | 57 | The remote path, on the default FS, to store logs. 58 | 59 | 60 | 61 | yarn.resourcemanager.recovery.enabled 62 | true 63 | 64 | Enable RM to recover state after starting. 65 | 66 | 67 | 68 | yarn.resourcemanager.fs.state-store.uri 69 | file:///hadoop/yarn/system/rmstore 70 | 71 | URI pointing to the location of the FileSystem path where RM state will 72 | be stored. This is set on the local file system to avoid collisions in 73 | GCS. 74 | 75 | 76 | 77 | yarn.nodemanager.local-dirs 78 | 79 | 80 | Directories on the local machine in which to application temp files. 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /docs/JOBS.md: -------------------------------------------------------------------------------- 1 | # Jobs 2 | 3 | Once you have [created a cluster](QUICKSTART.md) you can submit "jobs" (work) to it. These can be entirely new jobs, or jobs you port from an existing environment. 4 | 5 | ## Writing Jobs 6 | 7 | To learn about how to write Hadoop jobs from the ground up, see the [Apache Hadoop tutorials](https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html). 8 | 9 | Google Cloud Platform offers input/output data connectors for your Hadoop and Spark jobs: 10 | 11 | * [Google BigQuery Connector for Hadoop](https://github.com/GoogleCloudPlatform/bigdata-interop) 12 | * [Google Cloud Storage Connector for Hadoop](https://github.com/GoogleCloudPlatform/bigdata-interop) 13 | 14 | ## Porting existing jobs 15 | 16 | When porting a job from HDFS using the Cloud Storage connector for Hadoop, be sure to use the correct file path syntax (`gs://`). 17 | Also note that `FileSystem.append` is unsupported. If you choose Cloud Storage as your default file system, update your MapReduce, if necessary, to avoid using the append method. 18 | 19 | ## Running jobs 20 | 21 | Once you've set up a Hadoop cluster and have written or ported a job, you can run the job using the following steps. 22 | 23 | ### Validating your setup and data 24 | 25 | First, validate that your cluster is set up, and that you can access your data. Navigate to the command line to execute the following commands. 26 | 27 | Type `./bdutil shell` to SSH into the master node of the Hadoop cluster. 28 | Type `hadoop fs -ls /` to check the cluster status. If data outputs, the cluster is set up correctly. 29 | 30 | ### Running the job 31 | 32 | Next, run the job from the command line, while you are still connected to the cluster via SSH. Always run jobs as the `hadoop` user to avoid having to type full Hadoop paths in commands. 33 | 34 | The following example runs a sample job called WordCount. Hadoop installations include this sample in the `/home/hadoop/hadoop-install/hadoop-examples-*.jar file.` 35 | 36 | To run the WordCount job: 37 | 38 | 1. Navigate to the command line. 39 | 1. Type `./bdutil shell` to SSH into the master node of the Hadoop cluster. 40 | 1. Type `hadoop fs -mkdir input` to create the `input` directory. 41 | Note that when using Google Cloud Storage as your [default file system](QUICKSTART.md), input automatically resolves to `gs://$/input`. 42 | 1. Copy any file from the web, such as the following example text from Apache, by typing the following command: `curl http://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/ClusterSetup.html > setup.html`. 43 | 1. Copy one or more text files into the `input` directory. Using the same Apache text in the previous step, type the following command: `hadoop fs -copyFromLocal setup.html input`. 44 | 1. Type `cd /home/hadoop/hadoop-install/` to navigate to the Hadoop install directory. 45 | 1. Type `hadoop jar hadoop-examples-*.jar wordcount input output` to run the job on data in the input directory, and place results in the output directory. 46 | 47 | ### Checking job status 48 | 49 | To check the status of of the Hadoop job, visit the [JobTracker page](http://wiki.apache.org/hadoop/JobTracker). See the [monitoring jobs](MONITORING.md) page for instructions on how to access the JobTracker. 50 | 51 | ### Cleanup 52 | 53 | After completing the job, make sure to [shut down the Hadoop cluster](SHUTDOWN.md) for the most cost effective solution. 54 | -------------------------------------------------------------------------------- /docs/SHUTDOWN.md: -------------------------------------------------------------------------------- 1 | # Shutting Down a Hadoop Cluster 2 | 3 | Because [Google Compute Engine](https://cloud.google.com/compute/) charges on a [per-minute basis](https://cloud.google.com/compute/pricing), it can be cost effective to shut down your Hadoop cluster once a workload completes. Once the Hadoop cluster is shut down, your data's accessibility depends on the [default file system](QUICKSTART.md) you've chosen: 4 | 5 | * When using HDFS, data is inaccessible. 6 | * When using [Google Cloud Storage](https://cloud.google.com/storage/), data is accessible with [gsutil](https://cloud.google.com/storage/docs/gsutil) or the [Google Cloud Platform Console](https://console.cloud.google.com/?_ga=1.81149463.169096153.1475769191). 7 | 8 | **When you delete (shutdown) a cluster, the operation is irreversible.** 9 | 10 | ## Issuing the delete command 11 | 12 | To shut down the Hadoop cluster, use the bdutil file included as part of the setup script. Type `./bdutil delete` in the `bdutil-` directory on the command line to shut down the cluster. 13 | 14 | Here is an example of the command being run. 15 | 16 | ~/bdutil-0.35.1$ ./bdutil delete 17 | Wed Aug 13 16:03:15 PDT 2014: Using local tmp dir for staging files: /tmp/bdutil-20140813-160315 18 | Wed Aug 13 16:03:15 PDT 2014: Using custom environment-variable file(s): ./bdutil_env.sh 19 | Wed Aug 13 16:03:15 PDT 2014: Reading environment-variable file: ./bdutil_env.sh 20 | Delete cluster with following settings? 21 | CONFIGBUCKET='' 22 | PROJECT='' 23 | GCE_IMAGE='backports-debian-7' 24 | GCE_ZONE='us-central1-b' 25 | GCE_NETWORK='default' 26 | PREFIX='hadoop' 27 | NUM_WORKERS=2 28 | MASTER_HOSTNAME='hadoop-m' 29 | WORKERS='hadoop-w-0 hadoop-w-1' 30 | BDUTIL_GCS_STAGING_DIR='gs:///bdutil-staging/hadoop-m' 31 | (y/n) y 32 | Wed Aug 13 16:03:16 PDT 2014: Deleting hadoop cluster... 33 | ...Wed Aug 13 16:03:17 PDT 2014: Waiting on async 'deleteinstance' jobs to finish. Might take a while... 34 | ... 35 | Wed Aug 13 16:04:11 PDT 2014: Done deleting VMs! 36 | Wed Aug 13 16:04:11 PDT 2014: Execution complete. Cleaning up temporary files... 37 | Wed Aug 13 16:04:11 PDT 2014: Cleanup complete. 38 | 39 | ## Verifying all resources have been removed 40 | 41 | You **must** use the same bdutil configuration arguments for cluster creation and deletion. Altering the arguments might result in errors when shutting down the cluster. After the script executes, you can type `gcloud compute instances list --project= | grep ` and verify that no instances are still running. Similarly, you can type `gcloud compute disks list --project= | grep ` and verify that no created disks accidentally survived. 42 | -------------------------------------------------------------------------------- /extensions/bigtable/bigtable_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This file contains environment-variable overrides to be used in conjunction 16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with HBase installed 17 | # and configured to use Cloud Bigtable. 18 | # Usage: ./bdutil deploy -e extensions/bigtable/bigtable_env.sh. 19 | 20 | # Directory on each VM in which to install hbase. 21 | HBASE_INSTALL_DIR=/home/hadoop/hbase-install 22 | HBASE_CONF_DIR=${HBASE_INSTALL_DIR}/conf/ 23 | BIGTABLE_ENDPOINT=bigtable.googleapis.com 24 | BIGTABLE_ADMIN_ENDPOINT=bigtabletableadmin.googleapis.com 25 | 26 | BIGTABLE_ZONE=us-central1-b 27 | BIGTABLE_CLUSTER=cluster 28 | 29 | COMMAND_GROUPS+=( 30 | "install_bigtable: 31 | extensions/bigtable/install_hbase_bigtable.sh 32 | " 33 | ) 34 | 35 | # Installation of bigtable on master and workers 36 | COMMAND_STEPS+=( 37 | 'install_bigtable,install_bigtable' 38 | ) 39 | 40 | ALPN_VERSION=7.1.3.v20150130 41 | ALPN_REMOTE_JAR=http://central.maven.org/maven2/org/mortbay/jetty/alpn/alpn-boot/${ALPN_VERSION}/alpn-boot-${ALPN_VERSION}.jar 42 | BIGTABLE_HBASE_JAR=https://storage.googleapis.com/cloud-bigtable/jars/bigtable-hbase/bigtable-hbase-mapreduce-0.2.2-shaded.jar 43 | BIGTABLE_CONNECTION=com.google.cloud.bigtable.hbase1_1.BigtableConnection 44 | 45 | # Copied from http://www.us.apache.org/dist/hbase/stable/ 46 | # We don't want to overload the apache servers. 47 | HBASE_TARBALL_URI=https://storage.googleapis.com/cloud-bigtable/hbase-dist/hbase-1.1.2/hbase-1.1.2-bin.tar.gz 48 | 49 | BIGTABLE_LIB_DIR=${HBASE_INSTALL_DIR}/lib/bigtable 50 | ALPN_CLASSPATH=${BIGTABLE_LIB_DIR}/alpn-boot-${ALPN_VERSION}.jar 51 | BIGTABLE_BOOT_OPTS="-Xms1024m -Xmx2048m -Xbootclasspath/p:${ALPN_CLASSPATH}" 52 | 53 | # TODO: JAVAOPTS gets used in mapred-template.xml. There should probably be a better way to do this. 54 | JAVAOPTS="$JAVAOPTS -Xbootclasspath/p:$BIGTABLE_BOOT_OPTS" 55 | 56 | GCE_SERVICE_ACCOUNT_SCOPES+=( 57 | 'https://www.googleapis.com/auth/cloud-bigtable.admin' 58 | 'https://www.googleapis.com/auth/cloud-bigtable.data' 59 | 'https://www.googleapis.com/auth/cloud-bigtable.data.readonly' 60 | ) 61 | -------------------------------------------------------------------------------- /extensions/flink/README.md: -------------------------------------------------------------------------------- 1 | Deploying Flink on Google Compute Engine 2 | ======================================== 3 | 4 | Set up a bucket 5 | ---------------- 6 | 7 | If you have not done so, create a bucket for the bdutil config and 8 | staging files. A new bucket can be created with the gsutil: 9 | 10 | gsutil mb gs:// 11 | 12 | 13 | Adapt the bdutil config 14 | ----------------------- 15 | 16 | To deploy Flink with bdutil, adapt at least the following variables in 17 | bdutil_env.sh. 18 | 19 | CONFIGBUCKET="" 20 | PROJECT="" 21 | NUM_WORKERS= 22 | 23 | 24 | Bring up a cluster with Flink 25 | ----------------------------- 26 | 27 | To bring up the Flink cluster on Google Compute Engine, execute: 28 | 29 | ./bdutil -e extensions/flink/flink_env.sh deploy 30 | 31 | To run a Flink example job: 32 | 33 | ./bdutil shell 34 | curl http://www.gutenberg.org/cache/epub/2265/pg2265.txt > text 35 | gsutil cp text gs:///text 36 | cd /home/hadoop/flink-install/bin 37 | ./flink run ../examples/flink-java-examples-*-WordCount.jar gs:///text gs:///output -------------------------------------------------------------------------------- /extensions/flink/flink_env.sh: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS-IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | # This file contains environment-variable overrides to be used in conjunction 14 | # with bdutil_env.sh in order to deploy a Hadoop + Flink cluster. 15 | # Usage: ./bdutil deploy -e extensions/flink/flink_env.sh 16 | 17 | 18 | # In standalone mode, Flink runs the job manager and the task managers (workers) 19 | # on the cluster without using YARN containers. Flink also supports YARN 20 | # deployment which will be implemented in future version of the Flink bdutil plugin. 21 | FLINK_MODE="standalone" 22 | 23 | # URIs of tarballs for installation. 24 | FLINK_HADOOP1_TARBALL_URI='gs://flink-dist/flink-0.10.1-bin-hadoop1-scala_2.10.tgz' 25 | # Hadoop v2.7 build 26 | FLINK_HADOOP2_TARBALL_URI='gs://flink-dist/flink-0.10.1-bin-hadoop27-scala_2.10.tgz' 27 | 28 | # Directory on each VM in which to install each package. 29 | FLINK_INSTALL_DIR='/home/hadoop/flink-install' 30 | 31 | # Optional JVM arguments to pass 32 | # Flink config entry: env.java.opts: 33 | FLINK_JAVA_OPTS="-DsomeOption=value" 34 | 35 | # Heap memory used by the job manager (master) determined by the physical (free) memory of the server 36 | # Flink config entry: jobmanager.heap.mb 37 | FLINK_JOBMANAGER_MEMORY_FRACTION='0.8' 38 | 39 | # Heap memory used by the task managers (slaves) determined by the physical (free) memory of the servers 40 | # Flink config entry: taskmanager.heap.mb 41 | FLINK_TASKMANAGER_MEMORY_FRACTION='0.8' 42 | 43 | # Number of task slots per task manager (worker) 44 | # ideally set to the number of physical cpus 45 | # if set to 'auto', the number of slots will be determined automatically 46 | # Flink config entry: taskmanager.numberOfTaskSlots 47 | FLINK_TASKMANAGER_SLOTS='auto' 48 | 49 | # Default parallelism (number of concurrent actions per task) 50 | # If set to 'auto', this will be determined automatically 51 | # Flink config entry: parallelism.default 52 | FLINK_PARALLELISM='auto' 53 | 54 | # The number of buffers for the network stack. 55 | # Flink config entry: taskmanager.network.numberOfBuffers 56 | FLINK_NETWORK_NUM_BUFFERS=2048 57 | 58 | 59 | COMMAND_GROUPS+=( 60 | "install_flink: 61 | extensions/flink/install_flink.sh 62 | " 63 | "start_flink: 64 | extensions/flink/start_flink.sh 65 | " 66 | ) 67 | 68 | # Installation of flink on master and workers; then start_flink only on master. 69 | COMMAND_STEPS+=( 70 | 'install_flink,install_flink' 71 | 'start_flink,*' 72 | ) 73 | -------------------------------------------------------------------------------- /extensions/flink/install_flink.sh: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS-IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | 14 | # fail if undeclared variables are used 15 | set -o nounset 16 | # exit on error 17 | set -o errexit 18 | 19 | 20 | # Figure out which tarball to use based on which Hadoop version is being used. 21 | set +o nounset 22 | HADOOP_BIN="sudo -u hadoop ${HADOOP_INSTALL_DIR}/bin/hadoop" 23 | HADOOP_VERSION=$(${HADOOP_BIN} version | tr -cd [:digit:] | head -c1) 24 | set -o nounset 25 | if [[ "${HADOOP_VERSION}" == '2' ]]; then 26 | FLINK_TARBALL_URI=${FLINK_HADOOP2_TARBALL_URI} 27 | else 28 | FLINK_TARBALL_URI=${FLINK_HADOOP1_TARBALL_URI} 29 | fi 30 | 31 | # Install Flink via this fancy pipe 32 | gsutil cat "${FLINK_TARBALL_URI}" | tar -C /home/hadoop/ -xzv 33 | mv /home/hadoop/flink* "${FLINK_INSTALL_DIR}" 34 | 35 | # List all task managers (workers) in the slaves file 36 | # The task managers will be brought up by the job manager (master) 37 | echo ${WORKERS[@]} | tr ' ' '\n' > ${FLINK_INSTALL_DIR}/conf/slaves 38 | 39 | # Create temp file in hadoop directory which might be mounted to other storage than os 40 | FLINK_TASKMANAGER_TEMP_DIR="/hadoop/flink/tmp" 41 | mkdir -p ${FLINK_TASKMANAGER_TEMP_DIR} 42 | chgrp hadoop -R /hadoop/flink 43 | chmod 777 -R /hadoop/flink 44 | 45 | # Calculate the memory allocations, MB, using 'free -m'. Floor to nearest MB. 46 | TOTAL_MEM=$(free -m | awk '/^Mem:/{print $2}') 47 | FLINK_JOBMANAGER_MEMORY=$(python -c \ 48 | "print int(${TOTAL_MEM} * ${FLINK_JOBMANAGER_MEMORY_FRACTION})") 49 | FLINK_TASKMANAGER_MEMORY=$(python -c \ 50 | "print int(${TOTAL_MEM} * ${FLINK_TASKMANAGER_MEMORY_FRACTION})") 51 | 52 | # Determine the number of task slots 53 | if [[ "${FLINK_TASKMANAGER_SLOTS}" == "auto" ]] ; then 54 | FLINK_TASKMANAGER_SLOTS=`grep -c processor /proc/cpuinfo` 55 | fi 56 | 57 | # Determine the default parallelism 58 | if [[ "${FLINK_PARALLELISM}" == "auto" ]] ; then 59 | FLINK_PARALLELISM=$(python -c \ 60 | "print ${NUM_WORKERS} * ${FLINK_TASKMANAGER_SLOTS}") 61 | fi 62 | 63 | # Apply Flink settings by appending them to the default config 64 | cat << EOF >> ${FLINK_INSTALL_DIR}/conf/flink-conf.yaml 65 | jobmanager.rpc.address: ${MASTER_HOSTNAME} 66 | jobmanager.heap.mb: ${FLINK_JOBMANAGER_MEMORY} 67 | taskmanager.heap.mb: ${FLINK_TASKMANAGER_MEMORY} 68 | taskmanager.numberOfTaskSlots: ${FLINK_TASKMANAGER_SLOTS} 69 | parallelism.default: ${FLINK_PARALLELISM} 70 | taskmanager.network.numberOfBuffers: ${FLINK_NETWORK_NUM_BUFFERS} 71 | env.java.opts: ${FLINK_JAVA_OPTS} 72 | taskmanager.tmp.dirs: ${FLINK_TASKMANAGER_TEMP_DIR} 73 | fs.hdfs.hadoopconf: ${HADOOP_CONF_DIR} 74 | EOF 75 | 76 | # Find the Hadoop lib dir so and add its gcs-connector to the Flink lib dir 77 | set +o nounset 78 | if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then 79 | . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" 80 | fi 81 | if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \ 82 | [[ -n "${HADOOP_PREFIX}" ]]; then 83 | LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}" 84 | else 85 | LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib" 86 | fi 87 | set -o nounset 88 | # Get jar name and path 89 | GCS_JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR}) 90 | LOCAL_GCS_JAR="${LIB_JARS_DIR}/${GCS_JARNAME}" 91 | # create link in Flink lib dir 92 | ln -s "${LOCAL_GCS_JAR}" "${FLINK_INSTALL_DIR}/lib/" 93 | 94 | 95 | # Assign ownership of everything to the 'hadoop' user. 96 | chown -R hadoop:hadoop /home/hadoop/ 97 | # Make the Flink log directory writable 98 | chmod 777 ${FLINK_INSTALL_DIR}/log 99 | -------------------------------------------------------------------------------- /extensions/flink/start_flink.sh: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS-IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | 13 | set -o nounset 14 | set -o errexit 15 | 16 | if [[ ${FLINK_MODE} == 'standalone' ]]; then 17 | sudo -u hadoop ${FLINK_INSTALL_DIR}/bin/start-cluster.sh 18 | fi -------------------------------------------------------------------------------- /extensions/google/experimental/resize_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Plugin which allows manually resizing bdutil-deployed clusters. To resize 16 | # upwards, set NEW_NUM_WORKERS to the new, larger value, keeping the old 17 | # NUM_WORKERS (or -n flag) at the existing cluster size. Then: 18 | # 19 | # Deploy only the new workers, e.g. {hadoop-w-2, hadoop-w-3, hadoop-w-4}: 20 | # ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh deploy 21 | # 22 | # Explicitly start the Hadoop daemons on just the new workers: 23 | # ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh run_command -t workers -- "service hadoop-hdfs-datanode start && service hadoop-mapreduce-tasktracker start" 24 | # 25 | # If using Spark as well, explicitly start the Spark daemons on the new workers: 26 | # ./bdutil -e my_base_env.sh -e extensions/google/experimental/resize_env.sh run_command -t workers -u extensions/spark/start_single_spark_worker.sh -- "./start_single_spark_worker.sh" 27 | # 28 | # Edit your base config to reflect your new cluster size: 29 | # echo NUM_WORKERS=5 >> my_base_env.sh 30 | # 31 | # When resizing down, simply set the base NUM_WORKERS to the desired smaller 32 | # size, and set NEW_NUM_WORKERS equal to the current cluster size; this can 33 | # be thought of as "undo-ing" a "resize upwards" command: 34 | # ./bdutil -e my_base_env.sh -n 2 -e extensions/google/experimental/resize_env.sh delete 35 | # echo NUM_WORKERS=2 >> my_base_env.sh 36 | # 37 | # TODO(user): Merge into bdutil as a core command. 38 | NEW_NUM_WORKERS=5 39 | 40 | # During resizes, make sure to avoid touching the master node. 41 | SKIP_MASTER=true 42 | 43 | # Save away the base evaluate_late_variable_bindings function so we can 44 | # override it and replace the WORKERS array. 45 | copy_func evaluate_late_variable_bindings old_evaluate_late_variable_bindings 46 | 47 | function evaluate_late_variable_bindings() { 48 | old_evaluate_late_variable_bindings 49 | 50 | WORKERS=() 51 | WORKER_ATTACHED_PDS=() 52 | 53 | local worker_suffix='w' 54 | local master_suffix='m' 55 | if (( ${OLD_HOSTNAME_SUFFIXES} )); then 56 | echo 'WARNING: Using deprecated -nn and -dn naming convention' 57 | worker_suffix='dn' 58 | master_suffix='nn' 59 | fi 60 | for ((i = ${NUM_WORKERS}; i < ${NEW_NUM_WORKERS}; i++)); do 61 | local shift_i=$((${i} - ${NUM_WORKERS})) 62 | WORKERS[${shift_i}]="${PREFIX}-${worker_suffix}-${i}" 63 | done 64 | for ((i = ${NUM_WORKERS}; i < ${NEW_NUM_WORKERS}; i++)); do 65 | local shift_i=$((${i} - ${NUM_WORKERS})) 66 | WORKER_ATTACHED_PDS[${shift_i}]="${WORKERS[${shift_i}]}-pd" 67 | done 68 | 69 | local num_workers_to_add=$((${NEW_NUM_WORKERS} - ${NUM_WORKERS})) 70 | NUM_WORKERS=${num_workers_to_add} 71 | } 72 | -------------------------------------------------------------------------------- /extensions/hama/README.md: -------------------------------------------------------------------------------- 1 | Deploying Hama on Google Compute Engine 2 | =============================================== 3 | 4 | Apache Hama 5 | ----------- 6 | Apache Hama is a framework for Big Data analytics which uses the Bulk Synchronous Parallel (BSP) computing model, which was established in 2012 as a Top-Level Project of The Apache Software Foundation. 7 | 8 | It provides not only pure BSP programming model but also vertex and neuron centric programming models, inspired by Google's Pregel and DistBelief. 9 | 10 | Basic Usage 11 | ----------- 12 | 13 | Basic installation of [Apache Hama](http://hama.apache.org/) alongside Hadoop on Google Cloud Platform. 14 | 15 | ./bdutil -e extensions/hama/hama_env.sh deploy 16 | 17 | Or alternatively, using shorthand syntax: 18 | 19 | ./bdutil -e hama deploy 20 | 21 | Status 22 | ------ 23 | 24 | This plugin is currently considered experimental and not officially supported. 25 | Contributions are welcome. 26 | -------------------------------------------------------------------------------- /extensions/hama/hama_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This file contains environment-variable overrides to be used in conjunction 16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with Hama installed 17 | # and configured. 18 | # Usage: ./bdutil deploy extensions/hama/hama_env.sh. 19 | 20 | # URIs of tarball to install. 21 | HAMA_TARBALL_URI='gs://hama-dist/hama-dist-0.7.0.tar.gz' 22 | 23 | # Default Hama dist tarball requires Hadoop 2. 24 | import_env hadoop2_env.sh 25 | 26 | # Directory on each VM in which to install hama. 27 | HAMA_INSTALL_DIR='/home/hadoop/hama-install' 28 | 29 | COMMAND_GROUPS+=( 30 | "install_hama: 31 | extensions/hama/install_hama.sh 32 | " 33 | "start_hama: 34 | extensions/hama/start_hama.sh 35 | " 36 | ) 37 | 38 | # Installation of hama on master and workers; then start_hama only on master. 39 | COMMAND_STEPS+=( 40 | 'install_hama,install_hama' 41 | 'start_hama,*' 42 | ) 43 | -------------------------------------------------------------------------------- /extensions/hama/install_hama.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | set -o nounset 16 | set -o errexit 17 | 18 | # Get the filename out of the full URI. 19 | HAMA_TARBALL=${HAMA_TARBALL_URI##*/} 20 | 21 | # Get the tarball, untar it. 22 | gsutil cp ${HAMA_TARBALL_URI} /home/hadoop/${HAMA_TARBALL} 23 | tar -C /home/hadoop -xzvf /home/hadoop/${HAMA_TARBALL} 24 | mv /home/hadoop/hama*/ ${HAMA_INSTALL_DIR} 25 | 26 | # Set up hama-site.xml to make sure it can access HDFS. 27 | cat << EOF > ${HAMA_INSTALL_DIR}/conf/hama-site.xml 28 | 29 | 30 | 31 | 32 | bsp.master.address 33 | ${MASTER_HOSTNAME}:40000 34 | 35 | 36 | hama.zookeeper.quorum 37 | ${MASTER_HOSTNAME} 38 | 39 | 40 | fs.defaultFS 41 | hdfs://${MASTER_HOSTNAME}:8020/ 42 | 43 | 44 | EOF 45 | 46 | # Set up all workers to be groomservers. 47 | echo ${WORKERS[@]} | tr ' ' '\n' > ${HAMA_INSTALL_DIR}/conf/groomservers 48 | 49 | # Symlink the Hadoop hdfs-site.xml to hama's "copy" of it. 50 | ln -s ${HADOOP_CONF_DIR}/hdfs-site.xml ${HAMA_INSTALL_DIR}/conf/hdfs-site.xml 51 | 52 | # Explicitly set up JAVA_HOME for hama. 53 | JAVA_HOME=$(readlink -f $(which java) | sed 's|/bin/java$||') 54 | cat << EOF >> ${HAMA_INSTALL_DIR}/conf/hama-env.sh 55 | export JAVA_HOME=${JAVA_HOME} 56 | EOF 57 | 58 | # Add the hama 'bin' path to the .bashrc so that it's easy to call 'hama' 59 | # during interactive ssh session. 60 | add_to_path_at_login "${HAMA_INSTALL_DIR}/bin" 61 | 62 | # Assign ownership of everything to the 'hadoop' user. 63 | chown -R hadoop:hadoop /home/hadoop/ ${HAMA_INSTALL_DIR} 64 | -------------------------------------------------------------------------------- /extensions/hama/start_hama.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. # 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS-IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | set -o nounset 15 | set -o errexit 16 | 17 | sudo -u hadoop ${HAMA_INSTALL_DIR}/bin/start-bspd.sh 18 | -------------------------------------------------------------------------------- /extensions/hbase/README.md: -------------------------------------------------------------------------------- 1 | Deploying Apache HBase on Google Compute Engine 2 | =============================================== 3 | 4 | Basic Usage 5 | ----------- 6 | 7 | Basic installation of [Apache HBase](http://hbase.apache.org/) alongside Hadoop on Google Cloud Platform. 8 | 9 | ./bdutil -e extensions/hbase/hbase_env.sh deploy 10 | 11 | Or alternatively, using shorthand syntax: 12 | 13 | ./bdutil -e hbase deploy 14 | 15 | Status 16 | ------ 17 | 18 | This plugin is currently considered experimental and not officially supported. 19 | Contributions are welcome. 20 | -------------------------------------------------------------------------------- /extensions/hbase/hbase_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This file contains environment-variable overrides to be used in conjunction 16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with HBase installed 17 | # and configured. 18 | # Usage: ./bdutil deploy extensions/hbase/hbase_env.sh. 19 | 20 | # URIs of tarball to install. 21 | HBASE_TARBALL_URI='gs://hbase-dist/hbase-0.94.19.tar.gz' 22 | 23 | # Directory on each VM in which to install hbase. 24 | HBASE_INSTALL_DIR='/home/hadoop/hbase-install' 25 | 26 | COMMAND_GROUPS+=( 27 | "install_hbase: 28 | extensions/hbase/install_hbase.sh 29 | " 30 | "start_hbase: 31 | extensions/hbase/start_hbase.sh 32 | " 33 | ) 34 | 35 | # Installation of hbase on master and workers; then start_hbase only on master. 36 | COMMAND_STEPS+=( 37 | 'install_hbase,install_hbase' 38 | 'start_hbase,*' 39 | ) 40 | -------------------------------------------------------------------------------- /extensions/hbase/install_hbase.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | set -o nounset 16 | set -o errexit 17 | 18 | # Get the filename out of the full URI. 19 | HBASE_TARBALL=${HBASE_TARBALL_URI##*/} 20 | 21 | # Get the tarball, untar it. 22 | gsutil cp ${HBASE_TARBALL_URI} /home/hadoop/${HBASE_TARBALL} 23 | tar -C /home/hadoop -xzvf /home/hadoop/${HBASE_TARBALL} 24 | mv /home/hadoop/hbase*/ ${HBASE_INSTALL_DIR} 25 | 26 | # Set up hbase-site.xml to make sure it can access HDFS. 27 | cat << EOF > ${HBASE_INSTALL_DIR}/conf/hbase-site.xml 28 | 29 | 30 | 31 | 32 | hbase.rootdir 33 | hdfs://${MASTER_HOSTNAME}:8020/hbase 34 | 35 | 36 | hbase.zookeeper.quorum 37 | ${MASTER_HOSTNAME} 38 | 39 | 40 | hbase.cluster.distributed 41 | true 42 | 43 | 44 | EOF 45 | 46 | # Set up all workers to be regionservers. 47 | echo ${WORKERS[@]} | tr ' ' '\n' > ${HBASE_INSTALL_DIR}/conf/regionservers 48 | 49 | # Symlink the Hadoop hdfs-site.xml to hbase's "copy" of it. 50 | ln -s ${HADOOP_CONF_DIR}/hdfs-site.xml ${HBASE_INSTALL_DIR}/conf/hdfs-site.xml 51 | 52 | # Explicitly set up JAVA_HOME for hbase. 53 | JAVA_HOME=$(readlink -f $(which java) | sed 's|/bin/java$||') 54 | cat << EOF >> ${HBASE_INSTALL_DIR}/conf/hbase-env.sh 55 | export JAVA_HOME=${JAVA_HOME} 56 | EOF 57 | 58 | # Add the hbase 'bin' path to the .bashrc so that it's easy to call 'hbase' 59 | # during interactive ssh session. 60 | add_to_path_at_login "${HBASE_INSTALL_DIR}/bin" 61 | 62 | # Assign ownership of everything to the 'hadoop' user. 63 | chown -R hadoop:hadoop /home/hadoop/ ${HBASE_INSTALL_DIR} 64 | -------------------------------------------------------------------------------- /extensions/hbase/start_hbase.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. # 2 | # Licensed under the Apache License, Version 2.0 (the "License"); 3 | # you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS-IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | set -o nounset 15 | set -o errexit 16 | 17 | sudo -u hadoop ${HBASE_INSTALL_DIR}/bin/start-hbase.sh 18 | -------------------------------------------------------------------------------- /extensions/querytools/hive-validate-setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Runs a basic Hive script. 18 | # Usage: ./bdutil shell < extensions/querytools/hive-validate-setup.sh 19 | 20 | # File hadoop-confg.sh 21 | HADOOP_CONFIGURE_CMD='' 22 | HADOOP_CONFIGURE_CMD=$(find ${HADOOP_LIBEXEC_DIR} ${HADOOP_PREFIX} \ 23 | /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -name hadoop-config.sh | head -n 1) 24 | 25 | # If hadoop-config.sh has been found source it 26 | if [[ -n "${HADOOP_CONFIGURE_CMD}" ]]; then 27 | echo "Sourcing '${HADOOP_CONFIGURE_CMD}'" 28 | . ${HADOOP_CONFIGURE_CMD} 29 | fi 30 | 31 | HADOOP_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -wholename '*/bin/hadoop' | head -n 1) 32 | HIVE_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/hive* /usr/*/current/hive* -wholename '*/bin/hive' | head -n 1) 33 | 34 | #if it is still empty then dont run the tests 35 | if [[ "${HADOOP_CMD}" == '' ]]; then 36 | echo "Did not find hadoop'" 37 | exit 1 38 | fi 39 | 40 | #if it is still empty then dont run the tests 41 | if [[ "${HIVE_CMD}" == '' ]]; then 42 | echo "Did not find hive'" 43 | exit 1 44 | fi 45 | 46 | # Upload sample data. 47 | PARENT_DIR="/tmp/validate_hive_$(date +%s)" 48 | ${HADOOP_CMD} fs -mkdir ${PARENT_DIR} 49 | ${HADOOP_CMD} fs -put /etc/passwd ${PARENT_DIR} 50 | 51 | # Create a basic Hive script. 52 | echo "Creating hivetest.hive..." 53 | cat << EOF > hivetest.hive 54 | DROP TABLE bdutil_validate_hive_tbl; 55 | 56 | CREATE TABLE bdutil_validate_hive_tbl ( 57 | user STRING, 58 | dummy STRING, 59 | uid INT, 60 | gid INT, 61 | name STRING, 62 | home STRING, 63 | shell STRING 64 | ) 65 | ROW FORMAT DELIMITED 66 | FIELDS TERMINATED BY ':' 67 | STORED AS TEXTFILE; 68 | 69 | LOAD DATA INPATH '${PARENT_DIR}/passwd' 70 | OVERWRITE INTO TABLE bdutil_validate_hive_tbl; 71 | 72 | SELECT shell, COUNT(*) shell_count 73 | FROM bdutil_validate_hive_tbl 74 | GROUP BY shell 75 | ORDER BY shell_count DESC, shell DESC; 76 | EOF 77 | cat hivetest.hive 78 | 79 | # Run the script. 80 | ${HIVE_CMD} -f hivetest.hive > /tmp/hiveoutput.txt 81 | 82 | echo "Hive output:" 83 | cat /tmp/hiveoutput.txt 84 | 85 | # Run an equivalent pipeline of command-line invocations which pull out the 86 | # 'shell' field, sort/uniq to get the counts of each occurence, then finally 87 | # format to match Hive by printing tab-separated fields: 88 | # shell_count\tshell 89 | cat /etc/passwd | awk -F: '{print $7}' | sort | uniq -c | sort -nr | \ 90 | awk '{print $2, $1}' | sed "s/ /\t/" > /tmp/goldenoutput.txt 91 | 92 | echo "Expected output:" 93 | cat /tmp/goldenoutput.txt 94 | 95 | EXIT_CODE=0 96 | if diff /tmp/hiveoutput.txt /tmp/goldenoutput.txt; then 97 | echo "Verified correct output." 98 | else 99 | echo "Hive output doesn't match expected output!" 100 | EXIT_CODE=1 101 | fi 102 | 103 | # Cleanup. 104 | echo "Cleaning up test data: ${PARENT_DIR}" 105 | ${HADOOP_CMD} fs -rmr -skipTrash ${PARENT_DIR} 106 | 107 | exit ${EXIT_CODE} 108 | -------------------------------------------------------------------------------- /extensions/querytools/pig-mapred-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mapred.working.dir 6 | /user/ 7 | 8 | The FileSystem working directory to use for relative paths. 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /extensions/querytools/pig-validate-setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Runs a basic Pig script. 18 | # Usage: ./bdutil shell < extensions/querytools/pig-validate-setup.sh 19 | 20 | # File hadoop-confg.sh 21 | HADOOP_CONFIGURE_CMD='' 22 | HADOOP_CONFIGURE_CMD=$(find ${HADOOP_LIBEXEC_DIR} ${HADOOP_PREFIX} \ 23 | /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -name hadoop-config.sh | head -n 1) 24 | 25 | # If hadoop-config.sh has been found source it 26 | if [[ -n "${HADOOP_CONFIGURE_CMD}" ]]; then 27 | echo "Sourcing '${HADOOP_CONFIGURE_CMD}'" 28 | . ${HADOOP_CONFIGURE_CMD} 29 | fi 30 | 31 | HADOOP_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/hadoop* /usr/*/current/hadoop* -wholename '*/bin/hadoop' | head -n 1) 32 | PIG_CMD=$(find ${HADOOP_PREFIX} /home/hadoop /usr/*/pig* /usr/*/current/pig* -wholename '*/bin/pig' | head -n 1) 33 | 34 | #if it is still empty then dont run the tests 35 | if [[ "${HADOOP_CMD}" == '' ]]; then 36 | echo "Did not find hadoop'" 37 | exit 1 38 | fi 39 | 40 | #if it is still empty then dont run the tests 41 | if [[ "${PIG_CMD}" == '' ]]; then 42 | echo "Did not find pig'" 43 | exit 1 44 | fi 45 | 46 | # Upload sample data. 47 | PARENT_DIR="/tmp/validate_pig_$(date +%s)" 48 | ${HADOOP_CMD} fs -mkdir ${PARENT_DIR} 49 | ${HADOOP_CMD} fs -put /etc/passwd ${PARENT_DIR} 50 | 51 | # Create a basic Pig script. 52 | echo "Creating pigtest.pig..." 53 | cat << EOF > pigtest.pig 54 | SET job.name 'PigTest'; 55 | data = LOAD '${PARENT_DIR}/passwd' 56 | USING PigStorage(':') 57 | AS (user:CHARARRAY, dummy:CHARARRAY, uid:INT, gid:INT, 58 | name:CHARARRAY, home:CHARARRAY, shell:CHARARRAY); 59 | grp = GROUP data BY (shell); 60 | counts = FOREACH grp GENERATE 61 | FLATTEN(group) AS shell:CHARARRAY, COUNT(data) AS shell_count:LONG; 62 | res = ORDER counts BY shell_count DESC, shell DESC; 63 | DUMP res; 64 | EOF 65 | cat pigtest.pig 66 | 67 | # Run the script. 68 | ${PIG_CMD} pigtest.pig > /tmp/pigoutput.txt 69 | 70 | echo "Pig output:" 71 | cat /tmp/pigoutput.txt 72 | 73 | # Run an equivalent pipeline of command-line invocations which pull out the 74 | # 'shell' field, sort/uniq to get the counts of each occurence, then finally 75 | # format to match Pig by printing comma-separated fields in parens: 76 | # (shell_count,shell) 77 | cat /etc/passwd | awk -F: '{print $7}' | sort | uniq -c | sort -nr | \ 78 | awk '{print $2, $1}' | sed "s/\(.*\) \(.*\)/(\1,\2)/" > /tmp/goldenoutput.txt 79 | 80 | echo "Expected output:" 81 | cat /tmp/goldenoutput.txt 82 | 83 | EXIT_CODE=0 84 | if diff /tmp/pigoutput.txt /tmp/goldenoutput.txt; then 85 | echo "Verified correct output." 86 | else 87 | echo "Pig output doesn't match expected output!" 88 | EXIT_CODE=1 89 | fi 90 | 91 | # Cleanup. 92 | echo "Cleaning up test data: ${PARENT_DIR}" 93 | ${HADOOP_CMD} fs -rmr -skipTrash ${PARENT_DIR} 94 | 95 | exit ${EXIT_CODE} 96 | -------------------------------------------------------------------------------- /extensions/querytools/prepare_files.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Places files into expected files; generates a project_properties.sh file 16 | # which other scripts are designed to use. 17 | 18 | set -o nounset 19 | set -o errexit 20 | 21 | mkdir -p ${MASTER_PACKAGE_DIR}/conf/hive 22 | mv hive-site.xml ${MASTER_PACKAGE_DIR}/conf/hive/ 23 | 24 | # Dynamically generated a project_properties.sh file which only contains the 25 | # environment variables which must be derived from existing hadoop deployment 26 | # variables. 27 | cat << EOF >> project_properties.sh 28 | SUPPORTED_HDPTOOLS='hive pig' 29 | ZONE=${GCE_ZONE} 30 | MASTER=${MASTER_HOSTNAME} 31 | HADOOP_HOME=${HADOOP_INSTALL_DIR} 32 | EOF 33 | 34 | # Explicitly set a schemeless working directory, otherwise as of Pig 0.12.0 35 | # PigInputFormat fails to use input paths which are not from the "default" 36 | # FileSystem. No need to clobber existing working-directory settings. 37 | bdconfig merge_configurations \ 38 | --configuration_file ${HADOOP_CONF_DIR}/mapred-site.xml \ 39 | --source_configuration_file pig-mapred-template.xml \ 40 | --resolve_environment_variables \ 41 | --create_if_absent \ 42 | --noclobber 43 | -------------------------------------------------------------------------------- /extensions/querytools/querytools_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This file contains environment-variable overrides to be used in conjunction 16 | # with bdutil_env.sh in order to deploy a Hadoop cluster with Pig and Hive 17 | # installed, using the Cloud Solutions sampleapp. 18 | # Usage: ./bdutil deploy extensions/querytools/querytools_env.sh 19 | 20 | # Set the default filesystem to be 'hdfs' since Pig and Hive will tend to rely 21 | # on multi-stage pipelines more heavily then plain Hadoop MapReduce, and thus 22 | # be vulnerable to eventual list consistency. Okay to read initially from GCS 23 | # using explicit gs:// URIs and likewise to write the final output to GCS, 24 | # letting any intermediate cross-stage items get stored in HDFS temporarily. 25 | DEFAULT_FS='hdfs' 26 | 27 | # URIs of tarballs to install. 28 | PIG_TARBALL_URI='gs://querytools-dist/pig-0.12.0.tar.gz' 29 | HIVE_TARBALL_URI='gs://querytools-dist/hive-0.12.0-bin.tar.gz' 30 | 31 | # Constants normally in project_properties.sh from the sampleapp, but which we 32 | # can propagate out here as shared environment variables instead. 33 | HADOOP_MAJOR_VERSION='1' 34 | HADOOP_USER='hadoop' 35 | HADOOP_GROUP='hadoop' 36 | HDP_USER='hadoop' 37 | HDP_USER_HOME='/home/hadoop' 38 | MASTER_INSTALL_DIR='/home/hadoop' 39 | PACKAGES_DIR='packages' 40 | SCRIPTS_DIR='scripts' 41 | MASTER_PACKAGE_DIR='/tmp/hdp_tools' 42 | HDFS_TMP_DIR='/tmp' 43 | HADOOP_TMP_DIR='/hadoop/tmp' 44 | 45 | # File dependencies to be used by the scripts. 46 | if [[ -n "${BDUTIL_DIR}" ]]; then 47 | UPLOAD_FILES+=( 48 | "${BDUTIL_DIR}/extensions/querytools/pig-mapred-template.xml" 49 | "${BDUTIL_DIR}/sampleapps/querytools/conf/hive/hive-site.xml" 50 | "${BDUTIL_DIR}/sampleapps/querytools/scripts/common_utils.sh" 51 | "${BDUTIL_DIR}/sampleapps/querytools/scripts/package_utils.sh" 52 | ) 53 | fi 54 | COMMAND_GROUPS+=( 55 | "install_querytools: 56 | extensions/querytools/prepare_files.sh 57 | sampleapps/querytools/scripts/setup-packages__at__master.sh 58 | sampleapps/querytools/scripts/setup-hdfs-for-hdtools__at__master.sh 59 | extensions/querytools/setup_profiles.sh 60 | " 61 | ) 62 | 63 | # Querytools installation only needs to run on master. 64 | COMMAND_STEPS+=( 65 | 'install_querytools,*' 66 | ) 67 | -------------------------------------------------------------------------------- /extensions/querytools/setup_profiles.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Sets up login shells to have the "hive" and "pig" binaries in the system PATH 16 | # environment variable. 17 | 18 | add_to_path_at_login "${MASTER_INSTALL_DIR}/pig/bin" 19 | add_to_path_at_login "${MASTER_INSTALL_DIR}/hive/bin" 20 | -------------------------------------------------------------------------------- /extensions/spark/install_shark.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | set -o errexit 16 | 17 | # Figure out which tarball to use based on which Hadoop version is being used. 18 | set +o nounset 19 | HADOOP_BIN="sudo -u hadoop ${HADOOP_INSTALL_DIR}/bin/hadoop" 20 | HADOOP_VERSION=$(${HADOOP_BIN} version | tr -cd [:digit:] | head -c1) 21 | set -o nounset 22 | if [[ "${HADOOP_VERSION}" == '2' ]]; then 23 | SHARK_TARBALL_URI=${SHARK_HADOOP2_TARBALL_URI} 24 | else 25 | SHARK_TARBALL_URI=${SHARK_HADOOP1_TARBALL_URI} 26 | fi 27 | 28 | SHARK_TARBALL=${SHARK_TARBALL_URI##*/} 29 | gsutil cp ${SHARK_TARBALL_URI} /home/hadoop/${SHARK_TARBALL} 30 | tar -C /home/hadoop -xzvf /home/hadoop/${SHARK_TARBALL} 31 | mv /home/hadoop/shark*/ ${SHARK_INSTALL_DIR} 32 | 33 | # Find the Hadoop lib dir so that we can link its gcs-connector into the 34 | # Shark library path. 35 | set +o nounset 36 | if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then 37 | . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" 38 | fi 39 | if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \ 40 | [[ -n "${HADOOP_PREFIX}" ]]; then 41 | LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}" 42 | else 43 | LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib" 44 | fi 45 | set -o nounset 46 | 47 | GCS_JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR}) 48 | LOCAL_GCS_JAR="${LIB_JARS_DIR}/${GCS_JARNAME}" 49 | ln -s ${LOCAL_GCS_JAR} ${SHARK_INSTALL_DIR}/lib/ 50 | 51 | # Calculate the memory allocations, MB, using 'free -m'. Floor to nearest MB. 52 | TOTAL_MEM=$(free -m | awk '/^Mem:/{print $2}') 53 | SHARK_MEM=$(python -c \ 54 | "print int(${TOTAL_MEM} * ${SHARK_MEM_FRACTION})") 55 | 56 | 57 | # Point shark at scala, hadoop, hive, spark, and the spark master. 58 | cat << EOF >> ${SHARK_INSTALL_DIR}/conf/shark-env.sh 59 | export HADOOP_HOME=${HADOOP_INSTALL_DIR} 60 | export SCALA_HOME=${SCALA_INSTALL_DIR} 61 | export SPARK_HOME=${SPARK_INSTALL_DIR} 62 | export SPARK_MEM=${SHARK_MEM}m 63 | 64 | # Set spark master by copying from spark-env.sh 65 | $(grep 'MASTER=' ${SPARK_INSTALL_DIR}/conf/spark-env.sh) 66 | EOF 67 | 68 | # Add the spark 'bin' path to the .bashrc so that it's easy to call 'spark' 69 | # during interactive ssh session. 70 | add_to_path_at_login "${SHARK_INSTALL_DIR}/bin" 71 | 72 | # Assign ownership of everything to the 'hadoop' user. 73 | chown -R hadoop:hadoop /home/hadoop/ 74 | -------------------------------------------------------------------------------- /extensions/spark/spark_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This file contains environment-variable overrides to be used in conjunction 16 | # with bdutil_env.sh in order to deploy a Hadoop + Spark cluster. 17 | # Usage: ./bdutil deploy -e extensions/spark/spark_env.sh 18 | 19 | # An enum of [default|standalone|yarn-client|yarn-cluster]. 20 | # In standalone mode, Spark runs it's own daemons and job submissions are made 21 | # to the master daemon by default. yarn-client and yarn-cluster both run inside 22 | # YARN containers. default preserves Spark's default. 23 | SPARK_MODE="standalone" 24 | 25 | # URIs of tarballs to install. 26 | SCALA_TARBALL_URI='gs://spark-dist/scala-2.10.3.tgz' 27 | SPARK_HADOOP1_TARBALL_URI='gs://spark-dist/spark-1.5.0-bin-hadoop1.tgz' 28 | SPARK_HADOOP2_TARBALL_URI='gs://spark-dist/spark-1.5.0-bin-hadoop2.6.tgz' 29 | 30 | # Directory on each VM in which to install each package. 31 | SCALA_INSTALL_DIR='/home/hadoop/scala-install' 32 | SPARK_INSTALL_DIR='/home/hadoop/spark-install' 33 | 34 | # Worker memory to provide in spark-env.sh, as a fraction of total physical 35 | # memory. In the event of running Spark on YARN the NODEMANAGER_MEMORY_FRACTION 36 | # in hadoop2_env.sh replaces this. 37 | SPARK_WORKER_MEMORY_FRACTION='0.8' 38 | 39 | # Default memory per Spark executor, as a fraction of total physical memory; 40 | # used for default spark-shell if not overridden with a -D option. Can be used 41 | # to accommodate multiple spark-shells on a single cluster, e.g. if this value 42 | # is set to half the value of SPARK_WORKER_MEMORY_FRACTION then two sets of 43 | # executors can run simultaneously. However, in such a case, then at the time 44 | # of starting 'spark-shell' you must specify fewer cores, e.g.: 45 | # SPARK_JAVA_OPTS="-Dspark.cores.max=4" spark-shell 46 | SPARK_EXECUTOR_MEMORY_FRACTION='0.8' 47 | 48 | # Max memory to use by the single Spark daemon process on each node; may need to 49 | # increase when using larger clusters. Expressed as a fraction of total physical 50 | # memory. 51 | SPARK_DAEMON_MEMORY_FRACTION='0.15' 52 | 53 | # Install JDK because certain Spark commands assume jar is installed. 54 | INSTALL_JDK_DEVEL='true' 55 | 56 | # Spark-standalone master UI is on port 8080. 57 | MASTER_UI_PORTS=('8080' ${MASTER_UI_PORTS[@]}) 58 | 59 | COMMAND_GROUPS+=( 60 | "install_spark: 61 | extensions/spark/install_spark.sh 62 | " 63 | "spark_configure_startup: 64 | extensions/spark/spark_configure_startup_processes.sh 65 | " 66 | "start_spark: 67 | extensions/spark/start_spark.sh 68 | " 69 | ) 70 | 71 | # Installation of spark on master and workers; then start_spark only on master. 72 | COMMAND_STEPS+=( 73 | 'install_spark,install_spark' 74 | 'spark_configure_startup,spark_configure_startup' 75 | 'start_spark,*' 76 | ) 77 | -------------------------------------------------------------------------------- /extensions/spark/spark_on_yarn_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This file contains environment-variable overrides to be used in conjunction 16 | # with bdutil_env.sh in order to deploy a Hadoop 2 + Spark on YARN cluster. 17 | # Usage: ./bdutil deploy -e extensions/spark/spark_env.sh 18 | 19 | # Install YARN and Spark 20 | import_env hadoop2_env.sh 21 | import_env extensions/spark/spark_env.sh 22 | 23 | # Clusters must have at least 3 workers to run spark-validate-setup.sh 24 | # and many other Spark jobs. 25 | if [[ -z "${NUM_WORKERS}" ]] || (( ${NUM_WORKERS} < 3 )); then 26 | NUM_WORKERS=3 27 | fi 28 | 29 | # An enum of [default|standalone|yarn-client|yarn-cluster]. 30 | # yarn-client and yarn-cluster both run Spark jobs inside YARN containers 31 | # yarn-cluster also runs the spark-class or spark-submit process inside a 32 | # container, but it cannot support spark-shell, without specifying another 33 | # master. 34 | # e.g. spark-shell --master yarn-client. 35 | SPARK_MODE='yarn-client' 36 | -------------------------------------------------------------------------------- /extensions/spark/spark_shark_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This file contains environment-variable overrides to be used in conjunction 16 | # with bdutil_env.sh in order to deploy a Hadoop + Spark + Shark cluster. 17 | # Usage: ./bdutil deploy -e extensions/spark/spark_shark_env.sh 18 | 19 | import_env extensions/spark/spark_env.sh 20 | 21 | # URIs of tarballs to install. 22 | SHARK_HADOOP1_TARBALL_URI='gs://spark-dist/shark-0.9.1-bin-hadoop1.tgz' 23 | SHARK_HADOOP2_TARBALL_URI='gs://spark-dist/shark-0.9.1-bin-hadoop2.tgz' 24 | # Shark is not compatible with Spark 1.x 25 | SPARK_HADOOP1_TARBALL_URI='gs://spark-dist/spark-0.9.2-bin-hadoop1.tgz' 26 | SPARK_HADOOP2_TARBALL_URI='gs://spark-dist/spark-0.9.2-bin-hadoop2.tgz' 27 | 28 | # Directory on each VM in which to install shark 29 | SHARK_INSTALL_DIR='/home/hadoop/shark-install' 30 | 31 | # Value to give Shark indicating the amount of Spark worker memory 32 | # available/usable by Shark per worker. Expressed as a fraction of total 33 | # physical memory. 34 | SHARK_MEM_FRACTION='0.8' 35 | 36 | COMMAND_GROUPS+=( 37 | "install_shark: 38 | extensions/spark/install_shark.sh 39 | " 40 | ) 41 | 42 | # Installation of shark 43 | COMMAND_STEPS+=( 44 | 'install_shark,install_shark' 45 | ) 46 | -------------------------------------------------------------------------------- /extensions/spark/start_single_spark_worker.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Can be used on an individual Spark worker when running Spark in "standalone" 16 | # mode. Requires all other setup of files, configuration, etc., to be complete 17 | # already. 18 | 19 | set -o errexit 20 | 21 | source hadoop-env-setup.sh 22 | 23 | SPARK_MASTER="spark://${MASTER_HOSTNAME}:7077" 24 | sudo -u hadoop ${SPARK_INSTALL_DIR}/sbin/spark-daemon.sh start \ 25 | org.apache.spark.deploy.worker.Worker 0 ${SPARK_MASTER} 26 | -------------------------------------------------------------------------------- /extensions/spark/start_spark.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | set -o nounset 16 | set -o errexit 17 | 18 | if [[ ${SPARK_MODE} == 'standalone' ]]; then 19 | sudo -u hadoop ${SPARK_INSTALL_DIR}/sbin/start-all.sh 20 | fi 21 | -------------------------------------------------------------------------------- /extensions/storm/README.md: -------------------------------------------------------------------------------- 1 | Deploying Apache Storm on Google Compute Engine 2 | =============================================== 3 | 4 | Basic Usage 5 | ----------- 6 | 7 | Basic installation of [Apache Storm](https://storm.apache.org/) alongside Hadoop on Google Cloud Platform. 8 | 9 | ./bdutil -e extensions/storm/storm_env.sh deploy 10 | 11 | Or alternatively, using shorthand syntax: 12 | 13 | ./bdutil -e storm deploy 14 | 15 | Status 16 | ------ 17 | 18 | This plugin is currently considered experimental and not officially supported. 19 | Contributions are welcome. 20 | -------------------------------------------------------------------------------- /extensions/storm/install_storm.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | set -o errexit 15 | 16 | # Set up Storm 17 | STORM_MASTER_INSTANCE="${MASTER_HOSTNAME}" 18 | 19 | STORM_INSTALL_TMP_DIR="/storm-$(date +%s)" 20 | mkdir -p ${STORM_INSTALL_TMP_DIR} 21 | 22 | STORM_TARBALL_BASENAME=$(grep -o '[^/]*\.tar.gz' <<< ${STORM_TARBALL_URI}) 23 | STORM_LOCAL_TARBALL="${STORM_INSTALL_TMP_DIR}/${STORM_TARBALL_BASENAME}" 24 | download_bd_resource ${STORM_TARBALL_URI} ${STORM_LOCAL_TARBALL} 25 | 26 | tar -C ${STORM_INSTALL_TMP_DIR} -xvzf ${STORM_LOCAL_TARBALL} 27 | mkdir -p $(dirname ${STORM_INSTALL_DIR}) 28 | mv ${STORM_INSTALL_TMP_DIR}/apache-storm*/ ${STORM_INSTALL_DIR} 29 | 30 | STORM_LIB_DIR="${STORM_INSTALL_DIR}/lib" 31 | 32 | if (( ${ENABLE_STORM_BIGTABLE} )); then 33 | GOOGLE_STORM_LIB_DIR="${STORM_INSTALL_DIR}/lib/google" 34 | mkdir -p "${GOOGLE_STORM_LIB_DIR}" 35 | # Download the alpn jar. The Alpn jar should be a fully qualified URL. 36 | # download_bd_resource needs a fully qualified file path and not just a 37 | # directory name to put the file in when the file to download starts with 38 | # http://. 39 | ALPN_JAR_NAME="${ALPN_REMOTE_JAR##*/}" 40 | ALPN_BOOT_JAR="${GOOGLE_STORM_LIB_DIR}/${ALPN_JAR_NAME}" 41 | download_bd_resource "${ALPN_REMOTE_JAR}" "${ALPN_BOOT_JAR}" 42 | fi 43 | 44 | 45 | mkdir -p ${STORM_VAR} 46 | cat << EOF | tee -a ${STORM_INSTALL_DIR}/conf/storm.yaml 47 | storm.zookeeper.servers: 48 | - "${STORM_MASTER_INSTANCE}" 49 | nimbus.host: "${STORM_MASTER_INSTANCE}" 50 | storm.local.dir: "${STORM_VAR}" 51 | supervisor.slots.ports: 52 | - 6700 53 | - 6701 54 | - 6702 55 | - 6703 56 | storm.messaging.transport: 'backtype.storm.messaging.netty.Context' 57 | storm.messaging.netty.server_worker_threads: 1 58 | storm.messaging.netty.client_worker_threads: 1 59 | storm.messaging.netty.buffer_size: 5242880 60 | storm.messaging.netty.max_retries: 100 61 | storm.messaging.netty.max_wait_ms: 1000 62 | storm.messaging.netty.min_wait_ms: 100 63 | 64 | EOF 65 | 66 | if (( ${ENABLE_STORM_BIGTABLE} )); then 67 | cat << EOF | tee -a "${STORM_INSTALL_DIR}/conf/storm.yaml" 68 | worker.childopts: "-Xbootclasspath/p:${ALPN_BOOT_JAR}" 69 | EOF 70 | fi 71 | 72 | # Add the storm 'bin' path to the .bashrc so that it's easy to call 'storm' 73 | # during interactive ssh session. 74 | add_to_path_at_login "${STORM_INSTALL_DIR}/bin" 75 | 76 | # TODO(user): Fix this a better way. 77 | cp /home/hadoop/hadoop-install/lib/gcs-connector*.jar /home/hadoop/storm-install/lib/ 78 | cp /home/hadoop/hadoop-install/hadoop-core*.jar /home/hadoop/storm-install/lib/ 79 | cp /home/hadoop/hadoop-install/lib/commons-configuration*.jar /home/hadoop/storm-install/lib/ 80 | 81 | # Assign ownership of everything to the 'hadoop' user. 82 | chown -R hadoop:hadoop /home/hadoop/ ${STORM_VAR} 83 | -------------------------------------------------------------------------------- /extensions/storm/install_supervisor.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Installs Supervisor using apt-get. 16 | 17 | # Strip the debian mirrors to force only using the GCS mirrors. Not ideal for 18 | # production usage due to stripping security.debian.org, but reduces external 19 | # load for non-critical use cases. 20 | 21 | install_application 'supervisor' 22 | 23 | # No easy way to install supervisor on CentOS and have it configured 24 | if ! [[ -x $(which apt-get) ]] && [[ -x $(which yum) ]]; then 25 | # Install supervisor 26 | yum install -y python-setuptools 27 | easy_install supervisor 28 | mkdir -p /etc/supervisor/conf.d/ 29 | mkdir -p /var/log/supervisor 30 | 31 | # Set up the supervisor configuration 32 | cat > supervisord.conf < ${ZOOKEEPER_INSTALL_DIR}/conf/zoo.cfg 34 | 35 | # Sets the dir locations for the log and tracelog and sets root.logger value to "INFO, ROLLINGFILE" instead of "INFO, CONSOLE" 36 | perl -pi -e 's|^(zookeeper.(?:trace)?log.dir=).*|$1'${ZOOKEEPER_VAR}'/log| ; s|(?<=zookeeper.root.logger=).*|INFO, ROLLINGFILE| ;' \ 37 | ${ZOOKEEPER_INSTALL_DIR}/conf/log4j.properties 38 | 39 | 40 | # Add the zookeeper 'bin' path to the .bashrc so that it's easy to call access 41 | # zookeeper files during interactive ssh session. 42 | add_to_path_at_login "${ZOOKEEPER_INSTALL_DIR}/bin" 43 | 44 | # Assign ownership of everything to the 'hadoop' user. 45 | chown -R hadoop:hadoop /home/hadoop/ ${ZOOKEEPER_VAR} 46 | 47 | # Define Supervisor Configuration for ZooKeeper 48 | cat > /etc/supervisor/conf.d/zookeeper.conf < 2 | 37 | 38 | 39 | 40 | 41 | 42 | %d{yyyy-MM-dd HH:mm:ss} %c{1} [%p] %m%n 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /extensions/storm/start_storm_master.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Define Supervisor Configurations for Storm 16 | cat > /etc/supervisor/conf.d/storm.conf < /etc/supervisor/conf.d/storm.conf <) share of the memory available to 33 | # the NodeManager for containers. Thus an n1-standard-4 with CORES_PER_MAP_TASK 34 | # set to 2 would be able to host 4 / 2 = 2 map containers (and no other 35 | # containers). For more details see the script 'libexec/configure-mrv2-mem.py'. 36 | CORES_PER_MAP_TASK=1.0 37 | 38 | # Decimal number controlling the size of reduce containers in memory and virtual 39 | # cores. See CORES_PER_MAP_TASK for more details. 40 | CORES_PER_REDUCE_TASK=2.0 41 | 42 | # Decimal number controlling the size of application master containers in memory 43 | # and virtual cores. See CORES_PER_MAP_TASK for more details. 44 | CORES_PER_APP_MASTER=2.0 45 | 46 | # Connector with Hadoop AbstractFileSystem implemenation for YARN 47 | GCS_CONNECTOR_JAR='https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-1.6.2-hadoop2.jar' 48 | 49 | BIGQUERY_CONNECTOR_JAR='https://storage.googleapis.com/hadoop-lib/bigquery/bigquery-connector-0.10.3-hadoop2.jar' 50 | 51 | 52 | HDFS_DATA_DIRS_PERM='700' 53 | 54 | # 8088 for YARN, 50070 for HDFS. 55 | MASTER_UI_PORTS=('8088' '50070') 56 | 57 | # Allow to tune the YARN scheduler to 58 | YARN_SCHEDULER_CAPACITY_MAXIMUM_APPLICATIONS=10000 59 | YARN_SCHEDULER_CAPACITY_MAX_AM_PERCENT=0.2 60 | 61 | # Use Hadoop 2 specific configuration templates. 62 | if [[ -n "${BDUTIL_DIR}" ]]; then 63 | UPLOAD_FILES=($(find ${BDUTIL_DIR}/conf/hadoop2 -name '*template.xml')) 64 | UPLOAD_FILES+=("${BDUTIL_DIR}/libexec/hadoop_helpers.sh") 65 | UPLOAD_FILES+=("${BDUTIL_DIR}/libexec/configure_mrv2_mem.py") 66 | fi 67 | 68 | # Use Hadoop 2 specific start scripts 69 | COMMAND_GROUPS+=( 70 | 'deploy_start2: 71 | libexec/start_hadoop2.sh' 72 | ) 73 | 74 | COMMAND_STEPS=( 75 | "deploy-ssh-master-setup,*" 76 | 'deploy-core-setup,deploy-core-setup' 77 | "*,deploy-ssh-worker-setup" 78 | "deploy-master-nfs-setup,*", 79 | "deploy-client-nfs-setup,deploy-client-nfs-setup", 80 | 'deploy_start2,*' 81 | ) 82 | -------------------------------------------------------------------------------- /libexec/configure_hdfs.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Configures HDFS 16 | 17 | set -e 18 | 19 | source hadoop_helpers.sh 20 | 21 | if (( ${ENABLE_HDFS} )); then 22 | 23 | HDFS_ADMIN=$(get_hdfs_superuser) 24 | 25 | # Location of HDFS metadata on namenode 26 | export HDFS_NAME_DIR=/hadoop/dfs/name 27 | 28 | # If disks are mounted use all of them for HDFS data 29 | MOUNTED_DISKS=($(find /mnt -maxdepth 1 -mindepth 1)) 30 | if [[ ${#MOUNTED_DISKS[@]} -eq 0 ]]; then 31 | MOUNTED_DISKS=('') 32 | fi 33 | 34 | # Location of HDFS data blocks on datanodes; for each mounted disk, add the 35 | # path /mnt/diskname/hadoop/dfs/data as a data directory, or if no mounted 36 | # disks exist, just go with the absolute path /hadoop/dfs/data. 37 | HDFS_DATA_DIRS="${MOUNTED_DISKS[@]/%//hadoop/dfs/data}" 38 | 39 | # Do not create HDFS_NAME_DIR, or Hadoop will think it is already formatted 40 | mkdir -p /hadoop/dfs ${HDFS_DATA_DIRS} 41 | 42 | chown ${HDFS_ADMIN}:hadoop -L -R /hadoop/dfs ${HDFS_DATA_DIRS} 43 | 44 | # Make sure the data dirs have the expected permissions. 45 | chmod ${HDFS_DATA_DIRS_PERM} ${HDFS_DATA_DIRS} 46 | 47 | # Set general Hadoop environment variables 48 | 49 | # Calculate the memory allocations, MB, using 'free -m'. Floor to nearest MB. 50 | TOTAL_MEM=$(free -m | awk '/^Mem:/{print $2}') 51 | NAMENODE_MEM_MB=$(python -c "print int(${TOTAL_MEM} * \ 52 | ${HDFS_MASTER_MEMORY_FRACTION} / 2)") 53 | SECONDARYNAMENODE_MEM_MB=${NAMENODE_MEM_MB} 54 | 55 | cat << EOF >> ${HADOOP_CONF_DIR}/hadoop-env.sh 56 | 57 | # Increase the maximum NameNode / SecondaryNameNode heap. 58 | HADOOP_NAMENODE_OPTS="-Xmx${NAMENODE_MEM_MB}m \${HADOOP_NAMENODE_OPTS}" 59 | HADOOP_SECONDARYNAMENODE_OPTS="-Xmx${SECONDARYNAMENODE_MEM_MB}m \${HADOOP_SECONDARYNAMENODE_OPTS}" 60 | EOF 61 | 62 | # Increase maximum number of files for HDFS 63 | MAX_FILES=16384 64 | ulimit -n ${MAX_FILES} 65 | cat << EOF > /etc/security/limits.d/hadoop.conf 66 | ${HDFS_ADMIN} hard nofile ${MAX_FILES} 67 | ${HDFS_ADMIN} soft nofile ${MAX_FILES} 68 | EOF 69 | 70 | export HDFS_DATA_DIRS="${HDFS_DATA_DIRS// /,}" 71 | 72 | bdconfig merge_configurations \ 73 | --configuration_file ${HADOOP_CONF_DIR}/hdfs-site.xml \ 74 | --source_configuration_file hdfs-template.xml \ 75 | --resolve_environment_variables \ 76 | --create_if_absent \ 77 | --clobber 78 | fi 79 | -------------------------------------------------------------------------------- /libexec/install_and_configure_bigquery_connector.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Downloads and installs the relevant bigquery-connector-.jar. 16 | # Also configures it for use with hadoop. 17 | 18 | set -e 19 | 20 | if (( ${INSTALL_BIGQUERY_CONNECTOR} )); then 21 | if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then 22 | . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" 23 | fi 24 | if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \ 25 | [[ -n "${HADOOP_PREFIX}" ]]; then 26 | LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}" 27 | else 28 | LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib" 29 | fi 30 | 31 | 32 | # Grab the connector jarfile, add it to installation /lib directory. 33 | JARNAME=$(grep -o '[^/]*\.jar' <<< ${BIGQUERY_CONNECTOR_JAR}) 34 | LOCAL_JAR="${LIB_JARS_DIR}/${JARNAME}" 35 | 36 | download_bd_resource "${BIGQUERY_CONNECTOR_JAR}" "${LOCAL_JAR}" 37 | 38 | chown hadoop:hadoop ${LOCAL_JAR} 39 | 40 | echo "export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:${LOCAL_JAR}" \ 41 | >> ${HADOOP_CONF_DIR}/hadoop-env.sh 42 | 43 | bdconfig merge_configurations \ 44 | --configuration_file ${HADOOP_CONF_DIR}/mapred-site.xml \ 45 | --source_configuration_file bq-mapred-template.xml \ 46 | --resolve_environment_variables \ 47 | --create_if_absent \ 48 | --noclobber 49 | 50 | chown -R hadoop:hadoop ${HADOOP_CONF_DIR} 51 | fi 52 | -------------------------------------------------------------------------------- /libexec/install_and_configure_gcs_connector.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved.D 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Downloads and installs the relevant gcs-connector-.jar. 16 | # Also configures it for use with hadoop. 17 | 18 | if (( ${INSTALL_GCS_CONNECTOR} )) ; then 19 | 20 | if [[ -r "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" ]]; then 21 | . "${HADOOP_INSTALL_DIR}/libexec/hadoop-config.sh" 22 | fi 23 | if [[ -n "${HADOOP_COMMON_LIB_JARS_DIR}" ]] && \ 24 | [[ -n "${HADOOP_PREFIX}" ]]; then 25 | LIB_JARS_DIR="${HADOOP_PREFIX}/${HADOOP_COMMON_LIB_JARS_DIR}" 26 | else 27 | LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib" 28 | fi 29 | 30 | # Grab the connector jarfile, add it to installation /lib directory. 31 | JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR}) 32 | LOCAL_JAR="${LIB_JARS_DIR}/${JARNAME}" 33 | 34 | download_bd_resource "${GCS_CONNECTOR_JAR}" "${LOCAL_JAR}" 35 | 36 | echo "export HADOOP_CLASSPATH=\$HADOOP_CLASSPATH:${LOCAL_JAR}" \ 37 | >> ${HADOOP_CONF_DIR}/hadoop-env.sh 38 | 39 | if (( ${ENABLE_NFS_GCS_FILE_CACHE} )); then 40 | export GCS_METADATA_CACHE_TYPE='FILESYSTEM_BACKED' 41 | export GCS_FILE_CACHE_DIRECTORY="$(get_nfs_mount_point)" 42 | else 43 | export GCS_METADATA_CACHE_TYPE='IN_MEMORY' 44 | # For IN_MEMORY cache, this directory won't actually be used, but we set 45 | # it to a sane default for easy manual experimentation of file caching. 46 | export GCS_FILE_CACHE_DIRECTORY='/tmp/gcs_connector_metadata_cache' 47 | fi 48 | bdconfig merge_configurations \ 49 | --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \ 50 | --source_configuration_file gcs-core-template.xml \ 51 | --resolve_environment_variables \ 52 | --create_if_absent \ 53 | --noclobber 54 | 55 | # Install a script that can be used to cleanup filesystem-based GCS caches. 56 | if [[ "$(hostname -s)" == "${MASTER_HOSTNAME}" \ 57 | && "${ENABLE_NFS_GCS_FILE_CACHE}" -ne 0 ]] ; then 58 | setup_cache_cleaner 59 | fi 60 | fi 61 | -------------------------------------------------------------------------------- /libexec/install_bdconfig.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Downloads and installs bdconfig and the xml templates 16 | 17 | set -e 18 | 19 | # Download and use bdconfig for xml configuration. 20 | if [[ ! -f "$(which bdconfig)" ]]; then 21 | download_bd_resource "${BDCONFIG}" /tmp/bdconfig.tar.gz 22 | mkdir -p /usr/local/share/google 23 | tar -C /usr/local/share/google -xzf /tmp/bdconfig.tar.gz 24 | ln -s /usr/local/share/google/bdconfig*/bdconfig /usr/local/bin 25 | fi 26 | -------------------------------------------------------------------------------- /libexec/install_java.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Installs the OpenJDK Java7 JRE using apt-get. 16 | 17 | # Strip the debian mirrors to force only using the GCS mirrors. Not ideal for 18 | # production usage due to stripping security.debian.org, but reduces external 19 | # load for non-critical use cases. 20 | 21 | if (( ${INSTALL_JDK_DEVEL} )); then 22 | echo 'Installing JDK with compiler and tools' 23 | install_application "openjdk-7-jdk" "java-1.7.0-openjdk-devel" 24 | else 25 | echo 'Installing minimal JRE' 26 | install_application "openjdk-7-jre-headless" "java-1.7.0-openjdk" 27 | fi 28 | -------------------------------------------------------------------------------- /libexec/mount_disks.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Mounts any attached persistent and ephemeral disks non-boot disks 16 | 17 | set -e 18 | 19 | # Get a list of disks from the metadata server. 20 | BASE_DISK_URL='http://metadata.google.internal/computeMetadata/v1/instance/disks/' 21 | MOUNT_TOOL_URL='https://raw.githubusercontent.com/GoogleCloudPlatform/compute-image-packages/legacy/google-startup-scripts/usr/share/google/safe_format_and_mount' 22 | DISK_PATHS=$(curl_v1_metadata "${BASE_DISK_URL}") 23 | MOUNTED_DISKS=() 24 | 25 | MOUNT_TOOL=/tmp/${MOUNT_TOOL_URL##*/} 26 | download_bd_resource ${MOUNT_TOOL_URL} ${MOUNT_TOOL} 27 | chmod a+x ${MOUNT_TOOL} 28 | 29 | for DISK_PATH in ${DISK_PATHS}; do 30 | # Use the metadata server to determine the official index/name of each disk. 31 | DISK_NAME=$(curl_v1_metadata "${BASE_DISK_URL}${DISK_PATH}device-name") 32 | DISK_INDEX=$(curl_v1_metadata "${BASE_DISK_URL}${DISK_PATH}index") 33 | DISK_TYPE=$(curl_v1_metadata "${BASE_DISK_URL}${DISK_PATH}type") 34 | 35 | # Index '0' is the boot disk and is thus already mounted. 36 | if [[ "${DISK_INDEX}" == '0' ]]; then 37 | echo "Boot disk is ${DISK_NAME}; will not attempt to mount it." 38 | continue 39 | fi 40 | 41 | if [[ "${DISK_TYPE}" == 'EPHEMERAL' ]]; then 42 | DISK_PREFIX='ed' 43 | elif [[ "${DISK_TYPE}" == 'PERSISTENT' ]]; then 44 | DISK_PREFIX='pd' 45 | fi 46 | 47 | # The metadata-specified 'name' can be converted to a disk 'id' by prepending 48 | # 'google-' and finding it under /dev/disk/by-id. 49 | DISK_ID="/dev/disk/by-id/google-${DISK_NAME}" 50 | echo "Resolved disk name '${DISK_NAME}' to expected path '${DISK_ID}'." 51 | 52 | # We will name the mount-point after the official 'disk index'; this means 53 | # there will be no mounted disk with suffix '0' since '0' is the boot disk. 54 | DATAMOUNT="/mnt/${DISK_PREFIX}${DISK_INDEX}" 55 | mkdir -p ${DATAMOUNT} 56 | MOUNTED_DISKS+=(${DATAMOUNT}) 57 | echo "Mounting '${DISK_ID}' under mount point '${DATAMOUNT}'..." 58 | 59 | ${MOUNT_TOOL} -m 'mkfs.ext4 -F' ${DISK_ID} ${DATAMOUNT} 60 | 61 | # Idempotently update /etc/fstab 62 | if cut -d '#' -f 1 /etc/fstab | grep -qvw ${DATAMOUNT}; then 63 | DISK_UUID=$(blkid ${DISK_ID} -s UUID -o value) 64 | MOUNT_ENTRY=($(grep -w ${DATAMOUNT} /proc/mounts)) 65 | # Taken from /usr/share/google/safe_format_and_mount 66 | MOUNT_OPTIONS='defaults,discard' 67 | echo "UUID=${DISK_UUID} ${MOUNT_ENTRY[@]:1:2} ${MOUNT_OPTIONS} 0 2 \ 68 | # added by bdutil" >> /etc/fstab 69 | fi 70 | done 71 | 72 | # If disks are mounted use the first one to hold target of symlink /hadoop 73 | if (( ${#MOUNTED_DISKS[@]} )); then 74 | MOUNTED_HADOOP_DIR=${MOUNTED_DISKS[0]}/hadoop 75 | mkdir -p ${MOUNTED_HADOOP_DIR} 76 | if [[ ! -d /hadoop ]]; then 77 | ln -s ${MOUNTED_HADOOP_DIR} /hadoop 78 | fi 79 | fi 80 | -------------------------------------------------------------------------------- /libexec/set_default_fs.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Sets the default file system for Hadoop. 16 | 17 | set -e 18 | 19 | # Set FS specific config variables 20 | if [[ "${DEFAULT_FS}" == 'gs' ]]; then 21 | DEFAULT_FS_NAME="gs://${CONFIGBUCKET}/" 22 | elif [[ "${DEFAULT_FS}" == 'hdfs' ]]; then 23 | DEFAULT_FS_NAME="${NAMENODE_URI}" 24 | fi 25 | 26 | bdconfig set_property \ 27 | --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \ 28 | --name 'fs.default.name' \ 29 | --value ${DEFAULT_FS_NAME} \ 30 | --clobber 31 | 32 | bdconfig set_property \ 33 | --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \ 34 | --name 'fs.defaultFS' \ 35 | --value ${DEFAULT_FS_NAME} \ 36 | --clobber 37 | -------------------------------------------------------------------------------- /libexec/setup_client_nfs.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | if (( ${INSTALL_GCS_CONNECTOR} )) && \ 16 | (( ${ENABLE_NFS_GCS_FILE_CACHE} )) ; then 17 | # Set up the GCS_ADMIN user. 18 | setup_gcs_admin 19 | 20 | install_application "nfs-common" "nfs-utils" 21 | install_application "autofs" 22 | 23 | NFS_MOUNT_POINT="$(get_nfs_mount_point)" 24 | NFS_EXPORT_POINT="$(get_nfs_export_point)" 25 | 26 | mkdir -p "${NFS_MOUNT_POINT}" 27 | chown ${GCS_ADMIN}:${GCS_ADMIN} "${NFS_MOUNT_POINT}" 28 | if ! grep -e "auto.hadoop_gcs_metadata_cache" /etc/auto.master ; then 29 | echo "/- /etc/auto.hadoop_gcs_metadata_cache nobind" >> /etc/auto.master 30 | fi 31 | 32 | MOUNT_STRING="/${NFS_MOUNT_POINT} -fstype=nfs,defaults,rw,hard,intr" 33 | MOUNT_STRING="${MOUNT_STRING} ${GCS_CACHE_MASTER_HOSTNAME}:${NFS_EXPORT_POINT}" 34 | echo "${MOUNT_STRING}" > /etc/auto.hadoop_gcs_metadata_cache 35 | 36 | if [[ -f /usr/lib/systemd/system/autofs.service ]] \ 37 | && which systemctl ; then 38 | systemctl enable autofs 39 | fi 40 | 41 | service autofs restart 42 | fi 43 | -------------------------------------------------------------------------------- /libexec/setup_hadoop_user.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Adds a new 'hadoop' user which will be used to run the hadoop servers. 16 | 17 | set -e 18 | 19 | mkdir -p /home/hadoop 20 | mkdir -p /home/hadoop/.ssh 21 | 22 | if ! (id -u hadoop >& /dev/null); then 23 | useradd --system --shell /bin/bash -M --home /home/hadoop --user-group hadoop 24 | fi 25 | 26 | if skeleton_files=$(find /etc/skel/ -maxdepth 1 -type f); then 27 | cp ${skeleton_files} /home/hadoop 28 | fi 29 | 30 | chown -R hadoop:hadoop /home/hadoop 31 | 32 | mkdir -p ~hadoop/.ssh 33 | chown -R hadoop:hadoop ~hadoop/.ssh/ 34 | 35 | if [[ -x $(which restorecon) ]]; then 36 | restorecon -Rv /home 37 | fi 38 | 39 | mkdir -p /var/log/hadoop 40 | chown hadoop:hadoop /var/log/hadoop 41 | -------------------------------------------------------------------------------- /libexec/setup_master_ssh.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Sets up ssh keys on the master and then uploads them to a GCS CONFIGBUCKET 16 | # for worker to later download. 17 | 18 | set -e 19 | 20 | mkdir -p /home/hadoop/.ssh/ 21 | chmod 700 /home/hadoop/.ssh 22 | 23 | PRIVATE_KEY_NAME='hadoop_master_id_rsa' 24 | PUBLIC_KEY_NAME="${PRIVATE_KEY_NAME}.pub" 25 | LOCAL_PUBLIC_KEY="/home/hadoop/.ssh/${PUBLIC_KEY_NAME}" 26 | REMOTE_PUBLIC_KEY="${BDUTIL_GCS_STAGING_DIR}/${PUBLIC_KEY_NAME}" 27 | LOCAL_PRIVATE_KEY="/home/hadoop/.ssh/${PRIVATE_KEY_NAME}" 28 | 29 | ssh-keygen -N "" -f ${LOCAL_PRIVATE_KEY} 30 | 31 | # Authorize ssh into self as well, in case the master is also a worker node. 32 | cat ${LOCAL_PUBLIC_KEY} >> /home/hadoop/.ssh/authorized_keys 33 | 34 | echo "Host ${PREFIX}*" >> /home/hadoop/.ssh/config 35 | echo " IdentityFile ${LOCAL_PRIVATE_KEY}" >> /home/hadoop/.ssh/config 36 | echo ' UserKnownHostsFile /dev/null' >> /home/hadoop/.ssh/config 37 | echo ' CheckHostIP no' >> /home/hadoop/.ssh/config 38 | echo ' StrictHostKeyChecking no' >> /home/hadoop/.ssh/config 39 | 40 | gsutil cp ${LOCAL_PUBLIC_KEY} ${REMOTE_PUBLIC_KEY} 41 | -------------------------------------------------------------------------------- /libexec/setup_worker_ssh.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Downloads shared ssh keys previously generated by the hadoop master and 16 | # uses them to configure intra-cluster ssh access. 17 | 18 | set -e 19 | 20 | mkdir -p ~hadoop/.ssh/ 21 | 22 | PRIVATE_KEY_NAME='hadoop_master_id_rsa' 23 | PUBLIC_KEY_NAME="${PRIVATE_KEY_NAME}.pub" 24 | LOCAL_PUBLIC_KEY="/home/hadoop/.ssh/${PUBLIC_KEY_NAME}" 25 | REMOTE_PUBLIC_KEY="${BDUTIL_GCS_STAGING_DIR}/${PUBLIC_KEY_NAME}" 26 | 27 | gsutil cp ${REMOTE_PUBLIC_KEY} ${LOCAL_PUBLIC_KEY} 28 | cat ${LOCAL_PUBLIC_KEY} >> ~hadoop/.ssh/authorized_keys 29 | 30 | echo "Host ${PREFIX}*" >> ~hadoop/.ssh/config 31 | echo ' UserKnownHostsFile /dev/null' >> ~hadoop/.ssh/config 32 | echo ' CheckHostIP no' >> ~hadoop/.ssh/config 33 | echo ' StrictHostKeyChecking no' >> ~hadoop/.ssh/config 34 | 35 | chown -R hadoop:hadoop ~hadoop/.ssh/ 36 | chmod 700 ~hadoop/.ssh 37 | -------------------------------------------------------------------------------- /libexec/start_hadoop.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Starts relevant hadoop daemon servers as the 'hadoop' user. 16 | set -e 17 | 18 | source hadoop_helpers.sh 19 | 20 | HADOOP_PORTS=(50010 50020 50030 50060 50070 50075 50090) 21 | 22 | cd ${HADOOP_INSTALL_DIR} 23 | 24 | # Test for sshability to workers. 25 | for NODE in ${WORKERS[@]}; do 26 | sudo -u hadoop ssh ${NODE} "exit 0" 27 | done 28 | 29 | # Wait for our ports to be free, but keep running even if not. 30 | wait_until_ports_free_and_report "${HADOOP_PORTS[@]}" || true 31 | 32 | # Start namenode and jobtracker 33 | if (( ${ENABLE_HDFS} )); then 34 | start_with_retry_namenode start_dfs_hadoop_1 & 35 | fi 36 | start_with_retry_jobtracker & 37 | for SUBPROC in $(jobs -p); do 38 | wait ${SUBPROC} 39 | done 40 | 41 | check_filesystem_accessibility 42 | -------------------------------------------------------------------------------- /libexec/start_hadoop2.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Starts relevant hadoop daemon servers as the 'hadoop' user. 16 | 17 | set -e 18 | 19 | source hadoop_helpers.sh 20 | 21 | HADOOP_PORTS=(8088 50010 50020 50070 50090) 22 | 23 | cd ${HADOOP_INSTALL_DIR} 24 | 25 | # Test for sshability to workers. 26 | for NODE in ${WORKERS[@]}; do 27 | sudo -u hadoop ssh ${NODE} "exit 0" 28 | done 29 | 30 | # Wait for our ports to be free, but keep running even if not. 31 | wait_until_ports_free_and_report "${HADOOP_PORTS[@]}" || true 32 | 33 | if (( ${ENABLE_HDFS} )); then 34 | # Start namenode and jobtracker 35 | start_with_retry_namenode start_dfs_hadoop_2 36 | 37 | if [[ "${DEFAULT_FS}" == 'hdfs' ]]; then 38 | # Set up HDFS /tmp and /user dirs 39 | initialize_hdfs_dirs 40 | fi 41 | fi 42 | 43 | # Start up resource and node managers 44 | sudo -u hadoop ./sbin/start-yarn.sh 45 | service hadoop-mapreduce-historyserver start 46 | 47 | check_filesystem_accessibility 48 | -------------------------------------------------------------------------------- /platforms/cdh/README.md: -------------------------------------------------------------------------------- 1 | Deploying Cloudera Data Hub (CDH) on Google Compute Engine 2 | ========================================================== 3 | 4 | Basic Usage 5 | ----------- 6 | 7 | This plugin replaces the vanilla Apache binary tarballs with [Cloudera Data Hub](http://www.cloudera.com/content/cloudera/en/products-and-services/cdh.html) packages. Cluster configuration is the same as in core bdutil. 8 | 9 | ./bdutil -e platforms/cdh/cdh_env.sh deploy 10 | 11 | Or alternatively, using shorthand syntax: 12 | 13 | ./bdutil -e cdh deploy 14 | 15 | Status 16 | ------ 17 | 18 | This plugin is currently considered experimental and not officially supported. 19 | Contributions are welcome. 20 | -------------------------------------------------------------------------------- /platforms/cdh/cdh-core-template.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hadoop.proxyuser.hue.hosts 6 | * 7 | 8 | 9 | hadoop.proxyuser.hue.groups 10 | * 11 | 12 | 13 | hadoop.proxyuser.oozie.hosts 14 | * 15 | 16 | 17 | hadoop.proxyuser.oozie.groups 18 | * 19 | 20 | 21 | -------------------------------------------------------------------------------- /platforms/cdh/cdh_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Extension file for deploying CDH with bdutil 16 | 17 | # Requies Hadoop 2 libraries (for recent versions at least). 18 | import_env hadoop2_env.sh 19 | 20 | # Change these. 21 | CDH_VERSION=5 22 | # Components are installed / started in the order they are listed. 23 | MASTER_COMPONENTS="hadoop-hdfs-namenode hadoop-hdfs-secondarynamenode 24 | hadoop-yarn-resourcemanager hadoop-mapreduce-historyserver 25 | hive-metastore hive pig oozie hue" 26 | DATANODE_COMPONENTS="hadoop-hdfs-datanode hadoop-yarn-nodemanager 27 | hadoop-mapreduce" 28 | 29 | # Install JDK with compiler/tools instead of just the minimal JRE. 30 | INSTALL_JDK_DEVEL=true 31 | 32 | # Hardware configuration. 33 | NUM_WORKERS=4 34 | WORKER_ATTACHED_PDS_SIZE_GB=1500 35 | MASTER_ATTACHED_PD_SIZE_GB=1500 36 | 37 | # Don't change these. 38 | HADOOP_CONF_DIR='/etc/hadoop/conf' 39 | HADOOP_INSTALL_DIR='/usr/lib/hadoop' 40 | DEFAULT_FS='hdfs' 41 | UPLOAD_FILES+=('platforms/cdh/cdh-core-template.xml') 42 | USE_ATTACHED_PDS=true 43 | 44 | COMMAND_GROUPS+=( 45 | "deploy-cdh: 46 | libexec/mount_disks.sh 47 | libexec/install_java.sh 48 | platforms/cdh/install_cdh.sh 49 | libexec/install_bdconfig.sh 50 | libexec/configure_hadoop.sh 51 | libexec/install_and_configure_gcs_connector.sh 52 | libexec/configure_hdfs.sh 53 | libexec/set_default_fs.sh 54 | platforms/cdh/configure_cdh.sh" 55 | 56 | "restart_services: 57 | platforms/restart_services.sh" 58 | ) 59 | 60 | COMMAND_STEPS=( 61 | 'deploy-cdh,deploy-cdh' 62 | 'deploy-master-nfs-setup,*' 63 | 'deploy-client-nfs-setup,deploy-client-nfs-setup' 64 | 'restart_services,restart_services' 65 | ) 66 | -------------------------------------------------------------------------------- /platforms/cdh/configure_cdh.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2014 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS-IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Misc configurations for components not installed elsewhere. 17 | # Not necessarily CDH specific. 18 | 19 | # Use FQDNs 20 | grep ${HOSTNAME} -lR ${HADOOP_CONF_DIR} \ 21 | | xargs -r sed -i "s/${HOSTNAME}/$(hostname --fqdn)/g" 22 | 23 | # Configure Hive Metastore 24 | if dpkg -s hive-metastore > /dev/null; then 25 | # Configure Hive metastorea 26 | bdconfig set_property \ 27 | --configuration_file /etc/hive/conf/hive-site.xml \ 28 | --name 'hive.metastore.uris' \ 29 | --value "thrift://$(hostname --fqdn):9083" \ 30 | --clobber 31 | fi 32 | 33 | # Configure Hue 34 | if dpkg -s hue > /dev/null; then 35 | # Replace localhost with hostname. 36 | sed -i "s/#*\([^#]*=.*\)localhost/\1$(hostname --fqdn)/" /etc/hue/conf/hue.ini 37 | fi 38 | 39 | # Configure Oozie 40 | if dpkg -s oozie > /dev/null; then 41 | sudo -u oozie /usr/lib/oozie/bin/ooziedb.sh create -run 42 | 43 | # Try to enable gs:// paths 44 | bdconfig set_property \ 45 | --configuration_file /etc/oozie/conf/oozie-site.xml \ 46 | --name 'oozie.service.HadoopAccessorService.supported.filesystems' \ 47 | --value 'hdfs,gs,webhdfs,hftp' \ 48 | --clobber 49 | fi 50 | 51 | # Enable WebHDFS 52 | bdconfig set_property \ 53 | --configuration_file ${HADOOP_CONF_DIR}/hdfs-site.xml \ 54 | --name 'dfs.webhdfs.enabled' \ 55 | --value true \ 56 | --clobber 57 | 58 | # Enable Hue / Oozie impersonation 59 | bdconfig merge_configurations \ 60 | --configuration_file ${HADOOP_CONF_DIR}/core-site.xml \ 61 | --source_configuration_file cdh-core-template.xml \ 62 | --resolve_environment_variables \ 63 | --clobber 64 | -------------------------------------------------------------------------------- /platforms/cdh/install_cdh.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2014 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS-IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | #TODO(user) support other Linux distributions. 17 | ARCHIVE_URL="http://archive.cloudera.com/cdh${CDH_VERSION}/debian/jessie/amd64/cdh" 18 | cat << EOF > /etc/apt/sources.list.d/cloudera.list 19 | deb ${ARCHIVE_URL} jessie-cdh${CDH_VERSION} contrib 20 | deb-src ${ARCHIVE_URL} jessie-cdh${CDH_VERSION} contrib 21 | EOF 22 | # TODO(user): fix insecure download of apt-key. 23 | download_bd_resource ${ARCHIVE_URL}/archive.key /tmp/cloudera.key 24 | apt-key add /tmp/cloudera.key 25 | 26 | apt-get update 27 | 28 | if [[ $(hostname -s) == ${MASTER_HOSTNAME} ]]; then 29 | COMPONENTS="${MASTER_COMPONENTS}" 30 | else 31 | COMPONENTS="${DATANODE_COMPONENTS}" 32 | fi 33 | 34 | for COMPONENT in ${COMPONENTS}; do 35 | if ! install_application ${COMPONENT}; then 36 | # Check that it was actually installed as Services often fail to start. 37 | dpkg -s ${COMPONENT} 38 | fi 39 | # Stop installed services: 40 | if [[ -x "/etc/init.d/${COMPONENT}" ]]; then 41 | service ${COMPONENT} stop 42 | fi 43 | done 44 | -------------------------------------------------------------------------------- /platforms/hdp/TEST.md: -------------------------------------------------------------------------------- 1 | ## Prep 2 | 3 | ``` 4 | CONFIGBUCKET=hdp-00 5 | PROJECT=hdp-00 6 | switches="-b ${CONFIGBUCKET} -p ${PROJECT}" 7 | 8 | # add this to make it a smaller test than the defaults 9 | switches+=" 10 | --master_attached_pd_size_gb 100 11 | --worker_attached_pds_size_gb 100 12 | -n 1 13 | -m n1-standard-2" 14 | 15 | 16 | bdutil="./bdutil ${switches}" 17 | ``` 18 | 19 | ## Test ambari_env.sh 20 | 21 | ``` 22 | environment=platforms/hdp/ambari_env.sh 23 | bdutil="${bdutil} -e ${environment}" 24 | 25 | ## deploy 26 | ${bdutil} deploy 27 | 28 | ## test 29 | ${bdutil} shell < ./hadoop-validate-setup.sh 30 | ${bdutil} shell < ./hadoop-validate-gcs.sh 31 | ${bdutil} shell < ./extensions/querytools/hive-validate-setup.sh 32 | ${bdutil} shell < ./extensions/querytools/pig-validate-setup.sh 33 | #${bdutil} shell < ./extensions/spark/spark-validate-setup.sh 34 | 35 | ## delete 36 | ${bdutil} delete 37 | ``` 38 | 39 | 40 | ## Test ambari_manual_env.sh 41 | 42 | ``` 43 | environment=platforms/hdp/ambari_manual_env.sh 44 | bdutil="${bdutil} -e ${environment}" 45 | 46 | ## deploy 47 | ${bdutil} deploy 48 | 49 | ## test 50 | # need to add an automated test here: 51 | ${bdutil} shell # do something here like check the appropriate number of hosts in /api/v1/hosts 52 | 53 | ## delete 54 | ${bdutil} delete 55 | 56 | ``` 57 | 58 | ## Test re-using disks across multiple deployments of same instance count 59 | 60 | ``` 61 | environment=platforms/hdp/ambari_env.sh 62 | bdutil="${bdutil} -e ${environment}" 63 | unset CREATE_ATTACHED_PDS_ON_DEPLOY 64 | unset DELETE_ATTACHED_PDS_ON_DELETE 65 | 66 | ## create 67 | export CREATE_ATTACHED_PDS_ON_DEPLOY=true 68 | ${bdutil} deploy 69 | 70 | ## generate some data onto HDFS, and dont’ delete it 71 | echo "hadoop fs -mkdir redeploy-validation.tmp" | ${bdutil} shell 72 | ## if you want more data than that: 73 | #${bdutil} -u hadoop-validate-setup.sh run_command -- \ 74 | # sudo -u "$(whoami)" TERA_CLEANUP_SKIP=true TERA_GEN_NUM_RECORDS=100000 ./hadoop-validate-setup.sh 75 | 76 | ## check that the ‘validate_...’ dir is there 77 | echo "hadoop fs -ls" | ${bdutil} shell 78 | 79 | ## delete the cluster but keep disks 80 | export DELETE_ATTACHED_PDS_ON_DELETE=false 81 | ${bdutil} delete 82 | 83 | ## create with existing disks 84 | export CREATE_ATTACHED_PDS_ON_DEPLOY=false 85 | ${bdutil} deploy 86 | 87 | ## check that the ‘validate_...’ dir is there 88 | echo "hadoop fs -ls" | ${bdutil} shell 89 | 90 | ## delete everything to cleanup this testing 91 | export DELETE_ATTACHED_PDS_ON_DELETE=true 92 | ${bdutil} delete 93 | ``` 94 | -------------------------------------------------------------------------------- /platforms/hdp/ambari.conf: -------------------------------------------------------------------------------- 1 | ######################################################################## 2 | ######################################################################## 3 | ## This is the base configuration file for the ## 4 | ## Hortonworks Data Platform (HDP) extension to Google's `bdutil` ## 5 | ## ## 6 | ## Most of the values are commented out and just shown here for ## 7 | ## completeness, together with their default value. ## 8 | ######################################################################## 9 | ######################################################################## 10 | 11 | ## ambari.conf 12 | ## Provides configuration for 'bdutil' installations of Ambari 13 | 14 | 15 | ## bdutil setting overrides 16 | ## For further details see: 17 | ## `bdutil_env.sh` 18 | ## https://cloud.google.com/hadoop/setting-up-a-hadoop-cluster 19 | 20 | ## Your Google Cloud Platform configbucket & project 21 | ## Must be set here, 22 | ## or in `bdutil_env.sh` 23 | ## or with the -b & -p switches to `bdutil` 24 | #CONFIGBUCKET="" 25 | #PROJECT="" 26 | 27 | ## the region/zone to deploy into 28 | #GCE_ZONE='us-central1-a' 29 | 30 | ## Number of worker nodes. Total nodes will be NUM_WORKERS+1 31 | #NUM_WORKERS=4 32 | 33 | ## Google Compute Engine machine type 34 | #GCE_MACHINE_TYPE='n1-standard-4' 35 | 36 | ## Amount of storage to attach 37 | #WORKER_ATTACHED_PDS_SIZE_GB=1500 38 | #MASTER_ATTACHED_PD_SIZE_GB=1500 39 | 40 | ## Amount of storage to give the boot disk. 41 | ## A full HDP stack starts to fill up 10 GB. 42 | #MASTER_BOOT_DISK_SIZE_GB=50 43 | #WORKER_BOOT_DISK_SIZE_GB=50 44 | 45 | ## Storage types (pd-standard or pd-ssd) 46 | #WORKER_ATTACHED_PDS_TYPE='pd-standard' 47 | #MASTER_ATTACHED_PD_TYPE='pd-standard' 48 | 49 | 50 | ## HDP settings 51 | ## ============ 52 | 53 | ## If 'true', URLs for web interfaces, such as the jobtracker will be 54 | ## linked from Ambari with the public IP. 55 | ## Default is false. You will need to SSH to reach the host in this case. 56 | #AMBARI_PUBLIC=false 57 | 58 | #AMBARI_VERSION='2.2.1.0' 59 | #AMBARI_REPO=http://public-repo-1.hortonworks.com/ambari/centos6/${AMBARI_VERSION:0:1}.x/updates/${AMBARI_VERSION}/ambari.repo 60 | 61 | ## The distribution to install on your cluster. 62 | #AMBARI_STACK='HDP' 63 | #AMBARI_STACK_VERSION='2.4' 64 | 65 | ## The components of that distribution to install on the cluster. 66 | ## Default is all but Kerberos, Apache Knox, Apache Ranger, and Hortonworks 67 | # SmartSense. 68 | #AMBARI_SERVICES="ACCUMULO AMBARI_METRICS ATLAS FALCON FLUME GANGLIA HBASE HDFS 69 | # HIVE KAFKA MAHOUT MAPREDUCE2 OOZIE PIG SLIDER SPARK SQOOP STORM TEZ YARN 70 | # ZOOKEEPER" 71 | 72 | ## You can run with as little as: 73 | #AMBARI_SERVICES='HDFS MAPREDUCE2 YARN' 74 | 75 | ## If using HDP 2.2, these are the supported services: 76 | #AMBARI_SERVICES="AMBARI_METRICS FALCON FLUME GANGLIA HBASE HDFS HIVE KAFKA 77 | # MAPREDUCE2 OOZIE PIG SLIDER SPARK SQOOP STORM TEZ YARN ZOOKEEPER" 78 | 79 | ## If you want to use a different JAVA 80 | ## Default is set by alternatives to 'openjdk-7-devel' 81 | #JAVA_HOME="/etc/alternatives/java_sdk" 82 | -------------------------------------------------------------------------------- /platforms/hdp/ambari_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # ambari_env.sh 17 | # 18 | # Extension providing a cluster with Apache Ambari installed and automatically 19 | # provisions and configures the cluster's software. This installs and configures 20 | # the GCS connector. 21 | 22 | ######################################################################## 23 | ## There should be nothing to edit here, use ambari.conf ## 24 | ######################################################################## 25 | 26 | # Import the base Ambari installation 27 | import_env platforms/hdp/ambari_manual_env.sh 28 | 29 | # The distribution to install on your cluster. 30 | AMBARI_STACK="${AMBARI_STACK:-HDP}" 31 | AMBARI_STACK_VERSION="${AMBARI_STACK_VERSION:-2.4}" 32 | 33 | ## The components of that distribution to install on the cluster. 34 | # Default is all but Kerberos, Apache Knox, Apache Ranger, and Hortonworks 35 | # SmartSense. 36 | AMBARI_SERVICES="${AMBARI_SERVICES:-ACCUMULO AMBARI_METRICS ATLAS FALCON FLUME 37 | GANGLIA HBASE HDFS HIVE KAFKA MAHOUT MAPREDUCE2 OOZIE PIG SLIDER SPARK SQOOP 38 | STORM TEZ YARN ZOOKEEPER}" 39 | 40 | 41 | if [[ -n "${BDUTIL_DIR}" ]]; then 42 | UPLOAD_FILES+=( 43 | "${BDUTIL_DIR}/platforms/hdp/create_blueprint.py" 44 | ) 45 | fi 46 | 47 | COMMAND_GROUPS+=( 48 | "install-ambari-components: 49 | platforms/hdp/install_ambari_components.sh 50 | " 51 | ) 52 | 53 | COMMAND_STEPS+=( 54 | 'install-ambari-components,*' 55 | 'install-gcs-connector-on-ambari,install-gcs-connector-on-ambari' 56 | 'update-ambari-config,*' 57 | ) 58 | -------------------------------------------------------------------------------- /platforms/hdp/ambari_functions.sh: -------------------------------------------------------------------------------- 1 | ## Tools for interacting with Ambari SERVER 2 | 3 | AMBARI_TIMEOUT=${AMBARI_TIMEOUT:-3600} 4 | POLLING_INTERVAL=${POLLING_INTERVAL:-10} 5 | 6 | 7 | function ambari_wait() { 8 | local condition="$1" 9 | local goal="$2" 10 | local failed="FAILED" 11 | local limit=$(( ${AMBARI_TIMEOUT} / ${POLLING_INTERVAL} + 1 )) 12 | 13 | for (( i=0; i<${limit}; i++ )); do 14 | local status=$(bash -c "${condition}") 15 | echo "ambari_wait status: ${status}" >&2 16 | if [[ "${status}" == "${goal}" ]]; then 17 | break 18 | elif [[ "${status}" =~ "${failed}" ]]; then 19 | echo "Ambari operiation failed with status: ${status}" >&2 20 | return 1 21 | fi 22 | sleep ${POLLING_INTERVAL} 23 | done 24 | 25 | if [[ ${i} == ${limit} ]]; then 26 | echo "ambari_wait did not finish within" \ 27 | "'${AMBARI_TIMEOUT}' seconds. Exiting." >&2 28 | return 1 29 | fi 30 | } 31 | 32 | # Only useful during a fresh install where we expect no failures 33 | # Will not work if any requested TIMEDOUT/ABORTED 34 | function ambari_wait_requests_completed() { 35 | # Avoid race conditions with requests. 36 | sleep 10 37 | AMBARI_CLUSTER=$(get_ambari_cluster_name) 38 | # Poll for completion 39 | ambari_wait "${AMBARI_CURL} ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/requests \ 40 | | grep -Eo 'http://.*/requests/[0-9]+' \ 41 | | xargs ${AMBARI_CURL} \ 42 | | grep request_status \ 43 | | grep -Eo '\"[A-Z_]+\"' \ 44 | | sort | uniq | paste -sd'+'" \ 45 | '"COMPLETED"' 46 | } 47 | 48 | function ambari_service_stop() { 49 | AMBARI_CLUSTER=$(get_ambari_cluster_name) 50 | if [[ -z "${SERVICE}" ]]; then 51 | echo "Taking no action as no SERVICE was defined. You may specific ALL to stop all Services." 52 | else 53 | AMBARI_REQUEST='{"RequestInfo": {"context" :"Stop '${SERVICE}' via REST"}, "Body": {"ServiceInfo": {"state": "INSTALLED"}}}' 54 | if [[ "${SERVICE}" == "ALL" ]]; then 55 | ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/ 56 | else 57 | ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/${SERVICE} 58 | fi 59 | fi 60 | } 61 | 62 | function ambari_service_start() { 63 | AMBARI_CLUSTER=$(get_ambari_cluster_name) 64 | if [[ -z "${SERVICE}" ]]; then 65 | echo "Taking no action as no SERVICE was defined" 66 | else 67 | AMBARI_REQUEST='{"RequestInfo": {"context" :"Start '${SERVICE}' via REST"}, "Body": {"ServiceInfo": {"state": "STARTED"}}}' 68 | if [[ "${SERVICE}" == 'ALL' ]]; then 69 | ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/ 70 | else 71 | ${AMBARI_CURL} -i -X PUT -d "${AMBARI_REQUEST}" ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services/${SERVICE} 72 | fi 73 | fi 74 | } 75 | 76 | # set SERVICE=ALL to restart all services 77 | function ambari_service_restart() { 78 | ambari_service_stop 79 | ambari_wait_requests_completed 80 | ambari_service_start 81 | ambari_wait_requests_completed 82 | } 83 | 84 | function ambari_restart_all_services() { 85 | AMBARI_CLUSTER=$(get_ambari_cluster_name) 86 | SERVICES=($(${AMBARI_CURL} ${AMBARI_API}/clusters/${AMBARI_CLUSTER}/services \ 87 | | grep -Eo 'http://.*/services/[^\"]+')) 88 | 89 | for STATE in 'INSTALLED' 'STARTED'; do 90 | ${AMBARI_CURL} -X PUT -d "{\"ServiceInfo\":{\"state\":\"${STATE}\"}}" "${SERVICES[@]}" 91 | ambari_wait_requests_completed 92 | done 93 | } 94 | 95 | # Make variable substitutions in a json file. 96 | function subsitute_bash_in_json() { 97 | local custom_configuration_file="$1" 98 | loginfo "Replacing variables in ${custom_configuration_file}." 99 | perl -pi -e 's/\$\{([^\}]*)\}/$ENV{$1}/e' ${custom_configuration_file} 100 | } 101 | 102 | # Print out name of first (and presumably only) cluster in Ambari. 103 | function get_ambari_cluster_name() { 104 | ${AMBARI_CURL} ${AMBARI_API}/clusters \ 105 | | sed -n 's/.*cluster_name" : "\(\S*\)".*/\1/p' \ 106 | | head -1 107 | } 108 | -------------------------------------------------------------------------------- /platforms/hdp/ambari_manual_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # ambari_manual_env.sh 17 | # 18 | # Extension installing Apache Ambari on the cluster allowing the user to 19 | # manually log in and provision and configure the clusters software. 20 | # This installs but does not configure the GCS connector. 21 | 22 | ######################################################################## 23 | ## There should be nothing to edit here, use ambari.conf ## 24 | ######################################################################## 25 | 26 | # Remove core bdutil upload files. 27 | UPLOAD_FILES=() 28 | 29 | # Import hadoop2_env.sh just for the GCS_CONNECTOR_JAR. 30 | import_env hadoop2_env.sh 31 | 32 | # Default to 4 workers plus master for good spreading of master daemons. 33 | NUM_WORKERS=4 34 | # Use CentOS instead of Debian. 35 | GCE_IMAGE='' 36 | GCE_IMAGE_FAMILY='centos-6' 37 | GCE_IMAGE_PROJECT='centos-cloud' 38 | 39 | # Create attached storage 40 | USE_ATTACHED_PDS=true 41 | # Since we'll be using HDFS as the default file system, size disks to grant 42 | # maximum I/O per VM. 43 | WORKER_ATTACHED_PDS_SIZE_GB=1500 44 | MASTER_ATTACHED_PD_SIZE_GB=1500 45 | 46 | ## Amount of storage to give the boot disk. 47 | ## A full HDP stack starts to fill up 10 GB. 48 | MASTER_BOOT_DISK_SIZE_GB=${MASTER_BOOT_DISK_SIZE_GB:-50} 49 | WORKER_BOOT_DISK_SIZE_GB=${MASTER_BOOT_DISK_SIZE_GB:-50} 50 | 51 | # Install the full Java JDK. Most services need it 52 | INSTALL_JDK_DEVEL=true 53 | JAVA_HOME=/etc/alternatives/java_sdk 54 | 55 | ## import configuration overrides 56 | import_env platforms/hdp/ambari.conf 57 | 58 | ## Version of Ambari and location of YUM package repository 59 | AMBARI_VERSION="${AMBARI_VERSION:-2.2.1.0}" 60 | AMBARI_REPO=${AMBARI_REPO:-http://public-repo-1.hortonworks.com/ambari/centos6/${AMBARI_VERSION:0:1}.x/updates/${AMBARI_VERSION}/ambari.repo} 61 | 62 | ## If 'true', URLs for web interfaces, such as the jobtracker will below 63 | ## linked from Ambari with the public IP. 64 | ## Default is false. You will need to SSH to reach the host in this case. 65 | AMBARI_PUBLIC=${AMBARI_PUBLIC:-false} 66 | normalize_boolean 'AMBARI_PUBLIC' 67 | 68 | # HDFS will always be the default file system (even if changed here), because 69 | # many services require it to be. This is purely advisory. 70 | DEFAULT_FS='hdfs' 71 | 72 | GCS_CACHE_CLEANER_LOG_DIRECTORY="/var/log/hadoop/${GCS_CACHE_CLEANER_USER}" 73 | GCS_CACHE_CLEANER_LOGGER='INFO,RFA' 74 | HADOOP_CONF_DIR="/etc/hadoop/conf" 75 | HADOOP_INSTALL_DIR="/usr/local/lib/hadoop" 76 | 77 | # For interacting with Ambari Server API 78 | AMBARI_API="http://localhost:8080/api/v1" 79 | AMBARI_CURL='curl -fsSu admin:admin -H X-Requested-By:ambari' 80 | MASTER_UI_PORTS=('8080') 81 | 82 | import_env platforms/hdp/ambari_functions.sh 83 | 84 | if [[ -n "${BDUTIL_DIR}" ]]; then 85 | UPLOAD_FILES+=( 86 | "${BDUTIL_DIR}/libexec/hadoop_helpers.sh" 87 | "${BDUTIL_DIR}/platforms/hdp/configuration.json" 88 | "${BDUTIL_DIR}/platforms/hdp/resources/public-hostname-gcloud.sh" 89 | "${BDUTIL_DIR}/platforms/hdp/resources/thp-disable.sh" 90 | ) 91 | fi 92 | 93 | COMMAND_GROUPS+=( 94 | "ambari-setup: 95 | libexec/mount_disks.sh 96 | libexec/install_java.sh 97 | libexec/setup_hadoop_user.sh 98 | platforms/hdp/install_ambari.sh 99 | " 100 | 101 | "install-gcs-connector-on-ambari: 102 | platforms/hdp/install_gcs_connector_on_ambari.sh 103 | " 104 | 105 | "update-ambari-config: 106 | platforms/hdp/update_ambari_config.sh 107 | " 108 | ) 109 | 110 | COMMAND_STEPS=( 111 | 'ambari-setup,ambari-setup' 112 | 'deploy-master-nfs-setup,*' 113 | 'deploy-client-nfs-setup,deploy-client-nfs-setup' 114 | ) 115 | -------------------------------------------------------------------------------- /platforms/hdp/ambari_manual_post_deploy_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | # ambari_env.sh 17 | # 18 | # Extension providing a cluster with Apache Ambari installed and automatically 19 | # provisions and configures the cluster's software. This installs and configures 20 | # the GCS connector. 21 | 22 | ######################################################################## 23 | ## There should be nothing to edit here, use ambari.conf ## 24 | ######################################################################## 25 | 26 | # Import the base Ambari installation 27 | import_env platforms/hdp/ambari_manual_env.sh 28 | 29 | COMMAND_STEPS=( 30 | 'install-gcs-connector-on-ambari,install-gcs-connector-on-ambari' 31 | 'update-ambari-config,*' 32 | ) 33 | -------------------------------------------------------------------------------- /platforms/hdp/configuration.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations" : { 3 | "core-site" : { 4 | "fs.gs.project.id": "${PROJECT}", 5 | "fs.gs.system.bucket": "${CONFIGBUCKET}", 6 | "fs.gs.working.dir": "/", 7 | "fs.gs.impl" : "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem", 8 | "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS", 9 | "fs.gs.metadata.cache.enable": "true", 10 | "fs.gs.metadata.cache.type": "${GCS_METADATA_CACHE_TYPE}", 11 | "fs.gs.metadata.cache.directory": "${GCS_FILE_CACHE_DIRECTORY}", 12 | 13 | "hadoop.proxyuser.root.hosts": "*", 14 | "hadoop.proxyuser.root.groups": "*", 15 | "hadoop.proxyuser.root.users": "*" 16 | }, 17 | "hdfs-site" : { 18 | "dfs.replication" : "2" 19 | }, 20 | "mapred-site" : { 21 | "mapreduce.job.working.dir" : "/user/${user.name}" 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /platforms/hdp/install_gcs_connector_on_ambari.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright 2014 Google Inc. All Rights Reserved. 4 | 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | ## install_gcs_connector_on_ambari.sh 19 | ## This file: 20 | ## * downloads the relevant gcs-connector-.jar 21 | ## * installs into a local lib dir 22 | ## * adds that lib dir to relevant classpaths 23 | 24 | if (( ${INSTALL_GCS_CONNECTOR} )) ; then 25 | loginfo "installing GCS_CONNECTOR_JAR on each node" 26 | LIB_JARS_DIR="${HADOOP_INSTALL_DIR}/lib" 27 | mkdir -p ${LIB_JARS_DIR} 28 | 29 | # Grab the connector jarfile, add it to installation /lib directory. 30 | JARNAME=$(grep -o '[^/]*\.jar' <<< ${GCS_CONNECTOR_JAR}) 31 | LOCAL_JAR="${LIB_JARS_DIR}/${JARNAME}" 32 | 33 | download_bd_resource "${GCS_CONNECTOR_JAR}" "${LOCAL_JAR}" 34 | 35 | # link gcs connector into main hadoop lib dir 36 | source <(grep "^export HADOOP_HOME=" /etc/hadoop/conf/hadoop-env.sh) || true 37 | if [[ -d "${HADOOP_HOME}/lib/" ]]; then 38 | ln -sv "${LOCAL_JAR}" "${HADOOP_HOME}/lib/" 39 | fi 40 | fi 41 | -------------------------------------------------------------------------------- /platforms/hdp/resources/public-hostname-gcloud.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | curl -Ls -m 5 http://metadata.google.internal/computeMetadata/v1/instance/network-interfaces/0/access-configs/0/external-ip -H "Metadata-Flavor: Google" 3 | 4 | -------------------------------------------------------------------------------- /platforms/hdp/resources/thp-disable.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # disable transparent huge pages: for Hadoop 3 | thp_disable=true 4 | if [ "${thp_disable}" = true ]; then 5 | for path in redhat_transparent_hugepage transparent_hugepage; do 6 | if test -f /sys/kernel/mm/${path}/enabled; then 7 | echo never > /sys/kernel/mm/${path}/enabled 8 | fi 9 | if test -f /sys/kernel/mm/${path}/defrag; then 10 | echo never > /sys/kernel/mm/${path}/defrag 11 | fi 12 | done 13 | fi 14 | exit 0 15 | -------------------------------------------------------------------------------- /platforms/hdp/update_ambari_config.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # finalize the cluster configuration 16 | 17 | source hadoop_helpers.sh 18 | 19 | # initialize hdfs dirs 20 | loginfo "Set up HDFS /tmp and /user dirs" 21 | initialize_hdfs_dirs admin 22 | 23 | 24 | AMBARI_CLUSTER=$(get_ambari_cluster_name) 25 | 26 | # update hadoop configuration to include the gcs connector 27 | if (( ${INSTALL_GCS_CONNECTOR} )) ; then 28 | loginfo "Setting up GCS connector cache cleaner and configuration." 29 | if (( ${ENABLE_NFS_GCS_FILE_CACHE} )); then 30 | export GCS_METADATA_CACHE_TYPE='FILESYSTEM_BACKED' 31 | export GCS_FILE_CACHE_DIRECTORY="$(get_nfs_mount_point)" 32 | 33 | setup_cache_cleaner 34 | else 35 | export GCS_METADATA_CACHE_TYPE='IN_MEMORY' 36 | # For IN_MEMORY cache, this directory won't actually be used, but we set 37 | # it to a sane default for easy manual experimentation of file caching. 38 | export GCS_FILE_CACHE_DIRECTORY='/tmp/gcs_connector_metadata_cache' 39 | fi 40 | 41 | # If it wasn't set at cluster creation configure the GCS connector. 42 | if ! /var/lib/ambari-server/resources/scripts/configs.sh \ 43 | get localhost ${AMBARI_CLUSTER} core-site \ 44 | | grep -q '^"fs.gs'; then 45 | subsitute_bash_in_json configuration.json 46 | sed -n < configuration.json \ 47 | 's/.*"\(fs\.\S*gs\.\S*\)"\s*:\s*"\([^"]*\)".*/\1 \2/p' \ 48 | | xargs -n 2 /var/lib/ambari-server/resources/scripts/configs.sh \ 49 | set localhost ${AMBARI_CLUSTER} core-site 50 | # Will reload core-site.xml 51 | SERVICES_TO_UPDATE+=" HDFS" 52 | fi 53 | 54 | loginfo "Adding /usr/local/lib/hadoop/lib to " \ 55 | "mapreduce.application.classpath." 56 | NEW_CLASSPATH=$(/var/lib/ambari-server/resources/scripts/configs.sh \ 57 | get localhost ${AMBARI_CLUSTER} mapred-site \ 58 | | grep -E '^"mapreduce.application.classpath"' \ 59 | | tr -d \" \ 60 | | awk '{print "/usr/local/lib/hadoop/lib/*,"$3}' | sed 's/,$//') 61 | /var/lib/ambari-server/resources/scripts/configs.sh \ 62 | set localhost ${AMBARI_CLUSTER} \ 63 | mapred-site mapreduce.application.classpath ${NEW_CLASSPATH} 64 | sleep 10 65 | fi 66 | 67 | loginfo "Restarting services, because Ambari usually requires it." 68 | SERVICE='ALL' 69 | ambari_service_stop 70 | ambari_wait_requests_completed 71 | ambari_service_start 72 | ambari_wait_requests_completed 73 | 74 | # Check GCS connectivity 75 | check_filesystem_accessibility 76 | 77 | # Set up files and pig views, which was added in Ambari 2.1. 78 | # 79 | if version_at_least "${AMBARI_VERSION}" '2.1'; then 80 | # This should be done automatically but it wasn't as of 2016-03-16. 81 | for view in FILES PIG; do 82 | # Both of these views are currently 1.0.0 83 | VIEW="${AMBARI_API}/views/${view}/versions/1.0.0/instances/AUTO_${view}_INSTANCE" 84 | if ${AMBARI_CURL} ${VIEW} |& grep -q '404 Not Found'; then 85 | ${AMBARI_CURL} -X POST ${VIEW} \ 86 | -d "{\"ViewInstanceInfo\": {\"cluster_handle\": \"${AMBARI_CLUSTER}\"}}" 87 | fi 88 | done 89 | fi 90 | -------------------------------------------------------------------------------- /platforms/mapr/README.md: -------------------------------------------------------------------------------- 1 | MapR Cluster on Google Compute Engine 2 | ------------------------------------- 3 | 4 | The [MapR distribution](https://www.mapr.com/products/mapr-distribution-including-apache-hadoop) for Hadoop adds enterprise-grade features to the Hadoop platform that make Hadoop easier to use and more dependable. The MapR distribution for Hadoop is fully integrated with the [Google Compute Engine (GCE)](https://cloud.google.com/compute/) framework, allowing customers to deploy a MapR cluster with ready access to Google's cloud infrastructure. MapR provides network file system (NFS) and open database connectivity (ODBC) interfaces, a comprehensive management suite, and automatic compression. MapR provides high availability with a no-NameNode architecture and data protection with snapshots, disaster recovery, and cross-cluster mirroring. 5 | 6 | ### Make sure you have... 7 | * an active [Google Cloud Platform](https://console.developers.google.com/) account. 8 | * a client machine with [Google Cloud SDK](https://cloud.google.com/sdk/) and [bdutil](https://cloud.google.com/hadoop/downloads) installed. 9 | * access to a GCE project where you can add instances, buckets and disks. 10 | * a valid MapR license (optional). 11 | 12 | ### Now, to launch a MapR Cluster on GCE using `bdutil`... 13 | 14 | 1. Set the project and bucket in `mapr_env.sh` (located under `bdutil/platforms/mapr/`). 15 | 2. Update `node.lst` to determine the [allocation of cluster roles](http://doc.mapr.com/display/MapR/MapR+Cluster+on+the+Google+Compute+Engine#MapRClusterontheGoogleComputeEngine-gce-config) for the nodes in the cluster. For reference, the config file contains a simple 4-node [M7](https://www.mapr.com/products/hadoop-download) cluster allocation. 16 | * Node names must have the PREFIX mentioned in `mapr_env.sh` 17 | * Node names must have suffixes: -m, -w-0, -w-1, -w-2 ... 18 | For example, if the PREFIX is 'mapr', node names must be 'mapr-m', 'mapr-w-0', 'mapr-w-1', ... 19 | * NUM_WORKERS in `mapr_env.sh` must equal one less than number of nodes in `node.lst` 20 | 3. (Optional) Copy a valid license into `mapr_license.txt` 21 | 4. Deploy the cluster by invoking in the bdutil root directory: 22 | ``` 23 | ./bdutil -e mapr deploy 24 | ``` 25 | 26 | 5. Access the cluster by invoking: 27 | ``` 28 | gcloud compute config-ssh 29 | ``` 30 | 31 | The output shows how to ssh into a node. Login as the `MAPR_USER` mentioned in `mapr_env.sh` (for example, `ssh mapr@node1.us-central1-f.t-diplomatic-962`). 32 | 6. Test an example application by running: 33 | ``` 34 | yarn jar $MAPR_HOME/hadoop/hadoop-2.5.1/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.1-mapr-1501.jar pi 16 100 35 | ``` 36 | 37 | 38 | ### At the end... 39 | To delete the cluster, ensure `mapr_env.sh` is same as in when deployed. In the bdutil root directory, invoke: 40 | ``` 41 | ./bdutil -e mapr delete 42 | ``` 43 | 44 | ### Additional Resources 45 | * [Free Hadoop On-Demand Training](https://www.mapr.com/services/mapr-academy/big-data-hadoop-online-training) 46 | * [Why MapR](https://www.mapr.com/why-hadoop/why-mapr) 47 | * [MapR Development Guide](http://doc.mapr.com/display/MapR/Development+Guide) 48 | * [MapR Documentation](http://doc.mapr.com/) 49 | * [MapR Support](https://www.mapr.com/support/overview) 50 | * [Another way](http://doc.mapr.com/display/MapR/MapR+Cluster+on+the+Google+Compute+Engine) to deploy 51 | * [MapR-on-GCE](https://github.com/mapr/gce) 52 | 53 | **LICENSE:** [Apache License, Version 2.0](https://github.com/GoogleCloudPlatform/bdutil/blob/master/LICENSE) -------------------------------------------------------------------------------- /platforms/mapr/mapr_license.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudDataproc/bdutil/967fd15b1f690e961f7d61809e4976aaa4ade90f/platforms/mapr/mapr_license.txt -------------------------------------------------------------------------------- /platforms/mapr/node.lst: -------------------------------------------------------------------------------- 1 | # Simple 4-node M7 cluster 2 | # NOTE: 3 | # (1) Node names MUST have the PREFIX mentioned in 'mapr_env.sh' 4 | # (2) Node names MUST have suffixes: -m, -w-0, -w-1, -w-2 ... 5 | # For example, if the PREFIX is 'mapr', 6 | # node names MUST be 'mapr-m', 'mapr-w-0', 'mapr-w-1', ... 7 | # (3) Do not forget to update NUM_WORKERS variable 8 | # Refer to MapR documentation for other values 9 | mapr-m:zookeeper,cldb,fileserver,nodemanager,nfs,webserver,hbase 10 | mapr-w-0:zookeeper,cldb,fileserver,nodemanager,nfs,hbase 11 | mapr-w-1:zookeeper,resourcemanager,historyserver,fileserver,nodemanager,nfs,hbase 12 | mapr-w-2:resourcemanager,fileserver,nodemanager,nfs,hbase 13 | -------------------------------------------------------------------------------- /platforms/restart_services.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Copyright 2014 Google Inc. All Rights Reserved.D 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS-IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Restarts services corresponding to installed packages. 17 | # Performs last minute initialization as needed. 18 | 19 | set -e 20 | 21 | source hadoop_helpers.sh 22 | 23 | if [[ $(hostname -s) == ${MASTER_HOSTNAME} ]]; then 24 | COMPONENTS=${MASTER_COMPONENTS} 25 | else 26 | COMPONENTS=${DATANODE_COMPONENTS} 27 | fi 28 | 29 | # Component ordering is sensitive. hive-metastore must come before hive-server2 30 | # and hdfs must be up before oozie. 31 | for COMPONENT in ${COMPONENTS}; do 32 | if [[ -x /etc/init.d/${COMPONENT} ]]; then 33 | # Initialize HDFS 34 | if [[ ${COMPONENT} == 'hadoop-hdfs-namenode' ]]; then 35 | service hadoop-hdfs-namenode stop 36 | # Do not refomat if already formatted. 37 | yes n | service hadoop-hdfs-namenode init 38 | service hadoop-hdfs-namenode start 39 | 40 | # Setup /tmp and /user directories. 41 | if [[ "${DEFAULT_FS}" == 'hdfs' ]]; then 42 | initialize_hdfs_dirs 43 | fi 44 | # Initialize Oozie. Requires Namenode to be up. 45 | elif [[ ${COMPONENT} == 'oozie' ]]; then 46 | # Requires HDFS to be up and running. 47 | # Might be CDH specific. 48 | oozie-setup sharelib create -fs ${NAMENODE_URI} \ 49 | -locallib /usr/lib/oozie/oozie-sharelib-yarn* 50 | service oozie restart 51 | else 52 | service ${COMPONENT} restart 53 | fi 54 | fi 55 | done 56 | -------------------------------------------------------------------------------- /sampleapps/querytools/conf/hive/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 19 | 20 | 21 | 22 | 23 | hive.metastore.warehouse.dir 24 | /user/${user.name}/warehouse 25 | location of default database for the warehouse 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /sampleapps/querytools/examples/ngrams/hive_query_ngrams.q: -------------------------------------------------------------------------------- 1 | -- 2 | -- Copyright 2013 Google Inc. All Rights Reserved. 3 | -- 4 | -- Licensed under the Apache License, Version 2.0 (the "License"); 5 | -- you may not use this file except in compliance with the License. 6 | -- You may obtain a copy of the License at 7 | -- 8 | -- http://www.apache.org/licenses/LICENSE-2.0 9 | -- 10 | -- Unless required by applicable law or agreed to in writing, software 11 | -- distributed under the License is distributed on an "AS IS" BASIS, 12 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | -- See the License for the specific language governing permissions and 14 | -- 15 | 16 | -- 17 | -- This script is intended to be run from the Hive shell: 18 | -- 19 | -- hive> source hive_query_ngrams.q; 20 | -- 21 | -- or from the operating system shell: 22 | -- 23 | -- $ hive -f hive_query_ngrams.q 24 | -- 25 | -- The result of this query is a table of records indicating the count 26 | -- of occurrences of the words "radio" and "television" in the Google 27 | -- ngrams corpora for each year since 1920. 28 | -- 29 | -- This query ensures that a record exists in the result for every year 30 | -- since 1920, even if there were no instances of a given word. 31 | -- In practice this is unnecessary as radio and television both occur 32 | -- more than once in the data set for every year since 1920. 33 | -- 34 | -- The structure of this query is to join three distinct subqueries (on year): 35 | -- y: list of years since 1920 (implicitly ordered by the DISTINCT operation) 36 | -- r: sum of instances of the word "radio" for each year since 1920 37 | -- t: sum of instances of the word "television" for each year since 1920 38 | -- 39 | 40 | SELECT y.year AS year, 41 | r.instance_count AS radio, t.instance_count AS television, 42 | CAST(r.instance_count AS DOUBLE)/(r.instance_count + t.instance_count) 43 | AS pct 44 | FROM 45 | (SELECT DISTINCT year AS year FROM 46 | (SELECT distinct year from 1gram where prefix = 'r' and year >= 1920 47 | UNION ALL 48 | SELECT distinct year from 1gram where prefix = 't' and year >= 1920) y_all) 49 | y 50 | JOIN 51 | (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count 52 | FROM 1gram 53 | WHERE LOWER(word) = 'radio' AND prefix='r' AND (year >= 1920) 54 | GROUP BY LOWER(word), year) r 55 | ON y.year = r.year 56 | JOIN 57 | (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count 58 | FROM 1gram 59 | WHERE LOWER(word) = 'television' AND prefix='t' AND (year >= 1920) 60 | GROUP BY LOWER(word), year) t 61 | ON y.year = t.year 62 | ORDER BY year; 63 | 64 | EXIT; 65 | 66 | -- 67 | -- This is a simplified version of the above which eliminates the explicit 68 | -- generation of the "year" list. It assumes (correctly) that the word 69 | -- "television" appears every year that "radio" does. 70 | -- This query is listed here for reference and educational purposes only. 71 | -- 72 | -- SELECT a.year, a.instance_count, b.instance_count, 73 | -- CAST(a.instance_count AS DOUBLE)/(a.instance_count + b.instance_count) 74 | -- FROM 75 | -- (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count 76 | -- FROM 1gram 77 | -- WHERE LOWER(word) = 'radio' AND prefix='r' AND (year >= 1920) 78 | -- GROUP BY LOWER(word), year) a 79 | -- JOIN 80 | -- (SELECT LOWER(word) AS ngram_col, year, SUM(instance_count) AS instance_count 81 | -- FROM 1gram 82 | -- WHERE LOWER(word) = 'television' AND prefix='t' AND (year >= 1920) 83 | -- GROUP BY LOWER(word), year) b 84 | -- ON a.year = b.year 85 | -- ORDER BY year; 86 | -- 87 | -------------------------------------------------------------------------------- /sampleapps/querytools/examples/ngrams/hive_table_create.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # 17 | # This script is intended to be run from the unix command line 18 | # on an instance with hive installed (and the hive executable 19 | # available in the user PATH). 20 | # 21 | # It is assumed that the one has already run the shell script 22 | # ngram_hdfs_load.sh which will have downloaded the associated 23 | # ngram data and deposited it into HDFS under /user/hdpusr/ngrams/ 24 | # 25 | # This script will create a table ("1gram") and then load each 26 | # file into a separate partition within the table. 27 | # 28 | 29 | set -o errexit 30 | set -o nounset 31 | 32 | # Select what to install 33 | readonly SCRIPT_DIR=$(dirname $0) 34 | source $SCRIPT_DIR/ngram_setup.sh 35 | 36 | # Create the table if it does not already exist 37 | hive << END_CREATE 38 | CREATE TABLE IF NOT EXISTS $NGRAMS ( 39 | word STRING, 40 | year INT, 41 | instance_count INT, 42 | book_count INT 43 | ) 44 | PARTITIONED BY (prefix STRING) 45 | ROW FORMAT DELIMITED 46 | FIELDS TERMINATED BY '\t' 47 | STORED AS TEXTFILE 48 | ; 49 | EXIT 50 | ; 51 | END_CREATE 52 | 53 | # Get the list of files to put into the table 54 | FILE_PATTERN=$(printf $SOURCE_FORMAT $NGRAMS "" "") 55 | FILE_LIST=$($HDFS_CMD -ls $HDFS_DIR | grep $FILE_PATTERN | awk '{ print $8 }') 56 | for filepath in $FILE_LIST; do 57 | filename=$(basename $filepath) 58 | prefix=${filename##$FILE_PATTERN} 59 | 60 | hive --silent << END_LOAD 61 | LOAD DATA INPATH '$HDFS_DIR/$filename' 62 | OVERWRITE INTO TABLE $NGRAMS 63 | PARTITION (prefix='$prefix') 64 | ; 65 | EXIT 66 | ; 67 | END_LOAD 68 | done 69 | 70 | echo "Data loaded into hive table $NGRAMS" 71 | -------------------------------------------------------------------------------- /sampleapps/querytools/examples/ngrams/ngram_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Utility script, sourced by both ngram_hdfs_load.sh and hive_table_create.sh 17 | # This script will set a series of constants, some based on the choice 18 | # of the command line "N" value (defaults to 1). N indicates the ngram 19 | # dataset to download and copy into HDFS. 20 | 21 | readonly SOURCE_FORMAT="googlebooks-eng-all-%s-20120701-%s%s" 22 | readonly SOURCE_LOCATION="gs://books/ngrams/books" 23 | 24 | # The "hadoop" executable should be in the user path 25 | readonly HDFS_CMD="hadoop fs" 26 | 27 | # What to install: 1gram by default 28 | N=1 29 | 30 | # Now parse command line arguments 31 | while [[ $# -ne 0 ]]; do 32 | case "$1" in 33 | --N=*) 34 | N=${1#--N=} 35 | shift 36 | ;; 37 | --help) 38 | N= 39 | shift 40 | ;; 41 | *) 42 | esac 43 | done 44 | 45 | if [[ ! $N -ge 1 ]]; then 46 | echo "usage $(basename $0): --N=" 47 | exit 1 48 | fi 49 | 50 | # Now set constants based on the selection of N 51 | readonly NGRAMS="${N}gram" 52 | readonly HDFS_DIR="ngrams/$NGRAMS" 53 | readonly STAGE_DIR="/hadoop/tmp/$USER/ngrams/$NGRAMS" 54 | 55 | -------------------------------------------------------------------------------- /sampleapps/querytools/project_properties.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # Begin: edit these values to set up your cluster 17 | # GCS bucket for packages 18 | readonly GCS_PACKAGE_BUCKET={{{{ bucket_name }}}} 19 | # Zone of the Hadoop master instance 20 | readonly ZONE={{{{ zone_id }}}} 21 | # Hadoop master instance name 22 | readonly MASTER={{{{ master_hostname }}}} 23 | 24 | # Subdirectory in cloud storage where packages are pushed at initial setup 25 | readonly GCS_PACKAGE_DIR=hdp_tools 26 | 27 | # Full GCS URIs of the Pig and Hive tarballs, if packages-to-gcs__at__host.sh 28 | # is used; alternatively, these can be set to other pre-existing GCS paths 29 | readonly SUPPORTED_HDPTOOLS="hive pig" 30 | readonly TARBALL_BASE="gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR/packages" 31 | readonly HIVE_TARBALL_URI="$TARBALL_BASE/hive/hive-*.tar.gz" 32 | readonly PIG_TARBALL_URI="$TARBALL_BASE/pig/pig-*.tar.gz" 33 | 34 | # Directory on master where hadoop is installed 35 | readonly HADOOP_HOME=/home/hadoop/hadoop 36 | 37 | # Set to the major version of hadoop ("1" or "2") 38 | readonly HADOOP_MAJOR_VERSION="1" 39 | 40 | # Hadoop username and group on Compute Engine Cluster 41 | readonly HADOOP_USER=hadoop 42 | readonly HADOOP_GROUP=hadoop 43 | 44 | # Hadoop client username on Compute Engine Cluster 45 | readonly HDP_USER=hdpuser 46 | 47 | # Directory on master where packages are installed 48 | readonly HDP_USER_HOME=/home/hdpuser 49 | readonly MASTER_INSTALL_DIR=/home/hdpuser 50 | 51 | # End: edit these values to set up your cluster 52 | 53 | 54 | # Begin: constants used througout the solution 55 | 56 | # Subdirectory where packages files (tar.gz) are stored 57 | readonly PACKAGES_DIR=packages 58 | 59 | # Subdirectory where scripts are stored 60 | readonly SCRIPTS_DIR=scripts 61 | 62 | # Subdirectory on master where we pull down package files 63 | readonly MASTER_PACKAGE_DIR=/tmp/hdp_tools 64 | 65 | # User tmp dir in HDFS 66 | readonly HDFS_TMP_DIR="/tmp" 67 | 68 | # Hadoop temp dir (hadoop.tmp.dir) 69 | readonly HADOOP_TMP_DIR="/hadoop/tmp" 70 | 71 | # End: constants used througout the solution 72 | -------------------------------------------------------------------------------- /sampleapps/querytools/scripts/common_utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o nounset 17 | set -o errexit 18 | 19 | function emit() { 20 | echo -e "$@" 21 | } 22 | readonly -f emit 23 | 24 | function die() { 25 | echo -e "$@" >&2 26 | exit 1 27 | } 28 | readonly -f die 29 | 30 | -------------------------------------------------------------------------------- /sampleapps/querytools/scripts/install-packages-on-master__at__host.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o nounset 17 | set -o errexit 18 | 19 | readonly SCRIPTDIR=$(dirname $0) 20 | 21 | # Pull in global properties 22 | source project_properties.sh 23 | 24 | # Pull in common functions 25 | source $SCRIPTDIR/common_utils.sh 26 | 27 | # Files to push to master; place project_properties.sh in the same directory 28 | # as the other scripts 29 | readonly SCRIPT_FILES_TO_PUSH="\ 30 | project_properties.sh \ 31 | $SCRIPTS_DIR/common_utils.sh \ 32 | $SCRIPTS_DIR/package_utils.sh \ 33 | $SCRIPTS_DIR/setup-hdfs-for-hdtools__at__master.sh \ 34 | $SCRIPTS_DIR/setup-packages__at__master.sh \ 35 | $SCRIPTS_DIR/setup-ssh-keys__at__master.sh \ 36 | " 37 | readonly MASTER_PACKAGE_SUBDIRS="\ 38 | $MASTER_PACKAGE_DIR/$SCRIPTS_DIR \ 39 | $MASTER_PACKAGE_DIR/conf/hive \ 40 | $MASTER_PACKAGE_DIR/ssh-key 41 | " 42 | 43 | # Ensure permissions on the script files before we push them 44 | chmod 755 $SCRIPT_FILES_TO_PUSH 45 | 46 | # Create the destination directory on the master 47 | emit "" 48 | emit "Ensuring setup directories exist on master:" 49 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER sudo -i \ 50 | "rm -rf $MASTER_PACKAGE_DIR && \ 51 | mkdir -p $MASTER_PACKAGE_SUBDIRS" 52 | 53 | # Push the setup script to the master 54 | emit "" 55 | emit "Pushing the setup scripts to the master:" 56 | gcutil push --zone=$ZONE $MASTER \ 57 | $SCRIPT_FILES_TO_PUSH $MASTER_PACKAGE_DIR/$SCRIPTS_DIR 58 | 59 | # Push configuration to the master 60 | emit "" 61 | emit "Pushing configuration to the master:" 62 | gcutil push --zone=$ZONE $MASTER \ 63 | conf/hive/* $MASTER_PACKAGE_DIR/conf/hive 64 | 65 | # Execute the setup script on the master 66 | emit "" 67 | emit "Launching the user and package setup script on the master:" 68 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER \ 69 | sudo $MASTER_PACKAGE_DIR/$SCRIPTS_DIR/setup-packages__at__master.sh 70 | 71 | # Execute the HDFS setup script on the master 72 | emit "" 73 | emit "Launching the HDFS setup script on the master:" 74 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER \ 75 | sudo \ 76 | $MASTER_PACKAGE_DIR/$SCRIPTS_DIR/setup-hdfs-for-hdtools__at__master.sh 77 | 78 | # Set up SSH keys for the user 79 | emit "" 80 | emit "Generating SSH keys for user $HDP_USER" 81 | 82 | readonly KEY_DIR=./ssh-key 83 | mkdir -p $KEY_DIR 84 | rm -f $KEY_DIR/$HDP_USER $KEY_DIR/${HDP_USER}.pub 85 | 86 | ssh-keygen -t rsa -P '' -f $KEY_DIR/$HDP_USER 87 | chmod o+r $KEY_DIR/${HDP_USER}.pub 88 | emit "Pushing SSH keys for user $HDP_USER to $MASTER" 89 | gcutil push --zone=$ZONE $MASTER \ 90 | $KEY_DIR/${HDP_USER}.pub $MASTER_PACKAGE_DIR/ssh-key/ 91 | emit "Adding SSH public key for user $HDP_USER to authorized_keys" 92 | gcutil ssh --zone=$ZONE --ssh_arg -t $MASTER \ 93 | sudo sudo -u $HDP_USER -i \ 94 | $MASTER_PACKAGE_DIR/$SCRIPTS_DIR/setup-ssh-keys__at__master.sh \ 95 | $MASTER_PACKAGE_DIR/ssh-key 96 | 97 | MASTER_IP=$(gcutil getinstance --zone=$ZONE $MASTER | \ 98 | awk -F '|' \ 99 | '$2 ~ / *external-ip */ { gsub(/[ ]*/, "", $3); print $3 }') 100 | 101 | emit "" 102 | emit "***" 103 | emit "SSH keys generated locally to:" 104 | emit " Public key: $KEY_DIR/$HDP_USER.pub" 105 | emit " Private key: $KEY_DIR/$HDP_USER" 106 | emit "" 107 | emit "Public key installed on $MASTER to ~$HDP_USER/.ssh/authorized_keys" 108 | emit "" 109 | emit "You may now ssh to user $HDP_USER@$MASTER with:" 110 | emit " ssh -i $KEY_DIR/$HDP_USER $HDP_USER@$MASTER_IP" 111 | emit "***" 112 | 113 | emit "" 114 | emit "Installation complete" 115 | -------------------------------------------------------------------------------- /sampleapps/querytools/scripts/package_utils.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o nounset 17 | set -o errexit 18 | 19 | function pkgutil_get_list() { 20 | local pkg_dir="$1" 21 | 22 | find $pkg_dir -mindepth 2 -maxdepth 2 | sort 23 | } 24 | readonly -f pkgutil_get_list 25 | 26 | function pkgutil_pkg_name() { 27 | local pkg_dir="$1" 28 | local pkg="$2" 29 | 30 | # Strip the "package" directory 31 | local pkg_stripped=${pkg#$pkg_dir/} 32 | 33 | # Get the query-tool specific directory name 34 | echo ${pkg_stripped%/*} 35 | } 36 | readonly -f pkgutil_pkg_name 37 | 38 | function pkgutil_pkg_file() { 39 | local pkg_dir="$1" 40 | local pkg="$2" 41 | 42 | # Return just the filename 43 | echo ${pkg##*/} 44 | } 45 | readonly -f pkgutil_pkg_file 46 | 47 | function pkgutil_emit_list() { 48 | local pkg_dir="$1" 49 | local pkg_list="$2" 50 | 51 | emit "" 52 | emit "Discovered packages:" 53 | for pkg in $pkg_list; do 54 | # Get the query-tool specific directory name 55 | local pkg_name=$(pkgutil_pkg_name $pkg_dir $pkg) 56 | 57 | # Get the name of the zip file 58 | local pkg_file=$(pkgutil_pkg_file $pkg_dir $pkg) 59 | 60 | emit " $pkg_name ($pkg_file)" 61 | done 62 | } 63 | readonly -f pkgutil_emit_list 64 | 65 | -------------------------------------------------------------------------------- /sampleapps/querytools/scripts/packages-delete-from-gcs__at__host.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # packages-delete-from-gcs 17 | # This script removes the Hadoop query tool packages from Google Cloud 18 | # Storage which were uploaded by packages-to-gcs__at__host.sh 19 | 20 | set -o nounset 21 | set -o errexit 22 | 23 | readonly SCRIPTDIR=$(dirname $0) 24 | 25 | # Pull in global properties 26 | source project_properties.sh 27 | 28 | # Pull in common functions 29 | source $SCRIPTDIR/common_utils.sh 30 | 31 | # Remove packages from GCS 32 | emit "" 33 | emit "Removing packages:" 34 | gsutil rm -R -f gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR 35 | 36 | emit "" 37 | emit "Package removal complete" 38 | 39 | -------------------------------------------------------------------------------- /sampleapps/querytools/scripts/packages-to-gcs__at__host.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # packages-to-gcs 17 | # This script examines the Hadoop tools packages directory for a list 18 | # of packages to push to Google Cloud Storage. 19 | # 20 | # All packages should be found in the "packages" subdirectory. 21 | # The required format is for the package name to be a subdirectory 22 | # and the associated TAR.GZ file to be inside the package subdirectory: 23 | # packages/ 24 | # hive/ 25 | # hive-0.10.0.tar.gz 26 | # pig/ 27 | # pig-0.11.1.tar.gz 28 | 29 | set -o nounset 30 | set -o errexit 31 | 32 | readonly SCRIPTDIR=$(dirname $0) 33 | 34 | # Pull in global properties 35 | source project_properties.sh 36 | 37 | # Pull in common functions 38 | source $SCRIPTDIR/common_utils.sh 39 | source $SCRIPTDIR/package_utils.sh 40 | 41 | # The resulting PACKAGE_LIST will contain one entry per package where the 42 | # the entry is of the form "package_dir/package/gzip" 43 | # (for example packages/hive/hive-0.10.0.tar.gz) 44 | PACKAGE_LIST=$(pkgutil_get_list $PACKAGES_DIR) 45 | if [[ -z $PACKAGE_LIST ]]; then 46 | die "No package found in $PACKAGES_DIR subdirectory" 47 | fi 48 | 49 | # Emit package list 50 | pkgutil_emit_list "$PACKAGES_DIR" "$PACKAGE_LIST" 51 | 52 | # Push packages to GCS 53 | emit "" 54 | emit "Pushing packages to gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR/:" 55 | gsutil -m cp -R $PACKAGES_DIR gs://$GCS_PACKAGE_BUCKET/$GCS_PACKAGE_DIR/ 56 | 57 | emit "" 58 | emit "Package upload complete" 59 | 60 | -------------------------------------------------------------------------------- /sampleapps/querytools/scripts/setup-hdfs-for-hdtools__at__master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -o nounset 17 | set -o errexit 18 | 19 | SCRIPT=$(basename $0) 20 | SCRIPTDIR=$(dirname $0) 21 | 22 | source $SCRIPTDIR/project_properties.sh 23 | source $SCRIPTDIR/common_utils.sh 24 | 25 | readonly HDFS_CMD="sudo -u $HADOOP_USER -i $HADOOP_HOME/bin/hadoop fs" 26 | readonly HDFS_ROOT_USER="$HADOOP_USER" 27 | 28 | function hdfs_mkdir () { 29 | local dir=$1 30 | local owner=${2:-} 31 | local permissions=${3:-} 32 | 33 | emit " Checking directory $dir" 34 | if ! $HDFS_CMD -test -d $dir 2> /dev/null; then 35 | emit " Creating directory $dir" 36 | $HDFS_CMD -mkdir $dir 37 | fi 38 | 39 | if [[ -n "$owner" ]]; then 40 | emit " Ensuring owner $owner" 41 | $HDFS_CMD -chown $owner $dir 42 | fi 43 | 44 | if [[ -n "$permissions" ]]; then 45 | emit " Ensuring permissions $permissions" 46 | $HDFS_CMD -chmod $permissions $dir 47 | fi 48 | } 49 | readonly -f hdfs_mkdir 50 | 51 | emit "" 52 | emit "*** Begin: $SCRIPT running on master $(hostname) ***" 53 | 54 | # Ensure that /tmp exists (it should) and is fully accessible 55 | hdfs_mkdir "$HDFS_TMP_DIR" "$HDFS_ROOT_USER" "777" 56 | 57 | # Create a hive-specific scratch space in /tmp for the hdpuser 58 | hdfs_mkdir "$HDFS_TMP_DIR/hive-$HDP_USER" "$HDP_USER" 59 | 60 | # Create a warehouse directory (hive) for the hdpuser 61 | hdfs_mkdir "/user" "$HDFS_ROOT_USER" 62 | hdfs_mkdir "/user/$HDP_USER" "$HDP_USER" 63 | hdfs_mkdir "/user/$HDP_USER/warehouse" "$HDP_USER" 64 | 65 | # Create a mapreduce staging directory for the hdpuser 66 | if [[ "${HADOOP_MAJOR_VERSION}" == "2" ]]; then 67 | hdfs_mkdir "/hadoop/mapreduce" "$HADOOP_USER" "o+rw" 68 | hdfs_mkdir "/hadoop/mapreduce/staging" "$HADOOP_USER" "o+rw" 69 | hdfs_mkdir "/hadoop/mapreduce/staging/history" "$HADOOP_USER" "777" 70 | hdfs_mkdir "/hadoop/mapreduce/staging/$HDP_USER" "$HDP_USER" 71 | else 72 | hdfs_mkdir "$HADOOP_TMP_DIR/mapred/staging/$HDP_USER" "$HDP_USER" 73 | fi 74 | 75 | emit "" 76 | emit "*** End: $SCRIPT running on master $(hostname) ***" 77 | emit "" 78 | 79 | -------------------------------------------------------------------------------- /sampleapps/querytools/scripts/setup-ssh-keys__at__master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2013 Google Inc. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | # This script runs on the Hadoop master node as the target user ($HDP_USER). 17 | # It is asssumed that a public key file for the user has been pushed 18 | # onto the master node and the location of that file is the first argument 19 | # to the script. 20 | 21 | set -o nounset 22 | set -o errexit 23 | 24 | readonly SCRIPT=$(basename $0) 25 | readonly SCRIPTDIR=$(dirname $0) 26 | 27 | # Pull in global properties 28 | source $SCRIPTDIR/project_properties.sh 29 | source $SCRIPTDIR/common_utils.sh 30 | 31 | if [[ $# -lt 1 ]]; then 32 | die "usage: $0 [keys-dir]" 33 | fi 34 | 35 | KEY_DIR=$1; shift 36 | KEY_FILE=$KEY_DIR/${USER}.pub 37 | 38 | if [[ ! -e $KEY_FILE ]]; then 39 | die "Public key file not found: $KEY_FILE" 40 | fi 41 | 42 | # Ensure that the .ssh directory and authorized_keys files exist 43 | if [[ ! -e $HOME/.ssh/authorized_keys ]]; then 44 | mkdir -p $HOME/.ssh 45 | chmod 700 $HOME/.ssh 46 | 47 | touch $HOME/.ssh/authorized_keys 48 | chmod 600 $HOME/.ssh/authorized_keys 49 | fi 50 | 51 | # Add the public key file for the user to authorized_keys 52 | emit "Updating $HOME/.ssh/authorized_keys" 53 | (echo "# Added $(date)" && cat $KEY_FILE) >> $HOME/.ssh/authorized_keys 54 | 55 | -------------------------------------------------------------------------------- /samples/bigquery_wordcount.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GoogleCloudDataproc/bdutil/967fd15b1f690e961f7d61809e4976aaa4ade90f/samples/bigquery_wordcount.jar -------------------------------------------------------------------------------- /samples/test-mr-bigquery.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # Copyright 2013 Google Inc. All Rights Reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS-IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | ############################################################################### 18 | # Sets up and runs WordCount job to verify BigQuery setup. 19 | # Usage: 20 | # Specify fully-qualified outputTable, e.g. "[datasetId].[tableId]": 21 | # ./bdutil -v -u "samples/*" run_command ./test-mr-bigquery.sh [outputTable] 22 | # Auto-generate/create a datasetId, and use that (provide no args) 23 | # ./bdutil -v -u "samples/*" run_command ./test-mr-bigquery.sh 24 | ################################################################################ 25 | 26 | set -e 27 | 28 | source hadoop-env-setup.sh 29 | 30 | OUTPUT_TABLE=$1 31 | 32 | CREATED_DATASET=0 33 | if [[ -z "${OUTPUT_TABLE}" ]]; then 34 | OUTPUT_DATASET="validate_bigquery_dataset_$(date +%s)" 35 | OUTPUT_TABLE="${OUTPUT_DATASET}.wordcount_output" 36 | echo "No OUTPUT_TABLE provided; using ${OUTPUT_TABLE}" 37 | bq mk "${PROJECT}:${OUTPUT_DATASET}" 38 | CREATED_DATASET=1 39 | fi 40 | 41 | INPUT_TABLE='publicdata:samples.shakespeare' 42 | INPUT_TABLE_FIELD='word' 43 | JAR='bigquery_wordcount.jar' 44 | 45 | # Check for existence of jar 46 | if ! [[ -r ${JAR} ]]; then 47 | echo "Error. Could not find jar: ${JAR}" >&2 48 | exit 1 49 | fi 50 | 51 | # Perform word count MapReduce on README.txt 52 | hadoop jar ${JAR} ${PROJECT} ${INPUT_TABLE} ${INPUT_TABLE_FIELD} ${OUTPUT_TABLE} 53 | 54 | echo 'Word count finished successfully.' \ 55 | "Manually clean up with 'bq rm ${OUTPUT_TABLE}'" 56 | if (( ${CREATED_DATASET} )); then 57 | echo "To delete entire dataset: 'bq rm -r ${OUTPUT_DATASET}'" 58 | fi 59 | -------------------------------------------------------------------------------- /samples/word_count_mapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Mapper for use with hadoop-streaming bigquery word-count example. 15 | 16 | Reads each line of input and writes out lines each containing 17 | a single word and the number 1. 18 | The input lines consist of two tab-separated fields: 19 | 1. the record number 20 | 2. JSON data 21 | We pick one field of the JSON and use its value as the word to output. 22 | """ 23 | 24 | import re 25 | import sys 26 | 27 | 28 | def main(args): 29 | # Set up the pattern that we use to extract our field 30 | field_name = args[1] 31 | field_pattern = '\\{.*"(' + field_name + ')":"([^"]*)".*\\}' 32 | field_extractor = re.compile(field_pattern) 33 | 34 | for line in sys.stdin: 35 | line = line.strip() 36 | key_and_json = line.split('\t', 1) 37 | json = key_and_json[1] 38 | matches = field_extractor.match(json) 39 | if matches: 40 | word = matches.group(2) 41 | if word: 42 | print '%s\t%s' % (word, 1) 43 | 44 | 45 | if __name__ == '__main__': 46 | main(sys.argv) 47 | -------------------------------------------------------------------------------- /samples/word_count_reducer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Reducer for use with hadoop-streaming word-count example. 15 | 16 | Reads each line of input, sums the counts for each word, 17 | outputs a line with word and total count for each word. 18 | The input is assumed to be sorted by word. 19 | """ 20 | 21 | from __future__ import print_function 22 | 23 | import re 24 | import sys 25 | 26 | current_word = None 27 | current_count = 0 28 | output_json = False 29 | 30 | 31 | def print_word_and_count(word, count): 32 | word = re.sub('"', "'", word) # replace double-quotes with single-quotes 33 | if output_json: 34 | print('0\t{"Word": "%s", "Count": %d}' % (word, count)) 35 | # When streaming out to BigQuery, this key (0 here) is ignored. 36 | else: 37 | print('%s\t%s' % (word, count)) 38 | 39 | 40 | def next_word(word, count): 41 | global current_word, current_count 42 | if current_word: 43 | print_word_and_count(current_word, current_count) 44 | current_word = word 45 | current_count = count 46 | 47 | 48 | def main(args): 49 | global current_count 50 | global output_json 51 | 52 | if len(args) > 1: 53 | if args[1] == '--output_json': 54 | output_json = True 55 | else: 56 | print("Unknown command line option '%s'" % args[1], file=sys.stderr) 57 | sys.exit(2) 58 | 59 | for line in sys.stdin: 60 | line = line.strip() 61 | word, count_string = line.split('\t', 1) 62 | 63 | try: 64 | count = int(count_string) 65 | except ValueError: 66 | continue # ignore lines that are not formatted correctly 67 | 68 | if word == current_word: 69 | current_count += count 70 | else: 71 | next_word(word, count) 72 | 73 | next_word(None, 0) 74 | 75 | if __name__ == '__main__': 76 | main(sys.argv) 77 | -------------------------------------------------------------------------------- /single_node_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2013 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # This file contains environment-variable overrides to be used in conjunction 16 | # with bdutil_env.sh in order to deploy a single-node Hadoop cluster. 17 | # Usage: ./bdutil deploy -e single_node_env.sh 18 | 19 | NUM_WORKERS=1 20 | 21 | # A single-node setup is much more likely to be used for development, so install 22 | # JDK with compiler/tools instead of just the minimal JRE. 23 | INSTALL_JDK_DEVEL=true 24 | 25 | # Save away the base evaluate_late_variable_bindings function so we can 26 | # override it. 27 | copy_func evaluate_late_variable_bindings old_evaluate_late_variable_bindings 28 | 29 | function evaluate_late_variable_bindings() { 30 | # Stash away the old value here so we can differentiate between whether the 31 | # user overrides set it or we just resolved it in the base implementation 32 | # of evaluate_late_variable_bindings. 33 | local old_nfs_master_hostname="${GCS_CACHE_MASTER_HOSTNAME}" 34 | 35 | old_evaluate_late_variable_bindings 36 | 37 | # In the case of the single-node cluster, we'll just use the whole PREFIX 38 | # as the name of the master and worker. 39 | WORKERS[0]=${PREFIX} 40 | MASTER_HOSTNAME=${PREFIX} 41 | WORKER_ATTACHED_PDS[0]="${PREFIX}-pd" 42 | MASTER_ATTACHED_PD="${PREFIX}-pd" 43 | 44 | # Fully qualified HDFS URI of namenode 45 | NAMENODE_URI="hdfs://${MASTER_HOSTNAME}:8020/" 46 | 47 | # Host and port of jobtracker 48 | JOB_TRACKER_URI="${MASTER_HOSTNAME}:9101" 49 | 50 | # GCS directory for deployment-related temporary files. 51 | local staging_dir_base="gs://${CONFIGBUCKET}/bdutil-staging" 52 | BDUTIL_GCS_STAGING_DIR="${staging_dir_base}/${MASTER_HOSTNAME}" 53 | 54 | # Default NFS cache host is the master node, but it can be overriden to point 55 | # at an NFS server off-cluster. 56 | if [[ -z "${old_nfs_master_hostname}" ]]; then 57 | GCS_CACHE_MASTER_HOSTNAME="${MASTER_HOSTNAME}" 58 | fi 59 | 60 | # Since $WORKERS and $MASTER_HOSTNAME both refer to the same single-node 61 | # VM, we must override COMMAND_STEPS to prevent duplicating steps. We also 62 | # omit deploy-ssh-worker-setup because there is no need to copy SSH keys to 63 | # the localhost. 64 | COMMAND_STEPS=(${COMMAND_STEPS[@]/,*/,*}) 65 | } 66 | -------------------------------------------------------------------------------- /standalone_nfs_cache_env.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Google Inc. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS-IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Handy wrapper around single_node_env.sh to turn up just a single server 16 | # capable of acting as the NFS-based GCS consistency cache for multiple 17 | # other clusters. 18 | # 19 | # Usage: 20 | # ./bdutil -P my-nfs-server -p -z -b generate_config my-nfs-server_env.sh 21 | # ./bdutil -e my-nfs-server_env.sh deploy 22 | # 23 | # ./bdutil -P cluster1 -p -z -b generate_config cluster1_env.sh 24 | # echo GCS_CACHE_MASTER_HOSTNAME=my-nfs-server >> cluster1_env.sh 25 | # ./bdutil -e cluster1_env.sh deploy 26 | # 27 | # ./bdutil -P cluster2 -p -z -b generate_config cluster2_env.sh 28 | # echo GCS_CACHE_MASTER_HOSTNAME=my-nfs-server >> cluster2_env.sh 29 | # ./bdutil -e cluster2_env.sh deploy 30 | # 31 | # ./bdutil -e cluster2_env.sh delete 32 | # ./bdutil -e cluster1_env.sh delete 33 | # ./bdutil -e my-nfs-server_env.sh delete 34 | 35 | # Start with single_node_env.sh to get all the MASTER_HOSTNAME, etc., 36 | # resolution. 37 | import_env single_node_env.sh 38 | 39 | # This server would be somewhat pointless without the GCS connector and the 40 | # NFS cache enabled. 41 | INSTALL_GCS_CONNECTOR=true 42 | DEFAULT_FS='gs' 43 | ENABLE_NFS_GCS_FILE_CACHE=true 44 | 45 | # We'll set up Hadoop as normal since it'll be handy to have "hadoop fs -ls" 46 | # on the cache server, but we just won't configure the hadoop daemons to start 47 | # on boot, and won't start them explicitly during deployment. That means 48 | # no jobracker or resourcemanager or namenode, but we should still be able to 49 | # use "hadoop fs" against GCS just fine. 50 | COMMAND_GROUPS+=( 51 | "deploy-standalone-nfs-cache: 52 | libexec/install_java.sh 53 | libexec/mount_disks.sh 54 | libexec/setup_hadoop_user.sh 55 | libexec/install_hadoop.sh 56 | libexec/install_bdconfig.sh 57 | libexec/configure_hadoop.sh 58 | libexec/install_and_configure_gcs_connector.sh 59 | libexec/configure_hdfs.sh 60 | libexec/set_default_fs.sh 61 | libexec/setup_master_nfs.sh 62 | " 63 | ) 64 | 65 | COMMAND_STEPS=( 66 | "deploy-standalone-nfs-cache,*" 67 | "deploy-client-nfs-setup,*" 68 | ) 69 | --------------------------------------------------------------------------------