├── .gitignore ├── MANIFEST.in ├── README.md ├── cloud ├── __init__.py ├── cluster.py ├── decorators.py ├── exception.py ├── plugin.py ├── providers │ ├── __init__.py │ └── ec2.py ├── service.py ├── settings.py ├── storage.py └── util.py ├── example_scripts ├── cassandra-ec2-init-remote.sh ├── hadoop-cassandra-hybrid-ec2-init-remote.sh ├── hadoop-ec2-init-remote.sh ├── hbase-ec2-init-remote.sh └── zookeeper-ec2-init-remote.sh ├── plugins ├── __init__.py ├── cassandra │ ├── __init__.py │ ├── cli.plugin │ ├── cli.py │ ├── service.plugin │ └── service.py ├── hadoop │ ├── __init__.py │ ├── cli.plugin │ ├── cli.py │ ├── service.plugin │ └── service.py ├── hadoop_cassandra_hybrid │ ├── __init__.py │ ├── cli.plugin │ ├── cli.py │ ├── service.plugin │ └── service.py └── simple │ ├── __init__.py │ ├── cli.plugin │ ├── cli.py │ ├── service.plugin │ └── service.py ├── setup.py └── stratus /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | *.pyc 4 | 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include example_scripts/ * 2 | recursive-include plugins/ * 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This work was originally derived from the Cloudera CDH Cloud Scripts for managing Hadoop in 2 | Amazon EC2 (https://wiki.cloudera.com/display/DOC/CDH+Cloud+Scripts). We needed a way to 3 | manage Hadoop, Cassandra, and other distributed services, thus PyStratus was born. Thanks 4 | to Cloudera for providing a great starting point for us! Currently only Amazon EC2 is supported, 5 | but we hope to add new cloud providers very soon. 6 | 7 | To get up and running quickly, use virtualenv and install PyStratus with these instructions: 8 | ```code 9 | $ mkvirtualenv stratus 10 | (stratus)$ pip install https://github.com/digitalreasoning/PyStratus/archive/master.zip 11 | ... 12 | # issue commands like: 13 | (stratus)$ stratus list 14 | (stratus)$ stratus exec HADOOP_CLUSTER launch-cluster 3 15 | (stratus)$ stratus exec HADOOP_CLUSTER terminate-cluster 16 | ... 17 | (stratus)$ deactivate # to leave virtualenv 18 | ``` 19 | 20 | 21 | Additionally, the following script is sufficient (assumes that you have a ~/bin directory and it is on your PATH): 22 | 23 | ```code 24 | INSTALL_DIR=~/Tools/pystratus 25 | virtualenv $INSTALL_DIR --no-site-packages 26 | $INSTALL_DIR/bin/pip install https://github.com/digitalreasoning/PyStratus/archive/master.zip 27 | ln -snf $INSTALL_DIR/bin/stratus ~/bin/stratus 28 | ``` 29 | 30 | PyStratus uses the following dependencies: 31 | 32 | * Python 2.5+ 33 | * boto 34 | * simplejson 35 | * prettytable 36 | * setuptools 37 | * dateutil 38 | * PyYAML 39 | * cElementTree or elementtree 40 | * Fabric 41 | 42 | You may also check out the project and run "python setup.py install"; the command "stratus" will then be available 43 | and an egg file will be located in your site-packages directory. You may want to run the command with 44 | sudo to install it for all users.
45 | 46 | See the full documentation at http://github.com/digitalreasoning/PyStratus/wiki/Documentation 47 | 48 | 49 | -------------------------------------------------------------------------------- /cloud/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | VERSION = "0.8.30" 16 | -------------------------------------------------------------------------------- /cloud/cluster.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ 17 | Classes for controlling a cluster of cloud instances. 18 | """ 19 | 20 | from __future__ import with_statement 21 | 22 | import gzip 23 | import StringIO 24 | import urllib 25 | import providers 26 | 27 | from cloud.storage import Storage 28 | 29 | CLUSTER_PROVIDER_MAP = {} 30 | 31 | def _build_provider_map() : 32 | from pkgutil import iter_modules 33 | it = iter_modules(providers.__path__, 'providers.') 34 | for module in it : 35 | try : 36 | provider = __import__(module[1], globals(), locals(), ['CLOUD_PROVIDER']).CLOUD_PROVIDER 37 | except : 38 | pass 39 | else : 40 | CLUSTER_PROVIDER_MAP[provider[0]] = provider[1] 41 | 42 | def get_cluster(provider): 43 | """ 44 | Retrieve the Cluster class for a provider. 45 | """ 46 | if not len(CLUSTER_PROVIDER_MAP): 47 | _build_provider_map() 48 | mod_name, driver_name = CLUSTER_PROVIDER_MAP[provider] 49 | _mod = __import__(mod_name, globals(), locals(), [driver_name]) 50 | return getattr(_mod, driver_name) 51 | 52 | class Cluster(object): 53 | """ 54 | A cluster of server instances. A cluster has a unique name. 55 | One may launch instances which run in a certain role. 
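    For illustration only (the cluster name, configuration directory, and region below are
    made-up values), a concrete Cluster implementation is normally obtained through the
    get_cluster() function defined above rather than by instantiating this base class:

        cluster_class = get_cluster("ec2")
        cluster = cluster_class("my-cluster", "/path/to/config", "us-east-1")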
56 | """ 57 | 58 | def __init__(self, name, config_dir, region): 59 | self.name = name 60 | self.config_dir = config_dir 61 | self.region = region 62 | 63 | def get_provider_code(self): 64 | """ 65 | The code that uniquely identifies the cloud provider. 66 | """ 67 | raise Exception("Unimplemented") 68 | 69 | def authorize_role(self, role, from_port, to_port, cidr_ip): 70 | """ 71 | Authorize access to machines in a given role from a given network. 72 | """ 73 | pass 74 | 75 | def get_instances_in_role(self, role, state_filter=None): 76 | """ 77 | Get all the instances in a role, filtered by state. 78 | 79 | @param role: the name of the role 80 | @param state_filter: the state that the instance should be in 81 | (e.g. "running"), or None for all states 82 | """ 83 | raise Exception("Unimplemented") 84 | 85 | def print_status(self, roles=None, state_filter="running"): 86 | """ 87 | Print the status of instances in the given roles, filtered by state. 88 | """ 89 | pass 90 | 91 | def check_running(self, role, number): 92 | """ 93 | Check that a certain number of instances in a role are running. 94 | """ 95 | instances = self.get_instances_in_role(role, "running") 96 | if len(instances) != number: 97 | print "Expected %s instances in role %s, but was %s %s" % \ 98 | (number, role, len(instances), instances) 99 | return False 100 | else: 101 | return instances 102 | 103 | def launch_instances(self, roles, number, image_id, size_id, 104 | instance_user_data, **kwargs): 105 | """ 106 | Launch instances (having the given roles) in the cluster. 107 | Returns a list of IDs for the instances started. 108 | """ 109 | pass 110 | 111 | def wait_for_instances(self, instance_ids, timeout=600): 112 | """ 113 | Wait for instances to start. 114 | Raise TimeoutException if the timeout is exceeded. 115 | """ 116 | pass 117 | 118 | def terminate(self): 119 | """ 120 | Terminate all instances in the cluster. 121 | """ 122 | pass 123 | 124 | def delete(self): 125 | """ 126 | Delete the cluster permanently. This operation is only permitted if no 127 | instances are running. 128 | """ 129 | pass 130 | 131 | def get_storage(self): 132 | """ 133 | Return the external storage for the cluster. 134 | """ 135 | return Storage(self) 136 | 137 | class InstanceUserData(object): 138 | """ 139 | The data passed to an instance on start up. 140 | """ 141 | 142 | def __init__(self, filename, replacements={}): 143 | self.filename = filename 144 | self.replacements = replacements 145 | 146 | def _read_file(self, filename): 147 | """ 148 | Read the user data. 149 | """ 150 | return urllib.urlopen(filename).read() 151 | 152 | def read(self): 153 | """ 154 | Read the user data, making replacements. 155 | """ 156 | contents = self._read_file(self.filename) 157 | for (match, replacement) in self.replacements.iteritems(): 158 | if replacement == None: 159 | replacement = '' 160 | contents = contents.replace(match, replacement) 161 | return contents 162 | 163 | def read_as_gzip_stream(self): 164 | """ 165 | Read and compress the data. 166 | """ 167 | output = StringIO.StringIO() 168 | compressed = gzip.GzipFile(mode='wb', fileobj=output) 169 | compressed.write(self.read()) 170 | compressed.close() 171 | return output.getvalue() 172 | 173 | class Instance(object): 174 | """ 175 | A server instance. 
176 | """ 177 | def __init__(self, id, role, public_ip, private_ip, launch_time, instance_type, zone): 178 | self.id = id 179 | self.role = role 180 | self.public_ip = public_ip 181 | self.private_ip = private_ip 182 | self.launch_time = launch_time 183 | self.instance_type = instance_type 184 | self.zone = zone 185 | 186 | class RoleSyntaxException(Exception): 187 | """ 188 | Raised when a role name is invalid. Role names may consist of a sequence 189 | of alphanumeric characters and underscores. Dashes are not permitted in role 190 | names. 191 | """ 192 | def __init__(self, message): 193 | super(RoleSyntaxException, self).__init__() 194 | self.message = message 195 | def __str__(self): 196 | return repr(self.message) 197 | 198 | class TimeoutException(Exception): 199 | """ 200 | Raised when a timeout is exceeded. 201 | """ 202 | pass 203 | 204 | class InstanceTerminatedException(Exception): 205 | """ 206 | Raised when an instance that should start goes to a terminated state. 207 | """ 208 | pass 209 | -------------------------------------------------------------------------------- /cloud/decorators.py: -------------------------------------------------------------------------------- 1 | import signal 2 | from cloud.cluster import TimeoutException 3 | 4 | def timeout(seconds_before_timeout): 5 | """ 6 | Borrowed from http://www.saltycrane.com/blog/2010/04/using-python-timeout-decorator-uploading-s3/ 7 | """ 8 | def decorate(f): 9 | def handler(signum, frame): 10 | raise TimeoutException() 11 | def new_f(*args, **kwargs): 12 | old = signal.signal(signal.SIGALRM, handler) 13 | signal.alarm(seconds_before_timeout) 14 | try: 15 | result = f(*args, **kwargs) 16 | finally: 17 | signal.signal(signal.SIGALRM, old) 18 | signal.alarm(0) 19 | return result 20 | new_f.func_name = f.func_name 21 | return new_f 22 | return decorate 23 | 24 | -------------------------------------------------------------------------------- /cloud/exception.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
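The timeout decorator in cloud/decorators.py above wraps a call with a SIGALRM-based time limit and raises TimeoutException when the limit is hit. A minimal usage sketch follows; the wait_for_ssh function, hostname, and 30-second limit are illustrative assumptions, not part of the project:

```code
from cloud.decorators import timeout
from cloud.cluster import TimeoutException

@timeout(30)  # raise TimeoutException if the call takes longer than 30 seconds (SIGALRM, Unix only)
def wait_for_ssh(host):
    # placeholder for a blocking check, e.g. polling until sshd answers on the host
    pass

try:
    wait_for_ssh("ec2-203-0-113-10.compute-1.amazonaws.com")
except TimeoutException:
    print "Timed out waiting for SSH"
```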
15 | 16 | class VolumesStillInUseException(Exception): 17 | pass 18 | 19 | class InvalidSpotConfigurationException(Exception): 20 | pass 21 | -------------------------------------------------------------------------------- /cloud/plugin.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import os 3 | import subprocess 4 | import sys 5 | import logging 6 | import time 7 | 8 | from optparse import OptionParser 9 | from optparse import make_option 10 | from yapsy.IPlugin import IPlugin 11 | from prettytable import PrettyTable 12 | 13 | from cloud.cluster import InstanceUserData 14 | from cloud.util import xstr 15 | from cloud.util import build_env_string 16 | from cloud.exception import VolumesStillInUseException 17 | 18 | from cloud import VERSION 19 | 20 | CONFIG_DIR_OPTION = \ 21 | make_option("--config-dir", metavar="CONFIG-DIR", 22 | help="The configuration directory.") 23 | 24 | PROVIDER_OPTION = \ 25 | make_option("--cloud-provider", metavar="PROVIDER", 26 | help="The cloud provider, e.g. 'ec2' for Amazon EC2.") 27 | 28 | AVAILABILITY_ZONE_OPTION = \ 29 | make_option("-z", "--availability-zone", metavar="ZONE", 30 | help="The availability zone to run the instances in.") 31 | 32 | REGION_OPTION = \ 33 | make_option("-r", "--region", metavar="REGION", 34 | help="The region to run the instances in.") 35 | 36 | FORCE_OPTION = \ 37 | make_option("--force", metavar="FORCE", 38 | action="store_true", default=False, 39 | help="Force the command without prompting.") 40 | 41 | BASIC_OPTIONS = [ 42 | CONFIG_DIR_OPTION, 43 | PROVIDER_OPTION, 44 | AVAILABILITY_ZONE_OPTION, 45 | REGION_OPTION, 46 | ] 47 | 48 | class CLIPlugin(IPlugin): 49 | """ 50 | """ 51 | USAGE = None 52 | 53 | def __init__(self, service=None): 54 | self.service = service 55 | self.logger = logging #logging.getLogger(self.__class__.__name__) 56 | 57 | def print_help(self, exitCode=1): 58 | if self.USAGE is None: 59 | raise RuntimeError("USAGE has not been defined.") 60 | 61 | print self.USAGE 62 | sys.exit(exitCode) 63 | 64 | def parse_options(self, command, argv, option_list=[], expected_arguments=[], 65 | unbounded_args=False): 66 | """ 67 | Parse the arguments to command using the given option list. 68 | 69 | If unbounded_args is true then there must be at least as many arguments 70 | as specified by expected_arguments (the first argument is always CLUSTER). 71 | Otherwise there must be exactly the same number of arguments as 72 | expected_arguments. 73 | """ 74 | 75 | usage = "%%prog CLUSTER [options] %s" % \ 76 | (" ".join([command] + expected_arguments[:]),) 77 | 78 | parser = OptionParser(usage=usage, version="%%prog %s" % VERSION, 79 | option_list=option_list) 80 | 81 | parser.disable_interspersed_args() 82 | (options, args) = parser.parse_args(argv) 83 | if unbounded_args: 84 | if len(args) < len(expected_arguments): 85 | parser.error("incorrect number of arguments") 86 | elif len(args) != len(expected_arguments): 87 | parser.error("incorrect number of arguments") 88 | 89 | return (vars(options), args) 90 | 91 | def _prompt(self, prompt): 92 | """ 93 | Returns true if user responds "yes" to prompt. 94 | """ 95 | return raw_input("%s [yes or no]: " % prompt).lower() == "yes" 96 | 97 | def execute_command(self, argv, options_dict): 98 | """ 99 | Should be overridden by the subclass to handle 100 | command specific options.
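        A purely illustrative override is sketched below; the command name and argument
        splitting are assumptions, and the real plugins under plugins/*/cli.py supply
        their own dispatch:

            def execute_command(self, argv, options_dict):
                command = argv[0]
                if command == "terminate-cluster":
                    self.terminate_cluster(argv[1:], options_dict)
                else:
                    self.print_help()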
101 | """ 102 | raise RuntimeError("Not implemented.") 103 | 104 | def create_storage(self, argv, options_dict): 105 | raise RuntimeError("Not implemented.") 106 | 107 | def terminate_cluster(self, argv, options_dict): 108 | opt, args = self.parse_options(self._command_name, argv, [FORCE_OPTION]) 109 | 110 | if not self.service.get_instances(): 111 | print "No running instances. Aborting." 112 | return 113 | 114 | if opt.get("force"): 115 | print "Terminating cluster..." 116 | self.service.terminate_cluster() 117 | else: 118 | self.print_instances() 119 | if not self._prompt("Terminate all instances?"): 120 | print "Not terminating cluster." 121 | else: 122 | print "Terminating cluster..." 123 | self.service.terminate_cluster() 124 | 125 | def simple_print_instances(self, argv, options_dict): 126 | opt, fields = self.parse_options(self._command_name, argv, expected_arguments=['FIELD*'], unbounded_args=True) 127 | 128 | for instance in self.service.get_instances(): 129 | print("|".join([instance.__getattribute__(field) for field in fields])) 130 | 131 | def print_instances(self): 132 | if not self.service.get_instances(): 133 | print "No running instances. Aborting." 134 | return 135 | 136 | table = PrettyTable() 137 | table.set_field_names(("Role", "Instance Id", "Image Id", 138 | "Public DNS", "Private DNS", "State", 139 | "Key", "Instance Type", "Launch Time", 140 | "Zone", "Region")) 141 | 142 | for i in self.service.get_instances(): 143 | table.add_row(( 144 | i.role, i.id, i.image_id, i.public_dns_name, 145 | i.private_dns_name, i.state, i.key_name, i.instance_type, 146 | i.launch_time, i.placement, i.region.name)) 147 | 148 | table.printt() 149 | 150 | def print_storage(self): 151 | storage = self.service.get_storage() 152 | 153 | table = PrettyTable() 154 | table.set_field_names(("Role", "Instance ID", "Volume Id", 155 | "Volume Size", "Snapshot Id", "Zone", 156 | "Status", "Device", "Create Time", 157 | "Attach Time")) 158 | 159 | for (r, v) in storage.get_volumes(): 160 | table.add_row((r, v.attach_data.instance_id, v.id, 161 | str(v.size), v.snapshot_id, v.zone, 162 | "%s / %s" % (v.status, v.attach_data.status), 163 | v.attach_data.device, str(v.create_time), 164 | str(v.attach_data.attach_time))) 165 | 166 | if len(table.rows) > 0: 167 | s = 0 168 | for r in table.rows: 169 | s += int(r[3]) 170 | 171 | table.printt() 172 | print "Total volumes: %d" % len(table.rows) 173 | print "Total size: %d" % s 174 | else: 175 | print "No volumes defined." 176 | 177 | def delete_storage(self, argv, options_dict): 178 | opt, args = self.parse_options(self._command_name, argv, [FORCE_OPTION]) 179 | 180 | storage = self.service.get_storage() 181 | volumes = storage.get_volumes() 182 | 183 | if not volumes: 184 | print "No volumes defined." 185 | sys.exit() 186 | 187 | if opt.get('force'): 188 | print "Deleting storage..." 189 | try: 190 | storage.delete(storage.get_roles()) 191 | except VolumesStillInUseException, e: 192 | print e.message 193 | sys.exit(1) 194 | else: 195 | self.print_storage() 196 | if not self._prompt("Delete all storage volumes? THIS WILL PERMANENTLY DELETE ALL DATA"): 197 | print "Not deleting storage." 198 | else: 199 | print "Deleting storage..." 200 | try: 201 | storage.delete(storage.get_roles()) 202 | except VolumesStillInUseException, e: 203 | print e.message 204 | sys.exit(1) 205 | 206 | def login(self, argv, options_dict): 207 | """ 208 | """ 209 | instances = self.service.get_instances() 210 | if not instances: 211 | print "No running instances. Aborting." 
212 | return 213 | 214 | table = PrettyTable() 215 | table.set_field_names(("", "ROLE", "INSTANCE ID", "PUBLIC IP", "PRIVATE IP")) 216 | 217 | for instance in instances: 218 | table.add_row((len(table.rows)+1, 219 | instance.role, 220 | instance.id, 221 | instance.public_dns_name, 222 | instance.private_dns_name)) 223 | 224 | table.printt() 225 | 226 | while True: 227 | try: 228 | choice = raw_input("Instance to login to [Enter = quit]: ") 229 | if choice == "": 230 | sys.exit(0) 231 | choice = int(choice) 232 | if choice > 0 and choice <= len(table.rows): 233 | instance = instances[choice-1] 234 | self.service.login(instance, options_dict.get('ssh_options')) 235 | break 236 | else: 237 | print "Not a valid choice. Try again." 238 | except ValueError: 239 | print "Not a valid choice. Try again." 240 | 241 | def transfer_files(self, argv, options_dict): 242 | opt, args = self.parse_options(self._command_name, argv, expected_arguments=['FILE_NAME*'], unbounded_args=True) 243 | result = self.service.transfer_files(args, options_dict.get('ssh_options')) 244 | 245 | table = PrettyTable() 246 | table.set_field_names(("INSTANCE ID", "PUBLIC IP", "PRIVATE IP", "FILE NAME", "RESULT")) 247 | for instance, file, retcode in result: 248 | table.add_row((instance.id, 249 | instance.public_dns_name, 250 | instance.private_dns_name, 251 | file, 252 | retcode 253 | )) 254 | table.printt() 255 | 256 | def run_command(self, argv, options_dict): 257 | opt, args = self.parse_options(self._command_name, argv, expected_arguments=['COMMAND']) 258 | result = self.service.run_command(args[0], options_dict.get('ssh_options')) 259 | 260 | table = PrettyTable() 261 | table.set_field_names(("INSTANCE ID", "PUBLIC IP", "PRIVATE IP", "RESULT")) 262 | for instance, retcode in result: 263 | table.add_row((instance.id, 264 | instance.public_dns_name, 265 | instance.private_dns_name, 266 | retcode 267 | )) 268 | table.printt() 269 | 270 | 271 | 272 | class ServicePlugin(object): 273 | def __init__(self, cluster=None): 274 | self.cluster = cluster 275 | self.logger = logging #logging.getLogger(self.__class__.__name__) 276 | 277 | def get_roles(self): 278 | """ 279 | Returns a list of role identifiers for this service type. 280 | """ 281 | raise RuntimeError("Not implemented.") 282 | 283 | def get_instances(self): 284 | """ 285 | Returns a list of running Instance objects from the cluster 286 | 287 | self.cluster.get_instances_in_role(ROLE, "running") 288 | """ 289 | raise RuntimeError("Not implemented.") 290 | 291 | def launch_cluster(self): 292 | raise RuntimeError("Not implemented.") 293 | 294 | def terminate_cluster(self): 295 | """ 296 | Terminates all instances in the cluster 297 | """ 298 | # TODO: Clear all tags 299 | self.logger.info("Terminating cluster") 300 | self.cluster.terminate() 301 | 302 | def get_storage(self): 303 | return self.cluster.get_storage() 304 | 305 | def print_storage_status(self): 306 | storage = self.get_storage() 307 | if not os.path.isfile(storage._get_storage_filename()): 308 | storage.print_status(volumes=self._get_cluster_volumes(storage)) 309 | else: 310 | storage.print_status() 311 | 312 | def _get_standard_ssh_command(self, instance, ssh_options, remote_command=None): 313 | """ 314 | Returns the complete SSH command ready for execution on the instance. 
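        For example (the key path and hostname are hypothetical), a call such as

            self._get_standard_ssh_command(instance, "-i ~/.ssh/mykey.pem", "uptime")

        returns a string of the form

            ssh -i ~/.ssh/mykey.pem ec2-203-0-113-10.compute-1.amazonaws.com 'uptime'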
315 | """ 316 | cmd = "ssh %s %s" % (xstr(ssh_options), instance.public_dns_name) 317 | 318 | if remote_command is not None: 319 | cmd += " '%s'" % remote_command 320 | 321 | return cmd 322 | 323 | def _attach_storage(self, roles): 324 | storage = self.cluster.get_storage() 325 | if storage.has_any_storage(roles): 326 | print "Waiting 10 seconds before attaching storage" 327 | time.sleep(10) 328 | for role in roles: 329 | storage.attach(role, self.cluster.get_instances_in_role(role, 'running')) 330 | storage.print_status(roles) 331 | 332 | def _launch_instances(self, instance_template, exclude_roles=[]): 333 | it = instance_template 334 | user_data_file_template = it.user_data_file_template 335 | 336 | if it.user_data_file_template == None: 337 | user_data_file_template = self._get_default_user_data_file_template() 338 | 339 | ebs_mappings = [] 340 | storage = self.cluster.get_storage() 341 | for role in it.roles: 342 | if role in exclude_roles: 343 | continue 344 | if storage.has_any_storage((role,)): 345 | ebs_mappings.append(storage.get_mappings_string_for_role(role)) 346 | 347 | replacements = { 348 | "%ENV%": build_env_string(it.env_strings, { 349 | "ROLES": ",".join(it.roles), 350 | "USER_PACKAGES": it.user_packages, 351 | "AUTO_SHUTDOWN": it.auto_shutdown, 352 | "EBS_MAPPINGS": ";".join(ebs_mappings), 353 | }) 354 | } 355 | self.logger.debug("EBS Mappings: %s" % ";".join(ebs_mappings)) 356 | instance_user_data = InstanceUserData(user_data_file_template, replacements) 357 | 358 | self.logger.debug("InstanceUserData gzipped length: %d" % len(instance_user_data.read_as_gzip_stream())) 359 | 360 | instance_ids = self.cluster.launch_instances(it.roles, 361 | it.number, 362 | it.image_id, 363 | it.size_id, 364 | instance_user_data, 365 | key_name=it.key_name, 366 | public_key=it.public_key, 367 | placement=it.placement, 368 | security_groups=it.security_groups, 369 | spot_config=it.spot_config) 370 | 371 | self.logger.debug("Instance ids reported to start: %s" % str(instance_ids)) 372 | return instance_ids 373 | 374 | def delete_storage(self, force=False): 375 | storage = self.cluster.get_storage() 376 | self._print_storage_status(storage) 377 | if not force and not self._prompt("Delete all storage volumes? THIS WILL \ 378 | PERMANENTLY DELETE ALL DATA"): 379 | print "Not deleting storage volumes." 
380 | else: 381 | print "Deleting storage" 382 | storage.delete(storage.get_roles()) 383 | 384 | def create_storage(self, role, number_of_instances, availability_zone, spec_file): 385 | storage = self.get_storage() 386 | storage.create(role, number_of_instances, availability_zone, spec_file) 387 | 388 | def run_command(self, command, ssh_options): 389 | instances = self.get_instances() 390 | ssh_commands = [self._get_standard_ssh_command(instance, ssh_options=ssh_options, remote_command=command) 391 | for instance in instances] 392 | procs = [subprocess.Popen(ssh_command, shell=True) for ssh_command in ssh_commands] 393 | retcodes = [proc.wait() for proc in procs] 394 | return zip(instances, retcodes) 395 | 396 | def _get_transfer_command(self, instance, file_name, ssh_options): 397 | transfer_command = "scp %s %s %s:" % (xstr(ssh_options), file_name, instance.public_dns_name) 398 | # transfer_command = self._get_standard_ssh_command(instance, ssh_options, "cat > %s" % file_name) + " < %s" % file_name 399 | self.logger.debug("Transfer command: %s" % transfer_command) 400 | return transfer_command 401 | 402 | def transfer_files(self, file_names, ssh_options): 403 | instances = self.get_instances() 404 | operations = list(itertools.product(instances, file_names)) 405 | ssh_commands = [self._get_transfer_command(instance, file_name, ssh_options) for instance, file_name in 406 | operations] 407 | procs = [subprocess.Popen(ssh_command, shell=True) for ssh_command in ssh_commands] 408 | retcodes = [proc.wait() for proc in procs] 409 | return [(operation[0], operation[1], retcode) for operation, retcode in zip(operations, retcodes)] 410 | 411 | def login(self, instance, ssh_options): 412 | ssh_command = self._get_standard_ssh_command(instance, ssh_options) 413 | subprocess.call(ssh_command, shell=True) 414 | 415 | -------------------------------------------------------------------------------- /cloud/providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. -------------------------------------------------------------------------------- /cloud/service.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ 17 | Classes for running services on a cluster. 18 | """ 19 | 20 | from __future__ import with_statement 21 | 22 | from cloud.settings import SERVICE_PROVIDER_MAP 23 | from cloud.cluster import get_cluster 24 | from cloud.cluster import InstanceUserData 25 | from cloud.cluster import TimeoutException 26 | from cloud.providers.ec2 import Ec2Storage 27 | from cloud.util import build_env_string 28 | from cloud.util import url_get 29 | from cloud.util import xstr 30 | from prettytable import PrettyTable 31 | from datetime import datetime 32 | import logging 33 | import types 34 | import os 35 | import re 36 | import socket 37 | import subprocess 38 | import sys 39 | import time 40 | import tempfile 41 | import simplejson 42 | 43 | logger = logging.getLogger(__name__) 44 | 45 | class InstanceTemplate(object): 46 | """ 47 | A template for creating server instances in a cluster. 48 | """ 49 | def __init__(self, roles, number, image_id, size_id, 50 | key_name, public_key, 51 | user_data_file_template=None, placement=None, 52 | user_packages=None, auto_shutdown=None, env_strings=[], 53 | security_groups=[], spot_config=None): 54 | self.roles = roles 55 | self.number = number 56 | self.image_id = image_id 57 | self.size_id = size_id 58 | self.key_name = key_name 59 | self.public_key = public_key 60 | self.user_data_file_template = user_data_file_template 61 | self.placement = placement 62 | self.user_packages = user_packages 63 | self.auto_shutdown = auto_shutdown 64 | self.env_strings = env_strings 65 | self.security_groups = security_groups 66 | self.spot_config = spot_config 67 | 68 | t = type(self.security_groups) 69 | if t is types.NoneType: 70 | self.security_groups = [] 71 | elif t is types.StringType: 72 | self.security_groups = [security_groups] 73 | 74 | def add_env_strings(self, env_strings): 75 | new_env_strings = list(self.env_strings or []) 76 | new_env_strings.extend(env_strings) 77 | self.env_strings = new_env_strings 78 | 79 | def get_service(service, provider): 80 | """ 81 | Retrieve the Service class for a service and provider. 82 | """ 83 | mod_name, service_classname = SERVICE_PROVIDER_MAP[service][provider] 84 | _mod = __import__(mod_name, globals(), locals(), [service_classname]) 85 | return getattr(_mod, service_classname) 86 | -------------------------------------------------------------------------------- /cloud/settings.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. 
You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | SERVICE_PROVIDER_MAP = { 17 | "cassandra": { 18 | "ec2": ('cassandra.service', 'CassandraService') 19 | }, 20 | "hadoop": { 21 | "ec2": ('hadoop.service', 'HadoopService'), 22 | "ec2_spot": ('hadoop.service', 'HadoopService'), 23 | }, 24 | "hadoop_cassandra_hybrid": { 25 | "ec2": ('hadoop_cassandra_hybrid.service', 'HadoopCassandraHybridService') 26 | }, 27 | } 28 | 29 | -------------------------------------------------------------------------------- /cloud/storage.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ 17 | Classes for controlling external cluster storage. 18 | """ 19 | 20 | import logging 21 | import sys 22 | import simplejson as json 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | class VolumeSpec(object): 27 | """ 28 | The specification for a storage volume, encapsulating all the information 29 | needed to create a volume and ultimately mount it on an instance. 30 | """ 31 | def __init__(self, size, mount_point, device, snapshot_id): 32 | self.size = size 33 | self.mount_point = mount_point 34 | self.device = device 35 | self.snapshot_id = snapshot_id 36 | 37 | 38 | class JsonVolumeSpecManager(object): 39 | """ 40 | A container for VolumeSpecs. This object can read VolumeSpecs specified in 41 | JSON. 42 | """ 43 | def __init__(self, spec_file): 44 | self.spec = json.load(spec_file) 45 | 46 | def volume_specs_for_role(self, role): 47 | return [VolumeSpec(d["size_gb"], d["mount_point"], d["device"], 48 | d["snapshot_id"]) for d in self.spec[role]] 49 | 50 | def get_mappings_string_for_role(self, role): 51 | """ 52 | Returns a short string of the form 53 | "role,mount_point1,device1;role,mount_point2,device2;..." 54 | which is useful for passing as an environment variable. 55 | """ 56 | return ";".join(["%s,%s,%s" % (role, d["mount_point"], d["device"]) 57 | for d in self.spec[role]]) 58 | 59 | 60 | class MountableVolume(object): 61 | """ 62 | A storage volume that has been created. It may or may not have been attached 63 | or mounted to an instance. 
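    For example (illustrative values), MountableVolume("vol-12345678", "/ebs1", "/dev/sdj")
    describes a volume that should end up mounted at /ebs1 via the /dev/sdj device.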
64 | """ 65 | def __init__(self, volume_id, mount_point, device): 66 | self.volume_id = volume_id 67 | self.mount_point = mount_point 68 | self.device = device 69 | 70 | 71 | class JsonVolumeManager(object): 72 | 73 | def __init__(self, filename): 74 | self.filename = filename 75 | 76 | def _load(self): 77 | try: 78 | return json.load(open(self.filename, "r")) 79 | except IOError: 80 | logger.debug("File %s does not exist.", self.filename) 81 | return {} 82 | 83 | def _store(self, obj): 84 | return json.dump(obj, open(self.filename, "w"), sort_keys=True, indent=2) 85 | 86 | def get_roles(self): 87 | json_dict = self._load() 88 | return json_dict.keys() 89 | 90 | def add_instance_storage_for_role(self, role, mountable_volumes): 91 | json_dict = self._load() 92 | mv_dicts = [mv.__dict__ for mv in mountable_volumes] 93 | json_dict.setdefault(role, []).append(mv_dicts) 94 | self._store(json_dict) 95 | 96 | def remove_instance_storage_for_role(self, role): 97 | json_dict = self._load() 98 | del json_dict[role] 99 | self._store(json_dict) 100 | 101 | def get_instance_storage_for_role(self, role): 102 | """ 103 | Returns a list of lists of MountableVolume objects. Each nested list is 104 | the storage for one instance. 105 | """ 106 | try: 107 | json_dict = self._load() 108 | instance_storage = [] 109 | for instance in json_dict[role]: 110 | vols = [] 111 | for vol in instance: 112 | vols.append(MountableVolume(vol["volume_id"], vol["mount_point"], 113 | vol["device"])) 114 | instance_storage.append(vols) 115 | return instance_storage 116 | except KeyError: 117 | return [] 118 | 119 | class Storage(object): 120 | """ 121 | Storage volumes for a cluster. The storage is associated with a named 122 | cluster. Many clusters just have local storage, in which case this is 123 | not used. 124 | """ 125 | 126 | def __init__(self, cluster): 127 | self.cluster = cluster 128 | 129 | def create(self, role, number_of_instances, availability_zone, spec_filename): 130 | """ 131 | Create new storage volumes for instances with the given role, according to 132 | the mapping defined in the spec file. 133 | """ 134 | pass 135 | 136 | def get_mappings_string_for_role(self, role): 137 | """ 138 | Returns a short string of the form 139 | "mount_point1,device1;mount_point2,device2;..." 140 | which is useful for passing as an environment variable. 141 | """ 142 | raise Exception("Unimplemented") 143 | 144 | def has_any_storage(self, roles): 145 | """ 146 | Return True if any of the given roles has associated storage 147 | """ 148 | return False 149 | 150 | def get_roles(self): 151 | """ 152 | Return a list of roles that have storage defined. 153 | """ 154 | return [] 155 | 156 | def print_status(self, roles=None): 157 | """ 158 | Print the status of storage volumes for the given roles. 159 | """ 160 | pass 161 | 162 | def attach(self, role, instances): 163 | """ 164 | Attach volumes for a role to instances. Some volumes may already be 165 | attached, in which case they are ignored, and we take care not to attach 166 | multiple volumes to an instance. 167 | """ 168 | pass 169 | 170 | def delete(self, roles=[]): 171 | """ 172 | Permanently delete all the storage for the given roles. 173 | """ 174 | pass 175 | -------------------------------------------------------------------------------- /cloud/util.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. 
See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | """ 17 | Utility functions. 18 | """ 19 | 20 | import os 21 | import csv 22 | import time 23 | import ConfigParser 24 | import socket 25 | import urllib2 26 | import paramiko 27 | import logging 28 | 29 | from subprocess import Popen, PIPE, CalledProcessError 30 | from boto.ec2 import regions as EC2Regions 31 | from fabric.api import * 32 | 33 | FULL_HIDE = hide("running", "stdout", "stderr", "warnings") 34 | 35 | def get_ec2_connection(regionName): 36 | for region in EC2Regions(): 37 | if region.name == regionName: 38 | return region.connect() 39 | 40 | raise RuntimeError("Unknown region name: %s" % regionName) 41 | 42 | def bash_quote(text): 43 | """Quotes a string for bash, by using single quotes.""" 44 | if text == None: 45 | return "" 46 | return "'%s'" % text.replace("'", "'\\''") 47 | 48 | def bash_quote_env(env): 49 | """Quotes the value in an environment variable assignment.""" 50 | if env.find("=") == -1: 51 | return env 52 | (var, value) = env.split("=", 1) 53 | return "%s=%s" % (var, bash_quote(value)) 54 | 55 | def build_env_string(env_strings=[], pairs={}): 56 | """Build a bash environment variable assignment.""" 57 | env = '' 58 | if env_strings: 59 | for env_string in env_strings: 60 | env += "%s " % bash_quote_env(env_string) 61 | if pairs: 62 | for key, val in pairs.items(): 63 | env += "%s=%s " % (key, bash_quote(val)) 64 | return env[:-1] 65 | 66 | def get_all_cluster_names_from_config_file(config): 67 | return config.sections() 68 | 69 | def merge_config_with_options(section_name, config, options): 70 | """ 71 | Merge configuration options with a dictionary of options. 72 | Keys in the options dictionary take precedence. 73 | """ 74 | res = {} 75 | try: 76 | for (key, value) in config.items(section_name): 77 | if value.find("\n") != -1: 78 | res[key] = value.split("\n") 79 | else: 80 | res[key] = value 81 | except ConfigParser.NoSectionError: 82 | pass 83 | except ValueError, e: 84 | # incomplete format error usually means you forgot 85 | # to include the type for interpolation 86 | if "incomplete format" in e.message: 87 | msg = "Section '%s'. Double check that your formatting " \ 88 | "contains the format type after the closing parenthesis. " \ 89 | "Example: %%(foo)s" % section_name 90 | raise ConfigParser.InterpolationError(options, section_name, msg) 91 | 92 | for key in options: 93 | if options[key] != None: 94 | res[key] = options[key] 95 | return res 96 | 97 | def url_get(url, timeout=10, retries=0): 98 | """ 99 | Retrieve content from the given URL.
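    For example (the EC2 metadata URL below only resolves from within an EC2 instance),

        url_get("http://169.254.169.254/latest/meta-data/instance-type", timeout=5, retries=2)

    returns the response body as a string, retrying up to two more times on urllib2.URLError
    before raising.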
100 | """ 101 | # in Python 2.6 we can pass timeout to urllib2.urlopen 102 | socket.setdefaulttimeout(timeout) 103 | attempts = 0 104 | while True: 105 | try: 106 | return urllib2.urlopen(url).read() 107 | except urllib2.URLError: 108 | attempts = attempts + 1 109 | if attempts > retries: 110 | raise 111 | 112 | def xstr(string): 113 | """Sane string conversion: return an empty string if string is None.""" 114 | return '' if string is None else str(string) 115 | 116 | def check_output(*popenargs, **kwargs): 117 | r"""Run command with arguments and return its output as a byte string. 118 | 119 | If the exit code was non-zero it raises a CalledProcessError. The 120 | CalledProcessError object will have the return code in the returncode 121 | attribute and output in the output attribute. 122 | 123 | The arguments are the same as for the Popen constructor. Example: 124 | 125 | >>> check_output(["ls", "-l", "/dev/null"]) 126 | 'crw-rw-rw- 1 root root 1, 3 Oct 18 2007 /dev/null\n' 127 | 128 | The stdout argument is not allowed as it is used internally. 129 | To capture standard error in the result, use stderr=STDOUT. 130 | 131 | >>> check_output(["/bin/sh", "-c", 132 | ... "ls -l non_existent_file ; exit 0"], 133 | ... stderr=STDOUT) 134 | 'ls: non_existent_file: No such file or directory\n' 135 | 136 | NOTE: copied from 2.7 standard library so that we maintain our compatibility with 2.5 137 | """ 138 | if 'stdout' in kwargs: 139 | raise ValueError('stdout argument not allowed, it will be overridden.') 140 | process = Popen(stdout=PIPE, *popenargs, **kwargs) 141 | output, unused_err = process.communicate() 142 | retcode = process.poll() 143 | if retcode: 144 | cmd = kwargs.get("args") 145 | if cmd is None: 146 | cmd = popenargs[0] 147 | raise CalledProcessError(retcode, cmd) 148 | return output 149 | 150 | def log_cluster_action(config_dir, cluster_name, command, number, 151 | instance_type=None, provider=None, plugin=None): 152 | """Log details of cluster launching or termination to a csv file. 153 | """ 154 | 155 | csv_file = open(os.path.join(config_dir, "launch_log.csv"), "a+b") 156 | csv_log = csv.writer(csv_file) 157 | csv_log.writerow([cluster_name, command, number, instance_type, provider, plugin, time.strftime("%Y-%m-%d %H:%M:%S %Z")]) 158 | csv_file.close() 159 | 160 | def ssh_available(user, private_key, host, port=22, timeout=10): 161 | client = paramiko.SSHClient() 162 | 163 | # Load known host keys (e.g. ~/.ssh/known_hosts) unless user says not to. 
164 | if not env.disable_known_hosts: 165 | client.load_system_host_keys() 166 | # Unless user specified not to, accept/add new, unknown host keys 167 | if not env.reject_unknown_hosts: 168 | client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) 169 | 170 | try: 171 | client.connect( 172 | hostname=host, 173 | port=port, 174 | username=user, 175 | key_filename=private_key, 176 | timeout=timeout, 177 | allow_agent=not env.no_agent, 178 | look_for_keys=not env.no_keys 179 | ) 180 | return True 181 | except Exception, e: 182 | logging.warn(e) 183 | return False 184 | 185 | def exec_command(cmd, **kwargs): 186 | c = sudo if use_sudo() else run 187 | return c(cmd, **kwargs) 188 | 189 | def use_sudo(): 190 | return env.user != "root" 191 | -------------------------------------------------------------------------------- /example_scripts/cassandra-ec2-init-remote.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | ################################################################################ 19 | # Script that is run on each EC2 instance on boot. It is passed in the EC2 user 20 | # data, so should not exceed 16K in size after gzip compression. 21 | # 22 | # This script is executed by /etc/init.d/ec2-run-user-data, and output is 23 | # logged to /var/log/messages. 
24 | ################################################################################ 25 | 26 | set -e -x 27 | 28 | ################################################################################ 29 | # Initialize variables 30 | ################################################################################ 31 | 32 | # Substitute environment variables passed by the client 33 | export %ENV% 34 | 35 | # Write environment variables to /root/.bash_profile 36 | echo "export %ENV%" >> ~root/.bash_profile 37 | echo "export %ENV%" >> ~root/.bashrc 38 | 39 | DEFAULT_CASSANDRA_URL="http://mirror.cloudera.com/apache/cassandra/0.6.4/apache-cassandra-0.6.4-bin.tar.gz" 40 | PUBLIC_HOSTNAME=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname` 41 | CASSANDRA_HOME_ALIAS=/usr/local/apache-cassandra 42 | DEFAULT_JNA_URL="http://java.net/projects/jna/sources/svn/content/tags/3.2.7/jnalib/dist/jna.jar?rev=1182" 43 | if [ -z "$INSTALL_JNA" ]; then 44 | INSTALL_JNA=1 45 | fi 46 | if [ -z "$PUBLIC_JMX" ]; then 47 | PUBLIC_JMX=0 48 | fi 49 | 50 | function install_jna() { 51 | if [ $INSTALL_JNA -eq 0 ]; then 52 | return 53 | fi 54 | curl="curl --retry 3 --silent --show-error --fail" 55 | if [ -z "$JNA_URL" ]; then 56 | JNA_URL=$DEFAULT_JNA_URL 57 | fi 58 | 59 | $curl -o "jna.jar" $JNA_URL 60 | cp "jna.jar" $CASSANDRA_HOME_WITH_VERSION/lib 61 | rm -rf "jna.jar" 62 | } 63 | 64 | function install_cassandra() { 65 | 66 | curl="curl --retry 3 --silent --show-error --fail" 67 | if [ ! -z "$CASSANDRA_URL" ]; then 68 | DEFAULT_CASSANDRA_URL=$CASSANDRA_URL 69 | fi 70 | 71 | cassandra_tar_file=`basename $DEFAULT_CASSANDRA_URL` 72 | $curl -O $DEFAULT_CASSANDRA_URL 73 | 74 | tar zxf $cassandra_tar_file -C /usr/local 75 | rm -f $cassandra_tar_file 76 | 77 | CASSANDRA_HOME_WITH_VERSION=/usr/local/`ls -1 /usr/local | grep cassandra` 78 | 79 | echo "export CASSANDRA_HOME=$CASSANDRA_HOME_ALIAS" >> ~root/.bash_profile 80 | echo 'export PATH=$CASSANDRA_HOME/bin:$PATH' >> ~root/.bash_profile 81 | 82 | install_jna 83 | } 84 | 85 | function wait_for_mount { 86 | mount=$1 87 | device=$2 88 | 89 | mkdir -p $mount 90 | 91 | i=1 92 | echo "Attempting to mount $device" 93 | while true ; do 94 | sleep 10 95 | echo -n "$i " 96 | i=$[$i+1] 97 | mount -o defaults,noatime $device $mount || continue 98 | echo " Mounted."
99 | break; 100 | done 101 | 102 | if [ -e $mount/lost+found ]; then 103 | rm -rf $mount/lost+found 104 | fi 105 | } 106 | 107 | function configure_cassandra() { 108 | if [ -n "$EBS_MAPPINGS" ]; then 109 | # EBS_MAPPINGS is like "cn,/ebs1,/dev/sdj;cn,/ebs2,/dev/sdk" 110 | # EBS_MAPPINGS is like "ROLE,MOUNT_POINT,DEVICE;ROLE,MOUNT_POINT,DEVICE" 111 | for mapping in $(echo "$EBS_MAPPINGS" | tr ";" "\n"); do 112 | role=`echo $mapping | cut -d, -f1` 113 | mount=`echo $mapping | cut -d, -f2` 114 | device=`echo $mapping | cut -d, -f3` 115 | wait_for_mount $mount $device 116 | done 117 | fi 118 | 119 | if [ -f "$CASSANDRA_HOME_WITH_VERSION/conf/cassandra-env.sh" ] 120 | then 121 | # for cassandra 0.7.x we need to set the MAX_HEAP_SIZE env 122 | # variable so that it can be used in cassandra-env.sh on 123 | # startup 124 | if [ -z "$MAX_HEAP_SIZE" ] 125 | then 126 | JVM_OPTS="-XX:+PrintGCApplicationStoppedTime -XX:HeapDumpPath=/mnt" 127 | if [ $PUBLIC_JMX -gt 0 ]; then 128 | JVM_OPTS="$JVM_OPTS -Djava.rmi.server.hostname="$PUBLIC_HOSTNAME 129 | fi 130 | INSTANCE_TYPE=`wget -q -O - http://169.254.169.254/latest/meta-data/instance-type` 131 | case $INSTANCE_TYPE in 132 | m1.xlarge|m2.xlarge) 133 | MAX_HEAP_SIZE="10G" 134 | ;; 135 | m1.large|c1.xlarge) 136 | MAX_HEAP_SIZE="5G" 137 | ;; 138 | *) 139 | # Don't set it and let cassandra-env figure it out 140 | ;; 141 | esac 142 | 143 | # write it to the profile 144 | echo "export MAX_HEAP_SIZE=$MAX_HEAP_SIZE" >> ~root/.bash_profile 145 | echo "export MAX_HEAP_SIZE=$MAX_HEAP_SIZE" >> ~root/.bashrc 146 | echo "export JVM_OPTS=\"$JVM_OPTS\"" >> ~root/.bash_profile 147 | echo "export JVM_OPTS=\"$JVM_OPTS\"" >> ~root/.bashrc 148 | fi 149 | else 150 | write_cassandra_in_sh_file 151 | fi 152 | } 153 | 154 | function write_cassandra_in_sh_file { 155 | # for cassandra 0.6.x memory settings 156 | 157 | # configure the cassandra.in.sh script based on instance type 158 | INSTANCE_TYPE=`wget -q -O - http://169.254.169.254/latest/meta-data/instance-type` 159 | SETTINGS_FILE=$CASSANDRA_HOME_WITH_VERSION/bin/cassandra.in.sh 160 | 161 | cat > $SETTINGS_FILE <> $SETTINGS_FILE <> $SETTINGS_FILE <> $SETTINGS_FILE <> ~root/.bash_profile 34 | echo "export %ENV%" >> ~root/.bashrc 35 | 36 | DEFAULT_CASSANDRA_URL="http://mirror.cloudera.com/apache/cassandra/0.6.4/apache-cassandra-0.6.4-bin.tar.gz" 37 | CASSANDRA_HOME_ALIAS=/usr/local/apache-cassandra 38 | 39 | HADOOP_VERSION=${HADOOP_VERSION:-0.20.1} 40 | HADOOP_HOME=/usr/local/hadoop-$HADOOP_VERSION 41 | HADOOP_CONF_DIR=$HADOOP_HOME/conf 42 | 43 | PIG_VERSION=${PIG_VERSION:-0.7.0} 44 | PIG_HOME=/usr/local/pig-$PIG_VERSION 45 | PIG_CONF_DIR=$PIG_HOME/conf 46 | 47 | SELF_HOST=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname` 48 | for role in $(echo "$ROLES" | tr "," "\n"); do 49 | case $role in 50 | hybrid_nn) 51 | HYBRID_NN_HOST=$SELF_HOST 52 | ;; 53 | hybrid_jt) 54 | HYBRID_JT_HOST=$SELF_HOST 55 | ;; 56 | esac 57 | done 58 | 59 | function register_auto_shutdown() { 60 | if [ ! -z "$AUTO_SHUTDOWN" ]; then 61 | shutdown -h +$AUTO_SHUTDOWN >/dev/null & 62 | fi 63 | } 64 | 65 | # Install a list of packages on debian or redhat as appropriate 66 | function install_packages() { 67 | if which dpkg &> /dev/null; then 68 | apt-get update 69 | apt-get -y install $@ 70 | elif which rpm &> /dev/null; then 71 | yum install -y $@ 72 | else 73 | echo "No package manager found." 
74 | fi 75 | } 76 | 77 | # Install any user packages specified in the USER_PACKAGES environment variable 78 | function install_user_packages() { 79 | if [ ! -z "$USER_PACKAGES" ]; then 80 | install_packages $USER_PACKAGES 81 | fi 82 | } 83 | 84 | function install_yourkit() { 85 | mkdir /mnt/yjp 86 | YOURKIT_URL="http://www.yourkit.com/download/yjp-9.0.7-linux.tar.bz2" 87 | curl="curl --retry 3 --silent --show-error --fail" 88 | $curl -O $YOURKIT_URL 89 | yourkit_tar_file=`basename $YOURKIT_URL` 90 | tar xjf $yourkit_tar_file -C /mnt/yjp 91 | rm -f $yourkit_tar_file 92 | chown -R hadoop /mnt/yjp 93 | chgrp -R hadoop /mnt/yjp 94 | } 95 | 96 | function install_hadoop() { 97 | useradd hadoop 98 | 99 | hadoop_tar_url=http://s3.amazonaws.com/hadoop-releases/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz 100 | hadoop_tar_file=`basename $hadoop_tar_url` 101 | hadoop_tar_md5_file=`basename $hadoop_tar_url.md5` 102 | 103 | curl="curl --retry 3 --silent --show-error --fail" 104 | for i in `seq 1 3`; 105 | do 106 | $curl -O $hadoop_tar_url 107 | $curl -O $hadoop_tar_url.md5 108 | if md5sum -c $hadoop_tar_md5_file; then 109 | break; 110 | else 111 | rm -f $hadoop_tar_file $hadoop_tar_md5_file 112 | fi 113 | done 114 | 115 | if [ ! -e $hadoop_tar_file ]; then 116 | echo "Failed to download $hadoop_tar_url. Aborting." 117 | exit 1 118 | fi 119 | 120 | tar zxf $hadoop_tar_file -C /usr/local 121 | rm -f $hadoop_tar_file $hadoop_tar_md5_file 122 | 123 | echo "export HADOOP_HOME=$HADOOP_HOME" >> ~root/.bashrc 124 | echo 'export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$PATH' >> ~root/.bashrc 125 | } 126 | 127 | function install_pig() 128 | { 129 | pig_tar_url=http://mirror.cloudera.com/apache/hadoop/pig/pig-$PIG_VERSION/pig-$PIG_VERSION.tar.gz 130 | pig_tar_file=`basename $pig_tar_url` 131 | 132 | curl="curl --retry 3 --silent --show-error --fail" 133 | for i in `seq 1 3`; 134 | do 135 | $curl -O $pig_tar_url 136 | done 137 | 138 | if [ ! -e $pig_tar_file ]; then 139 | echo "Failed to download $pig_tar_url. Pig will not be installed." 140 | else 141 | tar zxf $pig_tar_file -C /usr/local 142 | rm -f $pig_tar_file 143 | 144 | if [ ! -e $HADOOP_CONF_DIR ]; then 145 | echo "Hadoop must be installed. Aborting." 146 | exit 1 147 | fi 148 | 149 | cp $HADOOP_CONF_DIR/*.xml $PIG_CONF_DIR/ 150 | 151 | echo "export PIG_HOME=$PIG_HOME" >> ~root/.bashrc 152 | echo 'export PATH=$JAVA_HOME/bin:$PIG_HOME/bin:$PATH' >> ~root/.bashrc 153 | fi 154 | } 155 | 156 | function prep_disk() { 157 | mount=$1 158 | device=$2 159 | automount=${3:-false} 160 | 161 | echo "warning: ERASING CONTENTS OF $device" 162 | mkfs.xfs -f $device 163 | if [ ! -e $mount ]; then 164 | mkdir $mount 165 | fi 166 | mount -o defaults,noatime $device $mount 167 | if $automount ; then 168 | echo "$device $mount xfs defaults,noatime 0 0" >> /etc/fstab 169 | fi 170 | } 171 | 172 | function wait_for_mount { 173 | mount=$1 174 | device=$2 175 | 176 | mkdir $mount 177 | 178 | i=1 179 | echo "Attempting to mount $device" 180 | while true ; do 181 | sleep 10 182 | echo -n "$i " 183 | i=$[$i+1] 184 | mount -o defaults,noatime $device $mount || continue 185 | echo " Mounted." 186 | break; 187 | done 188 | } 189 | 190 | function make_hadoop_dirs { 191 | for mount in "$@"; do 192 | if [ ! 
-e $mount/hadoop ]; then 193 | mkdir -p $mount/hadoop 194 | chown hadoop:hadoop $mount/hadoop 195 | fi 196 | done 197 | } 198 | 199 | # Configure Hadoop by setting up disks and site file 200 | function configure_hadoop() { 201 | 202 | install_packages xfsprogs # needed for XFS 203 | 204 | INSTANCE_TYPE=`wget -q -O - http://169.254.169.254/latest/meta-data/instance-type` 205 | 206 | if [ -n "$EBS_MAPPINGS" ]; then 207 | # EBS_MAPPINGS is like "hybrid_nn,/ebs1,/dev/sdj;hybrid_dn,/ebs2,/dev/sdk" 208 | # EBS_MAPPINGS is like "ROLE,MOUNT_POINT,DEVICE;ROLE,MOUNT_POINT,DEVICE" 209 | DFS_NAME_DIR='' 210 | FS_CHECKPOINT_DIR='' 211 | DFS_DATA_DIR='' 212 | for mapping in $(echo "$EBS_MAPPINGS" | tr ";" "\n"); do 213 | role=`echo $mapping | cut -d, -f1` 214 | mount=`echo $mapping | cut -d, -f2` 215 | device=`echo $mapping | cut -d, -f3` 216 | wait_for_mount $mount $device 217 | DFS_NAME_DIR=${DFS_NAME_DIR},"$mount/hadoop/hdfs/name" 218 | FS_CHECKPOINT_DIR=${FS_CHECKPOINT_DIR},"$mount/hadoop/hdfs/secondary" 219 | DFS_DATA_DIR=${DFS_DATA_DIR},"$mount/hadoop/hdfs/data" 220 | FIRST_MOUNT=${FIRST_MOUNT-$mount} 221 | make_hadoop_dirs $mount 222 | done 223 | # Remove leading commas 224 | DFS_NAME_DIR=${DFS_NAME_DIR#?} 225 | FS_CHECKPOINT_DIR=${FS_CHECKPOINT_DIR#?} 226 | DFS_DATA_DIR=${DFS_DATA_DIR#?} 227 | 228 | DFS_REPLICATION=3 # EBS is internally replicated, but we also use HDFS replication for safety 229 | else 230 | case $INSTANCE_TYPE in 231 | m1.xlarge|c1.xlarge) 232 | DFS_NAME_DIR=/mnt/hadoop/hdfs/name,/mnt2/hadoop/hdfs/name 233 | FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary,/mnt2/hadoop/hdfs/secondary 234 | DFS_DATA_DIR=/mnt/hadoop/hdfs/data,/mnt2/hadoop/hdfs/data,/mnt3/hadoop/hdfs/data,/mnt4/hadoop/hdfs/data 235 | ;; 236 | m1.large) 237 | DFS_NAME_DIR=/mnt/hadoop/hdfs/name,/mnt2/hadoop/hdfs/name 238 | FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary,/mnt2/hadoop/hdfs/secondary 239 | DFS_DATA_DIR=/mnt/hadoop/hdfs/data,/mnt2/hadoop/hdfs/data 240 | ;; 241 | *) 242 | # "m1.small" or "c1.medium" 243 | DFS_NAME_DIR=/mnt/hadoop/hdfs/name 244 | FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary 245 | DFS_DATA_DIR=/mnt/hadoop/hdfs/data 246 | ;; 247 | esac 248 | FIRST_MOUNT=/mnt 249 | DFS_REPLICATION=3 250 | fi 251 | 252 | case $INSTANCE_TYPE in 253 | m1.xlarge|c1.xlarge) 254 | prep_disk /mnt2 /dev/sdc true & 255 | disk2_pid=$! 256 | prep_disk /mnt3 /dev/sdd true & 257 | disk3_pid=$! 258 | prep_disk /mnt4 /dev/sde true & 259 | disk4_pid=$! 
260 | wait $disk2_pid $disk3_pid $disk4_pid 261 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local,/mnt2/hadoop/mapred/local,/mnt3/hadoop/mapred/local,/mnt4/hadoop/mapred/local 262 | MAX_MAP_TASKS=8 263 | MAX_REDUCE_TASKS=4 264 | CHILD_OPTS=-Xmx680m 265 | CHILD_ULIMIT=1392640 266 | ;; 267 | m1.large) 268 | prep_disk /mnt2 /dev/sdc true 269 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local,/mnt2/hadoop/mapred/local 270 | MAX_MAP_TASKS=4 271 | MAX_REDUCE_TASKS=2 272 | CHILD_OPTS=-Xmx1024m 273 | CHILD_ULIMIT=2097152 274 | ;; 275 | c1.medium) 276 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local 277 | MAX_MAP_TASKS=4 278 | MAX_REDUCE_TASKS=2 279 | CHILD_OPTS=-Xmx550m 280 | CHILD_ULIMIT=1126400 281 | ;; 282 | *) 283 | # "m1.small" 284 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local 285 | MAX_MAP_TASKS=2 286 | MAX_REDUCE_TASKS=1 287 | CHILD_OPTS=-Xmx550m 288 | CHILD_ULIMIT=1126400 289 | ;; 290 | esac 291 | 292 | make_hadoop_dirs `ls -d /mnt*` 293 | 294 | # Create tmp directory 295 | mkdir /mnt/tmp 296 | chmod a+rwxt /mnt/tmp 297 | 298 | mkdir /etc/hadoop 299 | ln -s $HADOOP_CONF_DIR /etc/hadoop/conf 300 | 301 | ############################################################################## 302 | # Modify this section to customize your Hadoop cluster. 303 | ############################################################################## 304 | cat > $HADOOP_CONF_DIR/hadoop-site.xml < 306 | 307 | 308 | 309 | dfs.block.size 310 | 134217728 311 | true 312 | 313 | 314 | dfs.data.dir 315 | $DFS_DATA_DIR 316 | true 317 | 318 | 319 | dfs.datanode.du.reserved 320 | 1073741824 321 | true 322 | 323 | 324 | dfs.datanode.handler.count 325 | 3 326 | true 327 | 328 | 333 | 338 | 339 | dfs.name.dir 340 | $DFS_NAME_DIR 341 | true 342 | 343 | 344 | dfs.namenode.handler.count 345 | 5 346 | true 347 | 348 | 349 | dfs.permissions 350 | true 351 | true 352 | 353 | 354 | dfs.replication 355 | $DFS_REPLICATION 356 | 357 | 358 | fs.checkpoint.dir 359 | $FS_CHECKPOINT_DIR 360 | true 361 | 362 | 363 | fs.default.name 364 | hdfs://$HYBRID_NN_HOST:8020/ 365 | 366 | 367 | fs.trash.interval 368 | 1440 369 | true 370 | 371 | 372 | hadoop.tmp.dir 373 | /mnt/tmp/hadoop-\${user.name} 374 | true 375 | 376 | 377 | io.file.buffer.size 378 | 65536 379 | 380 | 381 | mapred.child.java.opts 382 | $CHILD_OPTS 383 | 384 | 385 | mapred.child.ulimit 386 | $CHILD_ULIMIT 387 | true 388 | 389 | 390 | mapred.job.tracker 391 | $HYBRID_JT_HOST:8021 392 | 393 | 394 | mapred.job.tracker.handler.count 395 | 5 396 | true 397 | 398 | 399 | mapred.local.dir 400 | $MAPRED_LOCAL_DIR 401 | true 402 | 403 | 404 | mapred.map.tasks.speculative.execution 405 | true 406 | 407 | 408 | mapred.reduce.parallel.copies 409 | 10 410 | 411 | 412 | mapred.reduce.tasks 413 | 10 414 | 415 | 416 | mapred.reduce.tasks.speculative.execution 417 | false 418 | 419 | 420 | mapred.submit.replication 421 | 10 422 | 423 | 424 | mapred.system.dir 425 | /hadoop/system/mapred 426 | 427 | 428 | mapred.tasktracker.map.tasks.maximum 429 | $MAX_MAP_TASKS 430 | true 431 | 432 | 433 | mapred.tasktracker.reduce.tasks.maximum 434 | $MAX_REDUCE_TASKS 435 | true 436 | 437 | 438 | tasktracker.http.threads 439 | 46 440 | true 441 | 442 | 443 | mapred.compress.map.output 444 | true 445 | 446 | 447 | mapred.output.compression.type 448 | BLOCK 449 | 450 | 451 | hadoop.rpc.socket.factory.class.default 452 | org.apache.hadoop.net.StandardSocketFactory 453 | true 454 | 455 | 456 | hadoop.rpc.socket.factory.class.ClientProtocol 457 | 458 | true 459 | 460 | 461 | hadoop.rpc.socket.factory.class.JobSubmissionProtocol 462 | 463 | 
true 464 | 465 | 466 | io.compression.codecs 467 | org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec 468 | 469 | 470 | fs.s3.awsAccessKeyId 471 | $AWS_ACCESS_KEY_ID 472 | 473 | 474 | fs.s3.awsSecretAccessKey 475 | $AWS_SECRET_ACCESS_KEY 476 | 477 | 478 | fs.s3n.awsAccessKeyId 479 | $AWS_ACCESS_KEY_ID 480 | 481 | 482 | fs.s3n.awsSecretAccessKey 483 | $AWS_SECRET_ACCESS_KEY 484 | 485 | 486 | EOF 487 | 488 | # Keep PID files in a non-temporary directory 489 | sed -i -e "s|# export HADOOP_PID_DIR=.*|export HADOOP_PID_DIR=/var/run/hadoop|" \ 490 | $HADOOP_CONF_DIR/hadoop-env.sh 491 | mkdir -p /var/run/hadoop 492 | chown -R hadoop:hadoop /var/run/hadoop 493 | 494 | # Set SSH options within the cluster 495 | sed -i -e 's|# export HADOOP_SSH_OPTS=.*|export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no"|' \ 496 | $HADOOP_CONF_DIR/hadoop-env.sh 497 | 498 | # Hadoop logs should be on the /mnt partition 499 | sed -i -e 's|# export HADOOP_LOG_DIR=.*|export HADOOP_LOG_DIR=/var/log/hadoop/logs|' \ 500 | $HADOOP_CONF_DIR/hadoop-env.sh 501 | rm -rf /var/log/hadoop 502 | mkdir /mnt/hadoop/logs 503 | chown hadoop:hadoop /mnt/hadoop/logs 504 | ln -s /mnt/hadoop/logs /var/log/hadoop 505 | chown -R hadoop:hadoop /var/log/hadoop 506 | 507 | } 508 | 509 | # Sets up small website on cluster. 510 | function setup_web() { 511 | 512 | if which dpkg &> /dev/null; then 513 | apt-get -y install thttpd 514 | WWW_BASE=/var/www 515 | elif which rpm &> /dev/null; then 516 | yum install -y thttpd 517 | chkconfig --add thttpd 518 | WWW_BASE=/var/www/thttpd/html 519 | fi 520 | 521 | cat > $WWW_BASE/index.html << END 522 | 523 | 524 | Hadoop EC2 Cluster 525 | 526 | 527 |

<h1>Hadoop EC2 Cluster</h1>

528 | To browse the cluster you need to have a proxy configured. 529 | Start the proxy with hadoop-ec2 proxy <cluster_name>, 530 | and point your browser to 531 | this Proxy 532 | Auto-Configuration (PAC) file. To manage multiple proxy configurations, 533 | you may wish to use 534 | FoxyProxy. 535 | 539 | 540 | 541 | END 542 | 543 | service thttpd start 544 | 545 | } 546 | 547 | function start_namenode() { 548 | if which dpkg &> /dev/null; then 549 | AS_HADOOP="su -s /bin/bash - hadoop -c" 550 | elif which rpm &> /dev/null; then 551 | AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c" 552 | fi 553 | 554 | # Format HDFS 555 | [ ! -e $FIRST_MOUNT/hadoop/hdfs ] && $AS_HADOOP "$HADOOP_HOME/bin/hadoop namenode -format" 556 | 557 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start namenode" 558 | 559 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop dfsadmin -safemode wait" 560 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -mkdir /user" 561 | # The following is questionable, as it allows a user to delete another user 562 | # It's needed to allow users to create their own user directories 563 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -chmod +w /user" 564 | 565 | } 566 | 567 | function start_daemon() { 568 | if which dpkg &> /dev/null; then 569 | AS_HADOOP="su -s /bin/bash - hadoop -c" 570 | elif which rpm &> /dev/null; then 571 | AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c" 572 | fi 573 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start $1" 574 | } 575 | 576 | function install_cassandra() { 577 | 578 | curl="curl --retry 3 --silent --show-error --fail" 579 | if [ ! -z "$CASSANDRA_URL" ]; then 580 | DEFAULT_CASSANDRA_URL=$CASSANDRA_URL 581 | fi 582 | 583 | cassandra_tar_file=`basename $DEFAULT_CASSANDRA_URL` 584 | $curl -O $DEFAULT_CASSANDRA_URL 585 | 586 | tar zxf $cassandra_tar_file -C /usr/local 587 | rm -f $cassandra_tar_file 588 | 589 | CASSANDRA_HOME_WITH_VERSION=/usr/local/`ls -1 /usr/local | grep cassandra` 590 | 591 | echo "export CASSANDRA_HOME=$CASSANDRA_HOME_ALIAS" >> ~root/.bash_profile 592 | echo 'export PATH=$CASSANDRA_HOME/bin:$PATH' >> ~root/.bash_profile 593 | } 594 | 595 | function configure_cassandra() { 596 | if [ -n "$EBS_MAPPINGS" ]; then 597 | # EBS_MAPPINGS is like "cn,/ebs1,/dev/sdj;cn,/ebs2,/dev/sdk" 598 | # EBS_MAPPINGS is like "ROLE,MOUNT_POINT,DEVICE;ROLE,MOUNT_POINT,DEVICE" 599 | for mapping in $(echo "$EBS_MAPPINGS" | tr ";" "\n"); do 600 | role=`echo $mapping | cut -d, -f1` 601 | mount=`echo $mapping | cut -d, -f2` 602 | device=`echo $mapping | cut -d, -f3` 603 | wait_for_mount $mount $device 604 | done 605 | fi 606 | 607 | if [ -f "$CASSANDRA_HOME_WITH_VERSION/conf/cassandra-env.sh" ] 608 | then 609 | # for cassandra 0.7.x we need to set the MAX_HEAP_SIZE env 610 | # variable so that it can be used in cassandra-env.sh on 611 | # startup 612 | if [ -z "$MAX_HEAP_SIZE" ] 613 | then 614 | INSTANCE_TYPE=`wget -q -O - http://169.254.169.254/latest/meta-data/instance-type` 615 | case $INSTANCE_TYPE in 616 | m1.xlarge|m2.xlarge) 617 | MAX_HEAP_SIZE="10G" 618 | ;; 619 | m1.large|c1.xlarge) 620 | MAX_HEAP_SIZE="5G" 621 | ;; 622 | *) 623 | # Don't set it and let cassandra-env figure it out 624 | ;; 625 | esac 626 | 627 | # write it to the profile 628 | echo "export MAX_HEAP_SIZE=$MAX_HEAP_SIZE" >> ~root/.bash_profile 629 | echo "export MAX_HEAP_SIZE=$MAX_HEAP_SIZE" >> ~root/.bashrc 630 | fi 631 | else 632 | write_cassandra_in_sh_file 633 | fi 634 | } 635 | 636 | function write_cassandra_in_sh_file { 637 | # for cassandra 0.6.x memory settings 638 | 639 | # configure the 
cassandra.in.sh script based on instance type 640 | INSTANCE_TYPE=`wget -q -O - http://169.254.169.254/latest/meta-data/instance-type` 641 | SETTINGS_FILE=$CASSANDRA_HOME_WITH_VERSION/bin/cassandra.in.sh 642 | 643 | cat > $SETTINGS_FILE <> $SETTINGS_FILE <> $SETTINGS_FILE <> $SETTINGS_FILE <> ~root/.bash_profile 34 | echo "export %ENV%" >> ~root/.bashrc 35 | 36 | HADOOP_VERSION=${HADOOP_VERSION:-0.20.2+737} 37 | HADOOP_HOME=/usr/local/hadoop-$HADOOP_VERSION 38 | HADOOP_CONF_DIR=$HADOOP_HOME/conf 39 | 40 | PIG_VERSION=${PIG_VERSION:-0.7.0} 41 | PIG_HOME=/usr/local/pig-$PIG_VERSION 42 | PIG_CONF_DIR=$PIG_HOME/conf 43 | 44 | SELF_HOST=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname` 45 | for role in $(echo "$ROLES" | tr "," "\n"); do 46 | case $role in 47 | nn) 48 | NN_HOST=$SELF_HOST 49 | ;; 50 | jt) 51 | JT_HOST=$SELF_HOST 52 | ;; 53 | esac 54 | done 55 | 56 | function register_auto_shutdown() { 57 | if [ ! -z "$AUTO_SHUTDOWN" ]; then 58 | shutdown -h +$AUTO_SHUTDOWN >/dev/null & 59 | fi 60 | } 61 | 62 | # Install a list of packages on debian or redhat as appropriate 63 | function install_packages() { 64 | if which dpkg &> /dev/null; then 65 | apt-get update 66 | apt-get -y install $@ 67 | elif which rpm &> /dev/null; then 68 | yum install -y $@ 69 | else 70 | echo "No package manager found." 71 | fi 72 | } 73 | 74 | # Install any user packages specified in the USER_PACKAGES environment variable 75 | function install_user_packages() { 76 | if [ ! -z "$USER_PACKAGES" ]; then 77 | install_packages $USER_PACKAGES 78 | fi 79 | } 80 | 81 | function install_yourkit() { 82 | mkdir /mnt/yjp 83 | YOURKIT_URL="http://www.yourkit.com/download/yjp-9.0.7-linux.tar.bz2" 84 | curl="curl --retry 3 --silent --show-error --fail" 85 | $curl -O $YOURKIT_URL 86 | yourkit_tar_file=`basename $YOURKIT_URL` 87 | tar xjf $yourkit_tar_file -C /mnt/yjp 88 | rm -f $yourkit_tar_file 89 | chown -R hadoop /mnt/yjp 90 | chgrp -R hadoop /mnt/yjp 91 | } 92 | 93 | function install_hadoop() { 94 | useradd hadoop 95 | 96 | hadoop_tar_url=http://archive.cloudera.com/cdh/3/hadoop-$HADOOP_VERSION.tar.gz 97 | hadoop_tar_file=`basename $hadoop_tar_url` 98 | 99 | curl="curl --retry 3 --silent --show-error --fail" 100 | $curl -O $hadoop_tar_url 101 | 102 | if [ ! -e $hadoop_tar_file ]; then 103 | echo "Failed to download $hadoop_tar_url. Aborting." 104 | exit 1 105 | fi 106 | 107 | tar zxf $hadoop_tar_file -C /usr/local 108 | rm -f $hadoop_tar_file $hadoop_tar_md5_file 109 | 110 | echo "export HADOOP_HOME=$HADOOP_HOME" >> ~root/.bashrc 111 | echo 'export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$PATH' >> ~root/.bashrc 112 | } 113 | 114 | function install_pig() 115 | { 116 | pig_tar_url=http://mirror.cloudera.com/apache/hadoop/pig/pig-$PIG_VERSION/pig-$PIG_VERSION.tar.gz 117 | pig_tar_file=`basename $pig_tar_url` 118 | 119 | curl="curl --retry 3 --silent --show-error --fail" 120 | for i in `seq 1 3`; 121 | do 122 | $curl -O $pig_tar_url 123 | done 124 | 125 | if [ ! -e $pig_tar_file ]; then 126 | echo "Failed to download $pig_tar_url. Pig will not be installed." 127 | else 128 | tar zxf $pig_tar_file -C /usr/local 129 | rm -f $pig_tar_file 130 | 131 | if [ ! -e $HADOOP_CONF_DIR ]; then 132 | echo "Hadoop must be installed. Aborting." 
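# Pig's configuration is seeded from Hadoop's *.xml site files just below, so
# a missing $HADOOP_CONF_DIR means Pig cannot be configured and we abort.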
133 | exit 1 134 | fi 135 | 136 | cp $HADOOP_CONF_DIR/*.xml $PIG_CONF_DIR/ 137 | 138 | echo "export PIG_HOME=$PIG_HOME" >> ~root/.bashrc 139 | echo 'export PATH=$JAVA_HOME/bin:$PIG_HOME/bin:$PATH' >> ~root/.bashrc 140 | fi 141 | } 142 | 143 | function prep_disk() { 144 | mount=$1 145 | device=$2 146 | automount=${3:-false} 147 | 148 | echo "warning: ERASING CONTENTS OF $device" 149 | mkfs.xfs -f $device 150 | if [ ! -e $mount ]; then 151 | mkdir $mount 152 | fi 153 | mount -o defaults,noatime $device $mount 154 | if $automount ; then 155 | echo "$device $mount xfs defaults,noatime 0 0" >> /etc/fstab 156 | fi 157 | } 158 | 159 | function wait_for_mount { 160 | mount=$1 161 | device=$2 162 | 163 | mkdir $mount 164 | 165 | i=1 166 | echo "Attempting to mount $device" 167 | while true ; do 168 | sleep 10 169 | echo -n "$i " 170 | i=$[$i+1] 171 | mount -o defaults,noatime $device $mount || continue 172 | echo " Mounted." 173 | break; 174 | done 175 | } 176 | 177 | function make_hadoop_dirs { 178 | for mount in "$@"; do 179 | if [ ! -e $mount/hadoop ]; then 180 | mkdir -p $mount/hadoop 181 | chown hadoop:hadoop $mount/hadoop 182 | fi 183 | done 184 | } 185 | 186 | # Configure Hadoop by setting up disks and site file 187 | function configure_hadoop() { 188 | 189 | install_packages xfsprogs # needed for XFS 190 | 191 | INSTANCE_TYPE=`wget -q -O - http://169.254.169.254/latest/meta-data/instance-type` 192 | 193 | if [ -n "$EBS_MAPPINGS" ]; then 194 | # EBS_MAPPINGS is like "nn,/ebs1,/dev/sdj;dn,/ebs2,/dev/sdk" 195 | # EBS_MAPPINGS is like "ROLE,MOUNT_POINT,DEVICE;ROLE,MOUNT_POINT,DEVICE" 196 | DFS_NAME_DIR='' 197 | FS_CHECKPOINT_DIR='' 198 | DFS_DATA_DIR='' 199 | for mapping in $(echo "$EBS_MAPPINGS" | tr ";" "\n"); do 200 | role=`echo $mapping | cut -d, -f1` 201 | mount=`echo $mapping | cut -d, -f2` 202 | device=`echo $mapping | cut -d, -f3` 203 | wait_for_mount $mount $device 204 | DFS_NAME_DIR=${DFS_NAME_DIR},"$mount/hadoop/hdfs/name" 205 | FS_CHECKPOINT_DIR=${FS_CHECKPOINT_DIR},"$mount/hadoop/hdfs/secondary" 206 | DFS_DATA_DIR=${DFS_DATA_DIR},"$mount/hadoop/hdfs/data" 207 | FIRST_MOUNT=${FIRST_MOUNT-$mount} 208 | make_hadoop_dirs $mount 209 | done 210 | # Remove leading commas 211 | DFS_NAME_DIR=${DFS_NAME_DIR#?} 212 | FS_CHECKPOINT_DIR=${FS_CHECKPOINT_DIR#?} 213 | DFS_DATA_DIR=${DFS_DATA_DIR#?} 214 | 215 | DFS_REPLICATION=3 # EBS is internally replicated, but we also use HDFS replication for safety 216 | else 217 | case $INSTANCE_TYPE in 218 | m1.xlarge|c1.xlarge) 219 | DFS_NAME_DIR=/mnt/hadoop/hdfs/name,/mnt2/hadoop/hdfs/name 220 | FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary,/mnt2/hadoop/hdfs/secondary 221 | DFS_DATA_DIR=/mnt/hadoop/hdfs/data,/mnt2/hadoop/hdfs/data,/mnt3/hadoop/hdfs/data,/mnt4/hadoop/hdfs/data 222 | ;; 223 | m1.large) 224 | DFS_NAME_DIR=/mnt/hadoop/hdfs/name,/mnt2/hadoop/hdfs/name 225 | FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary,/mnt2/hadoop/hdfs/secondary 226 | DFS_DATA_DIR=/mnt/hadoop/hdfs/data,/mnt2/hadoop/hdfs/data 227 | ;; 228 | *) 229 | # "m1.small" or "c1.medium" 230 | DFS_NAME_DIR=/mnt/hadoop/hdfs/name 231 | FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary 232 | DFS_DATA_DIR=/mnt/hadoop/hdfs/data 233 | ;; 234 | esac 235 | FIRST_MOUNT=/mnt 236 | DFS_REPLICATION=3 237 | fi 238 | 239 | case $INSTANCE_TYPE in 240 | m1.xlarge|c1.xlarge) 241 | prep_disk /mnt2 /dev/sdc true & 242 | disk2_pid=$! 243 | prep_disk /mnt3 /dev/sdd true & 244 | disk3_pid=$! 245 | prep_disk /mnt4 /dev/sde true & 246 | disk4_pid=$! 
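# Block until the background prep_disk jobs for /mnt2-/mnt4 complete; the
# mapred.local.dir list set immediately below spans those mount points.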
247 | wait $disk2_pid $disk3_pid $disk4_pid 248 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local,/mnt2/hadoop/mapred/local,/mnt3/hadoop/mapred/local,/mnt4/hadoop/mapred/local 249 | MAX_MAP_TASKS=4 250 | MAX_REDUCE_TASKS=2 251 | CHILD_OPTS=-Xmx2000m 252 | CHILD_ULIMIT=4000000 253 | ;; 254 | m1.large) 255 | prep_disk /mnt2 /dev/sdc true 256 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local,/mnt2/hadoop/mapred/local 257 | MAX_MAP_TASKS=2 258 | MAX_REDUCE_TASKS=1 259 | CHILD_OPTS=-Xmx2000m 260 | CHILD_ULIMIT=4000000 261 | ;; 262 | c1.medium) 263 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local 264 | MAX_MAP_TASKS=4 265 | MAX_REDUCE_TASKS=2 266 | CHILD_OPTS=-Xmx550m 267 | CHILD_ULIMIT=1126400 268 | ;; 269 | *) 270 | # "m1.small" 271 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local 272 | MAX_MAP_TASKS=2 273 | MAX_REDUCE_TASKS=1 274 | CHILD_OPTS=-Xmx550m 275 | CHILD_ULIMIT=1126400 276 | ;; 277 | esac 278 | 279 | make_hadoop_dirs `ls -d /mnt*` 280 | 281 | # Create tmp directory 282 | mkdir /mnt/tmp 283 | chmod a+rwxt /mnt/tmp 284 | 285 | mkdir /etc/hadoop 286 | ln -s $HADOOP_CONF_DIR /etc/hadoop/conf 287 | 288 | ############################################################################## 289 | # Modify this section to customize your Hadoop cluster. 290 | ############################################################################## 291 | cat > $HADOOP_CONF_DIR/hadoop-site.xml < 293 | 294 | 295 | 296 | dfs.block.size 297 | 134217728 298 | true 299 | 300 | 301 | dfs.data.dir 302 | $DFS_DATA_DIR 303 | true 304 | 305 | 306 | dfs.datanode.du.reserved 307 | 1073741824 308 | true 309 | 310 | 311 | dfs.datanode.handler.count 312 | 3 313 | true 314 | 315 | 320 | 321 | dfs.hosts.exclude 322 | $HADOOP_CONF_DIR/exclude 323 | true 324 | 325 | 326 | mapred.hosts.exclude 327 | $HADOOP_CONF_DIR/exclude 328 | true 329 | 330 | 331 | dfs.name.dir 332 | $DFS_NAME_DIR 333 | true 334 | 335 | 336 | dfs.namenode.handler.count 337 | 5 338 | true 339 | 340 | 341 | dfs.permissions 342 | true 343 | true 344 | 345 | 346 | dfs.replication 347 | $DFS_REPLICATION 348 | 349 | 350 | fs.checkpoint.dir 351 | $FS_CHECKPOINT_DIR 352 | true 353 | 354 | 355 | fs.default.name 356 | hdfs://$NN_HOST:8020/ 357 | 358 | 359 | fs.trash.interval 360 | 1440 361 | true 362 | 363 | 364 | hadoop.tmp.dir 365 | /mnt/tmp/hadoop-\${user.name} 366 | true 367 | 368 | 369 | io.file.buffer.size 370 | 65536 371 | 372 | 373 | mapred.child.java.opts 374 | $CHILD_OPTS 375 | 376 | 377 | mapred.child.ulimit 378 | $CHILD_ULIMIT 379 | true 380 | 381 | 382 | mapred.job.tracker 383 | $JT_HOST:8021 384 | 385 | 386 | mapred.job.tracker.handler.count 387 | 5 388 | true 389 | 390 | 391 | mapred.local.dir 392 | $MAPRED_LOCAL_DIR 393 | true 394 | 395 | 396 | mapred.map.tasks.speculative.execution 397 | true 398 | 399 | 400 | mapred.reduce.parallel.copies 401 | 10 402 | 403 | 404 | mapred.reduce.tasks 405 | $CLUSTER_SIZE 406 | 407 | 408 | mapred.reduce.tasks.speculative.execution 409 | false 410 | 411 | 412 | mapred.submit.replication 413 | 10 414 | 415 | 416 | mapred.system.dir 417 | /hadoop/system/mapred 418 | 419 | 420 | mapred.tasktracker.map.tasks.maximum 421 | $MAX_MAP_TASKS 422 | true 423 | 424 | 425 | mapred.tasktracker.reduce.tasks.maximum 426 | $MAX_REDUCE_TASKS 427 | true 428 | 429 | 430 | tasktracker.http.threads 431 | 46 432 | true 433 | 434 | 435 | mapred.compress.map.output 436 | true 437 | 438 | 439 | mapred.output.compression.type 440 | BLOCK 441 | 442 | 443 | hadoop.rpc.socket.factory.class.default 444 | org.apache.hadoop.net.StandardSocketFactory 445 | true 446 | 447 | 448 
| hadoop.rpc.socket.factory.class.ClientProtocol 449 | 450 | true 451 | 452 | 453 | hadoop.rpc.socket.factory.class.JobSubmissionProtocol 454 | 455 | true 456 | 457 | 458 | io.compression.codecs 459 | org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec 460 | 461 | 462 | fs.s3.awsAccessKeyId 463 | $AWS_ACCESS_KEY_ID 464 | 465 | 466 | fs.s3.awsSecretAccessKey 467 | $AWS_SECRET_ACCESS_KEY 468 | 469 | 470 | fs.s3n.awsAccessKeyId 471 | $AWS_ACCESS_KEY_ID 472 | 473 | 474 | fs.s3n.awsSecretAccessKey 475 | $AWS_SECRET_ACCESS_KEY 476 | 477 | 478 | EOF 479 | 480 | # Keep PID files in a non-temporary directory 481 | sed -i -e "s|# export HADOOP_PID_DIR=.*|export HADOOP_PID_DIR=/var/run/hadoop|" \ 482 | $HADOOP_CONF_DIR/hadoop-env.sh 483 | mkdir -p /var/run/hadoop 484 | chown -R hadoop:hadoop /var/run/hadoop 485 | 486 | # Set SSH options within the cluster 487 | sed -i -e 's|# export HADOOP_SSH_OPTS=.*|export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no"|' \ 488 | $HADOOP_CONF_DIR/hadoop-env.sh 489 | 490 | # Hadoop logs should be on the /mnt partition 491 | sed -i -e 's|# export HADOOP_LOG_DIR=.*|export HADOOP_LOG_DIR=/var/log/hadoop/logs|' \ 492 | $HADOOP_CONF_DIR/hadoop-env.sh 493 | rm -rf /var/log/hadoop 494 | mkdir /mnt/hadoop/logs 495 | chown hadoop:hadoop /mnt/hadoop/logs 496 | ln -s /mnt/hadoop/logs /var/log/hadoop 497 | chown -R hadoop:hadoop /var/log/hadoop 498 | 499 | } 500 | 501 | # Sets up small website on cluster. 502 | function setup_web() { 503 | 504 | if which dpkg &> /dev/null; then 505 | apt-get -y install thttpd 506 | WWW_BASE=/var/www 507 | elif which rpm &> /dev/null; then 508 | yum install -y thttpd 509 | chkconfig --add thttpd 510 | WWW_BASE=/var/www/thttpd/html 511 | fi 512 | 513 | cat > $WWW_BASE/index.html << END 514 | 515 | 516 | Hadoop EC2 Cluster 517 | 518 | 519 |

<h1>Hadoop EC2 Cluster</h1>

520 | To browse the cluster you need to have a proxy configured. 521 | Start the proxy with hadoop-ec2 proxy <cluster_name>, 522 | and point your browser to 523 | this Proxy 524 | Auto-Configuration (PAC) file. To manage multiple proxy configurations, 525 | you may wish to use 526 | FoxyProxy. 527 | 531 | 532 | 533 | END 534 | 535 | service thttpd start 536 | 537 | } 538 | 539 | function start_namenode() { 540 | if which dpkg &> /dev/null; then 541 | AS_HADOOP="su -s /bin/bash - hadoop -c" 542 | elif which rpm &> /dev/null; then 543 | AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c" 544 | fi 545 | 546 | # Format HDFS 547 | [ ! -e $FIRST_MOUNT/hadoop/hdfs ] && $AS_HADOOP "$HADOOP_HOME/bin/hadoop namenode -format" 548 | 549 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start namenode" 550 | 551 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop dfsadmin -safemode wait" 552 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -mkdir /user" 553 | # The following is questionable, as it allows a user to delete another user 554 | # It's needed to allow users to create their own user directories 555 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -chmod +w /user" 556 | 557 | } 558 | 559 | function start_daemon() { 560 | if which dpkg &> /dev/null; then 561 | AS_HADOOP="su -s /bin/bash - hadoop -c" 562 | elif which rpm &> /dev/null; then 563 | AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c" 564 | fi 565 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start $1" 566 | } 567 | 568 | register_auto_shutdown 569 | install_user_packages 570 | install_hadoop 571 | configure_hadoop 572 | install_pig 573 | 574 | for role in $(echo "$ROLES" | tr "," "\n"); do 575 | case $role in 576 | nn) 577 | setup_web 578 | start_namenode 579 | ;; 580 | snn) 581 | start_daemon secondarynamenode 582 | ;; 583 | jt) 584 | start_daemon jobtracker 585 | ;; 586 | dn) 587 | start_daemon datanode 588 | ;; 589 | tt) 590 | start_daemon tasktracker 591 | if [ ! -z "$INSTALL_PROFILER" ]; then 592 | install_yourkit 593 | fi 594 | ;; 595 | esac 596 | done 597 | 598 | -------------------------------------------------------------------------------- /example_scripts/hbase-ec2-init-remote.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | ################################################################################ 19 | # Script that is run on each EC2 instance on boot. It is passed in the EC2 user 20 | # data, so should not exceed 16K in size after gzip compression. 21 | # 22 | # This script is executed by /etc/init.d/ec2-run-user-data, and output is 23 | # logged to /var/log/messages. 24 | # 25 | # This script will set up a Hadoop/HBase cluster. 
Zookeeper is 26 | # installed and launched on the namenode/jobtracker/master node. This is a 27 | # configuration that is suitable for small clusters or for testing, but not for 28 | # most production environments. 29 | # 30 | # Since the regionservers all need the private dns name of the zookeeper 31 | # machine, that machine must be started before the regionservers. By default, 32 | # a zookeeper is running on the same machine as the master, so you will need to 33 | # first launch the master node, then the slaves: 34 | # 35 | # stratus exec my-cluster launch-master 36 | # stratus exec my-cluster launch-slaves 10 37 | # 38 | # 39 | ################################################################################ 40 | 41 | ################################################################################ 42 | # Initialize variables 43 | ################################################################################ 44 | 45 | # Substitute environment variables passed by the client 46 | export %ENV% 47 | 48 | echo "export %ENV%" >> ~root/.bash_profile 49 | #for some reason, the .bash_profile in some distros does not source .bashrc 50 | cat >> ~root/.bash_profile <> ~root/.bashrc 56 | 57 | # up ulimits if necessary 58 | if [ `ulimit -n` -lt 128000 ]; then 59 | ulimit -n 128000 60 | fi 61 | 62 | HADOOP_VERSION=${HADOOP_VERSION:-0.20.2-cdh3u0} 63 | HADOOP_HOME=/usr/local/hadoop-$HADOOP_VERSION 64 | HADOOP_CONF_DIR=$HADOOP_HOME/conf 65 | 66 | HBASE_VERSION=${HBASE_VERSION:-0.90.1-cdh3u0} 67 | HBASE_HOME=/usr/local/hbase-$HBASE_VERSION 68 | HBASE_CONF_DIR=$HBASE_HOME/conf 69 | 70 | ZK_VERSION=${ZK_VERSION:-3.3.3-cdh3u0} 71 | ZK_HOME=/usr/local/zookeeper-$ZK_VERSION 72 | ZK_CONF_DIR=$ZK_HOME/conf 73 | 74 | PIG_VERSION=${PIG_VERSION:-pig-0.8.0-cdh3u0} 75 | PIG_HOME=/usr/local/pig-$PIG_VERSION 76 | PIG_CONF_DIR=$PIG_HOME/conf 77 | 78 | #HDFS settings to support HBase 79 | DFS_DATANODE_HANDLER_COUNT=10 80 | DFS_DATANODE_MAX_XCIEVERS=10000 81 | #end of HDFS settings 82 | 83 | SELF_HOST=`wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname` 84 | for role in $(echo "$ROLES" | tr "," "\n"); do 85 | case $role in 86 | nn) 87 | NN_HOST=$SELF_HOST 88 | # By default the HBase master and Zookeeper run on the Namenode host 89 | # Zookeeper uses the private IP address of the namenode 90 | ZOOKEEPER_QUORUM=`echo $HOSTNAME` 91 | ;; 92 | jt) 93 | JT_HOST=$SELF_HOST 94 | ;; 95 | esac 96 | done 97 | 98 | # Set up the macro that we will use to execute commands as "hadoop" 99 | if which dpkg &> /dev/null; then 100 | AS_HADOOP="su -s /bin/bash - hadoop -c" 101 | elif which rpm &> /dev/null; then 102 | AS_HADOOP="/sbin/runuser -s /bin/bash - hadoop -c" 103 | fi 104 | 105 | function register_auto_shutdown() { 106 | if [ ! -z "$AUTO_SHUTDOWN" ]; then 107 | shutdown -h +$AUTO_SHUTDOWN >/dev/null & 108 | fi 109 | } 110 | 111 | # Install a list of packages on debian or redhat as appropriate 112 | function install_packages() { 113 | if which dpkg &> /dev/null; then 114 | apt-get update 115 | apt-get -y install $@ 116 | elif which rpm &> /dev/null; then 117 | yum install -y $@ 118 | else 119 | echo "No package manager found." 120 | fi 121 | } 122 | 123 | # Install any user packages specified in the USER_PACKAGES environment variable 124 | function install_user_packages() { 125 | if [ ! 
-z "$USER_PACKAGES" ]; then 126 | install_packages $USER_PACKAGES 127 | fi 128 | } 129 | 130 | function install_yourkit() { 131 | mkdir /mnt/yjp 132 | YOURKIT_URL="http://www.yourkit.com/download/yjp-9.0.7-linux.tar.bz2" 133 | curl="curl --retry 3 --silent --show-error --fail" 134 | $curl -O $YOURKIT_URL 135 | yourkit_tar_file=`basename $YOURKIT_URL` 136 | tar xjf $yourkit_tar_file -C /mnt/yjp 137 | rm -f $yourkit_tar_file 138 | chown -R hadoop /mnt/yjp 139 | chgrp -R hadoop /mnt/yjp 140 | } 141 | 142 | function install_hadoop() { 143 | #The EBS volumes are already set up with hadoop:hadoop equal to 500:500 144 | if which dpkg &> /dev/null; then 145 | addgroup hadoop --gid 500 146 | adduser --disabled-login --ingroup hadoop --gecos GECOS --uid 500 hadoop 147 | else 148 | groupadd hadoop -g 500 149 | useradd hadoop -u 500 -g 500 150 | fi 151 | 152 | 153 | hadoop_tar_url=http://archive.cloudera.com/cdh/3/hadoop-$HADOOP_VERSION.tar.gz 154 | hadoop_tar_file=`basename $hadoop_tar_url` 155 | 156 | curl="curl --retry 3 --silent --show-error --fail" 157 | $curl -O $hadoop_tar_url 158 | 159 | if [ ! -e $hadoop_tar_file ]; then 160 | echo "Failed to download $hadoop_tar_url. Aborting." 161 | exit 1 162 | fi 163 | 164 | tar zxf $hadoop_tar_file -C /usr/local 165 | cp $HADOOP_HOME/contrib/fairscheduler/hadoop-*-fairscheduler.jar $HADOOP_HOME/lib 166 | rm -f $hadoop_tar_file $hadoop_tar_md5_file 167 | 168 | echo "export HADOOP_HOME=$HADOOP_HOME" >> ~root/.bashrc 169 | echo 'export PATH=$JAVA_HOME/bin:$HADOOP_HOME/bin:$PATH' >> ~root/.bashrc 170 | 171 | #set up the native compression libraries 172 | if [ `arch` == 'x86_64' ]; then 173 | cp $HADOOP_HOME/lib/native/Linux-amd64-64/libhadoop.* /usr/lib/ 174 | else 175 | cp $HADOOP_HOME/lib/native/ Linux-i386-32/libhadoop.* /usr/lib/ 176 | fi 177 | ldconfig -n /usr/lib/ 178 | } 179 | 180 | function install_hbase() { 181 | hbase_tar_url=http://archive.cloudera.com/cdh/3/hbase-$HBASE_VERSION.tar.gz 182 | hbase_tar_file=`basename $hbase_tar_url` 183 | 184 | curl="curl --retry 3 --silent --show-error --fail" 185 | $curl -O $hbase_tar_url 186 | 187 | if [ ! -e $hbase_tar_file ]; then 188 | echo "Failed to download $hbase_tar_url. Aborting." 189 | exit 1 190 | fi 191 | 192 | tar zxf $hbase_tar_file -C /usr/local 193 | rm -f $hbase_tar_file $hbase_tar_md5_file 194 | 195 | echo "export HBASE_HOME=$HBASE_HOME" >> ~root/.bashrc 196 | echo 'export PATH=$JAVA_HOME/bin:$HBASE_HOME/bin:$PATH' >> ~root/.bashrc 197 | } 198 | 199 | function install_zookeeper() { 200 | zk_tar_url=http://archive.cloudera.com/cdh/3/zookeeper-$ZK_VERSION.tar.gz 201 | zk_tar_file=`basename $zk_tar_url` 202 | 203 | curl="curl --retry 3 --silent --show-error --fail" 204 | $curl -O $zk_tar_url 205 | 206 | if [ ! -e $zk_tar_file ]; then 207 | echo "Failed to download $zk_tar_url. Aborting." 208 | exit 1 209 | fi 210 | 211 | tar zxf $zk_tar_file -C /usr/local 212 | rm -f $zk_tar_file $zk_tar_md5_file 213 | 214 | echo "export ZOOKEEPER_HOME=$ZK_HOME" >> ~root/.bashrc 215 | echo 'export PATH=$JAVA_HOME/bin:$ZK_HOME/bin:$PATH' >> ~root/.bashrc 216 | } 217 | 218 | function install_pig() 219 | { 220 | pig_tar_url=http://archive.cloudera.com/cdh/3/$PIG_VERSION.tar.gz 221 | pig_tar_file=`basename $pig_tar_url` 222 | 223 | curl="curl --retry 3 --silent --show-error --fail" 224 | for i in `seq 1 3`; 225 | do 226 | $curl -O $pig_tar_url 227 | done 228 | 229 | if [ ! -e $pig_tar_file ]; then 230 | echo "Failed to download $pig_tar_url. Pig will not be installed." 
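# Unlike Hadoop, HBase and ZooKeeper above, Pig is optional: a failed download
# is only logged and the install is skipped rather than aborting the bootstrap.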
231 | else 232 | tar zxf $pig_tar_file -C /usr/local 233 | rm -f $pig_tar_file 234 | 235 | if [ ! -e $HADOOP_CONF_DIR ]; then 236 | echo "Hadoop must be installed. Aborting." 237 | exit 1 238 | fi 239 | 240 | cp $HADOOP_CONF_DIR/*.xml $PIG_CONF_DIR/ 241 | 242 | echo "export PIG_HOME=$PIG_HOME" >> ~root/.bashrc 243 | echo 'export PATH=$JAVA_HOME/bin:$PIG_HOME/bin:$PATH' >> ~root/.bashrc 244 | fi 245 | } 246 | 247 | function prep_disk() { 248 | mount=$1 249 | device=$2 250 | automount=${3:-false} 251 | 252 | echo "warning: ERASING CONTENTS OF $device" 253 | mkfs.xfs -f $device 254 | if [ ! -e $mount ]; then 255 | mkdir $mount 256 | fi 257 | mount -o defaults,noatime $device $mount 258 | if $automount ; then 259 | echo "$device $mount xfs defaults,noatime 0 0" >> /etc/fstab 260 | fi 261 | } 262 | 263 | function wait_for_mount { 264 | mount=$1 265 | device=$2 266 | 267 | mkdir $mount 268 | 269 | i=1 270 | echo "Attempting to mount $device" 271 | while true ; do 272 | sleep 10 273 | echo -n "$i " 274 | i=$[$i+1] 275 | mount -o defaults,noatime $device $mount || continue 276 | echo " Mounted." 277 | break; 278 | done 279 | } 280 | 281 | function make_hadoop_dirs { 282 | for mount in "$@"; do 283 | if [ ! -e $mount/hadoop ]; then 284 | mkdir -p $mount/hadoop 285 | chown hadoop:hadoop $mount/hadoop 286 | fi 287 | done 288 | } 289 | 290 | # Configure Hadoop by setting up disks and site file 291 | function configure_hadoop() { 292 | #set up hadoop's env. to have the same path and vars as root 293 | #this ensures that commands work correctly when the user su's to hadoop 294 | cp ~/.bash_profile /home/hadoop/ 295 | cp ~/.bashrc /home/hadoop/ 296 | chown hadoop /home/hadoop/.bash* 297 | chgrp hadoop /home/hadoop/.bash* 298 | 299 | install_packages xfsprogs # needed for XFS 300 | 301 | INSTANCE_TYPE=`wget -q -O - http://169.254.169.254/latest/meta-data/instance-type` 302 | 303 | if [ -n "$EBS_MAPPINGS" ]; then 304 | # EBS_MAPPINGS is like "nn,/ebs1,/dev/sdj;dn,/ebs2,/dev/sdk" 305 | # EBS_MAPPINGS is like "ROLE,MOUNT_POINT,DEVICE;ROLE,MOUNT_POINT,DEVICE" 306 | DFS_NAME_DIR='' 307 | FS_CHECKPOINT_DIR='' 308 | DFS_DATA_DIR='' 309 | for mapping in $(echo "$EBS_MAPPINGS" | tr ";" "\n"); do 310 | role=`echo $mapping | cut -d, -f1` 311 | mount=`echo $mapping | cut -d, -f2` 312 | device=`echo $mapping | cut -d, -f3` 313 | wait_for_mount $mount $device 314 | DFS_NAME_DIR=${DFS_NAME_DIR},"$mount/hadoop/hdfs/name" 315 | FS_CHECKPOINT_DIR=${FS_CHECKPOINT_DIR},"$mount/hadoop/hdfs/secondary" 316 | DFS_DATA_DIR=${DFS_DATA_DIR},"$mount/hadoop/hdfs/data" 317 | FIRST_MOUNT=${FIRST_MOUNT-$mount} 318 | make_hadoop_dirs $mount 319 | done 320 | # Remove leading commas 321 | DFS_NAME_DIR=${DFS_NAME_DIR#?} 322 | FS_CHECKPOINT_DIR=${FS_CHECKPOINT_DIR#?} 323 | DFS_DATA_DIR=${DFS_DATA_DIR#?} 324 | 325 | DFS_REPLICATION=3 # EBS is internally replicated, but we also use HDFS replication for safety 326 | else 327 | case $INSTANCE_TYPE in 328 | m2.2xlarge) 329 | DFS_NAME_DIR=/mnt/hadoop/hdfs/name 330 | FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary 331 | DFS_DATA_DIR=/mnt/hadoop/hdfs/data 332 | ;; 333 | m1.xlarge|c1.xlarge) 334 | DFS_NAME_DIR=/mnt/hadoop/hdfs/name,/mnt2/hadoop/hdfs/name 335 | FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary,/mnt2/hadoop/hdfs/secondary 336 | DFS_DATA_DIR=/mnt/hadoop/hdfs/data,/mnt2/hadoop/hdfs/data,/mnt3/hadoop/hdfs/data,/mnt4/hadoop/hdfs/data 337 | ;; 338 | m1.large) 339 | DFS_NAME_DIR=/mnt/hadoop/hdfs/name,/mnt2/hadoop/hdfs/name 340 | 
FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary,/mnt2/hadoop/hdfs/secondary 341 | DFS_DATA_DIR=/mnt/hadoop/hdfs/data,/mnt2/hadoop/hdfs/data 342 | ;; 343 | *) 344 | # "m1.small" or "c1.medium" 345 | DFS_NAME_DIR=/mnt/hadoop/hdfs/name 346 | FS_CHECKPOINT_DIR=/mnt/hadoop/hdfs/secondary 347 | DFS_DATA_DIR=/mnt/hadoop/hdfs/data 348 | ;; 349 | esac 350 | FIRST_MOUNT=/mnt 351 | DFS_REPLICATION=3 352 | fi 353 | 354 | case $INSTANCE_TYPE in 355 | m2.2xlarge) 356 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local 357 | MAX_MAP_TASKS=5 358 | MAX_REDUCE_TASKS=3 359 | CHILD_OPTS=-Xmx2000m 360 | CHILD_ULIMIT=4000000 361 | IO_SORT_FACTOR=25 362 | IO_SORT_MB=250 363 | ;; 364 | m1.xlarge|c1.xlarge) 365 | prep_disk /mnt2 /dev/sdc true & 366 | disk2_pid=$! 367 | prep_disk /mnt3 /dev/sdd true & 368 | disk3_pid=$! 369 | prep_disk /mnt4 /dev/sde true & 370 | disk4_pid=$! 371 | wait $disk2_pid $disk3_pid $disk4_pid 372 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local,/mnt2/hadoop/mapred/local,/mnt3/hadoop/mapred/local,/mnt4/hadoop/mapred/local 373 | MAX_MAP_TASKS=4 374 | MAX_REDUCE_TASKS=2 375 | CHILD_OPTS=-Xmx2000m 376 | CHILD_ULIMIT=4000000 377 | IO_SORT_FACTOR=20 378 | IO_SORT_MB=200 379 | ;; 380 | m1.large) 381 | prep_disk /mnt2 /dev/sdc true 382 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local,/mnt2/hadoop/mapred/local 383 | MAX_MAP_TASKS=2 384 | MAX_REDUCE_TASKS=1 385 | CHILD_OPTS=-Xmx2000m 386 | CHILD_ULIMIT=4000000 387 | IO_SORT_FACTOR=10 388 | IO_SORT_MB=100 389 | ;; 390 | c1.medium) 391 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local 392 | MAX_MAP_TASKS=4 393 | MAX_REDUCE_TASKS=2 394 | CHILD_OPTS=-Xmx550m 395 | CHILD_ULIMIT=1126400 396 | IO_SORT_FACTOR=10 397 | IO_SORT_MB=100 398 | ;; 399 | *) 400 | # "m1.small" 401 | MAPRED_LOCAL_DIR=/mnt/hadoop/mapred/local 402 | MAX_MAP_TASKS=2 403 | MAX_REDUCE_TASKS=1 404 | CHILD_OPTS=-Xmx550m 405 | CHILD_ULIMIT=1126400 406 | IO_SORT_FACTOR=10 407 | IO_SORT_MB=100 408 | ;; 409 | esac 410 | 411 | make_hadoop_dirs `ls -d /mnt*` 412 | 413 | # Create tmp directory 414 | mkdir /mnt/tmp 415 | chmod a+rwxt /mnt/tmp 416 | 417 | mkdir /etc/hadoop 418 | ln -s $HADOOP_CONF_DIR /etc/hadoop/conf 419 | 420 | ############################################################################## 421 | # Modify this section to customize your Hadoop cluster. 
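# Note that the hadoop-site.xml written below also raises
# dfs.datanode.max.xcievers and the namenode/jobtracker handler counts,
# settings HBase region servers depend on under concurrent load.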
422 | ############################################################################## 423 | cat > $HADOOP_CONF_DIR/hadoop-site.xml < 425 | 426 | 427 | 428 | dfs.block.size 429 | 134217728 430 | true 431 | 432 | 433 | dfs.data.dir 434 | $DFS_DATA_DIR 435 | true 436 | 437 | 438 | dfs.datanode.du.reserved 439 | 1073741824 440 | true 441 | 442 | 443 | dfs.datanode.handler.count 444 | $DFS_DATANODE_HANDLER_COUNT 445 | true 446 | 447 | 452 | 453 | dfs.hosts.exclude 454 | $HADOOP_CONF_DIR/exclude 455 | true 456 | 457 | 458 | mapred.hosts.exclude 459 | $HADOOP_CONF_DIR/exclude 460 | true 461 | 462 | 463 | dfs.name.dir 464 | $DFS_NAME_DIR 465 | true 466 | 467 | 468 | dfs.namenode.handler.count 469 | 64 470 | true 471 | 472 | 473 | dfs.permissions 474 | true 475 | true 476 | 477 | 478 | dfs.replication 479 | $DFS_REPLICATION 480 | 481 | 482 | dfs.datanode.max.xcievers 483 | $DFS_DATANODE_MAX_XCIEVERS 484 | 485 | 486 | fs.checkpoint.dir 487 | $FS_CHECKPOINT_DIR 488 | true 489 | 490 | 491 | fs.default.name 492 | hdfs://$NN_HOST:8020/ 493 | 494 | 495 | fs.trash.interval 496 | 1440 497 | true 498 | 499 | 500 | hadoop.tmp.dir 501 | /mnt/tmp/hadoop-\${user.name} 502 | true 503 | 504 | 505 | io.file.buffer.size 506 | 65536 507 | 508 | 509 | io.sort.factor 510 | $IO_SORT_FACTOR 511 | 512 | 513 | io.sort.mb 514 | $IO_SORT_MB 515 | 516 | 517 | mapred.child.java.opts 518 | $CHILD_OPTS 519 | 520 | 521 | mapred.child.ulimit 522 | $CHILD_ULIMIT 523 | true 524 | 525 | 526 | mapred.job.tracker 527 | $JT_HOST:8021 528 | 529 | 530 | mapred.job.tracker.handler.count 531 | 64 532 | true 533 | 534 | 535 | mapred.local.dir 536 | $MAPRED_LOCAL_DIR 537 | true 538 | 539 | 540 | mapred.map.tasks.speculative.execution 541 | true 542 | 543 | 544 | mapred.reduce.parallel.copies 545 | 10 546 | 547 | 548 | mapred.reduce.tasks 549 | $CLUSTER_SIZE 550 | 551 | 552 | mapred.reduce.tasks.speculative.execution 553 | false 554 | 555 | 556 | mapred.submit.replication 557 | 10 558 | 559 | 560 | mapred.system.dir 561 | /hadoop/system/mapred 562 | 563 | 564 | mapred.tasktracker.map.tasks.maximum 565 | $MAX_MAP_TASKS 566 | true 567 | 568 | 569 | mapred.tasktracker.reduce.tasks.maximum 570 | $MAX_REDUCE_TASKS 571 | true 572 | 573 | 574 | tasktracker.http.threads 575 | 40 576 | true 577 | 578 | 579 | mapred.output.compress 580 | true 581 | 582 | 583 | mapred.compress.map.output 584 | true 585 | 586 | 587 | mapred.output.compression.type 588 | BLOCK 589 | 590 | 591 | hadoop.rpc.socket.factory.class.default 592 | org.apache.hadoop.net.StandardSocketFactory 593 | true 594 | 595 | 596 | hadoop.rpc.socket.factory.class.ClientProtocol 597 | 598 | true 599 | 600 | 601 | hadoop.rpc.socket.factory.class.JobSubmissionProtocol 602 | 603 | true 604 | 605 | 606 | io.compression.codecs 607 | org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec 608 | 609 | 610 | EOF 611 | 612 | # Keep PID files in a non-temporary directory 613 | sed -i -e "s|# export HADOOP_PID_DIR=.*|export HADOOP_PID_DIR=/var/run/hadoop|" \ 614 | $HADOOP_CONF_DIR/hadoop-env.sh 615 | mkdir -p /var/run/hadoop 616 | chown -R hadoop:hadoop /var/run/hadoop 617 | 618 | # Set SSH options within the cluster 619 | sed -i -e 's|# export HADOOP_SSH_OPTS=.*|export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no"|' \ 620 | $HADOOP_CONF_DIR/hadoop-env.sh 621 | 622 | # Hadoop logs should be on the /mnt partition 623 | sed -i -e 's|# export HADOOP_LOG_DIR=.*|export HADOOP_LOG_DIR=/var/log/hadoop/logs|' \ 624 | $HADOOP_CONF_DIR/hadoop-env.sh 625 | 626 | rm -rf /var/log/hadoop 627 
| mkdir /mnt/hadoop/logs 628 | chown hadoop:hadoop /mnt/hadoop/logs 629 | ln -s /mnt/hadoop/logs /var/log/hadoop 630 | chown -R hadoop:hadoop /var/log/hadoop 631 | 632 | } 633 | 634 | # Sets up the HBase configuration 635 | function configure_hbase() { 636 | 637 | ############################################################################## 638 | # Modify this section to customize your HBase cluster. 639 | ############################################################################## 640 | 641 | HBASE_TMP_DIR=/mnt/hbase 642 | mkdir $HBASE_TMP_DIR 643 | chown hadoop:hadoop $HBASE_TMP_DIR 644 | 645 | ZOOKEEPER_DATA_DIR=/mnt/hbase/zk 646 | mkdir $ZOOKEEPER_DATA_DIR 647 | chown hadoop:hadoop $ZOOKEEPER_DATA_DIR 648 | 649 | cat > $HBASE_CONF_DIR/hbase-site.xml < 651 | 652 | 653 | 654 | hbase.rootdir 655 | hdfs://$NN_HOST:8020/hbase 656 | 657 | 658 | hbase.cluster.distributed 659 | true 660 | 661 | 662 | hbase.regionserver.handler.count 663 | 200 664 | 665 | 666 | hbase.tmp.dir 667 | $HBASE_TMP_DIR 668 | 669 | 670 | dfs.replication 671 | $DFS_REPLICATION 672 | 673 | 674 | 675 | hbase.zookeeper.quorum 676 | $ZOOKEEPER_QUORUM 677 | 678 | 679 | zookeeper.session.timeout 680 | 60000 681 | 682 | 683 | hbase.zookeeper.property.dataDir 684 | $ZOOKEEPER_DATA_DIR 685 | 686 | 687 | hbase.zookeeper.property.maxClientCnxns 688 | 100 689 | 690 | 691 | EOF 692 | 693 | # Override JVM options - use 2G heap for master and 8G for region servers 694 | cat >> $HBASE_CONF_DIR/hbase-env.sh < /dev/null; then 709 | apt-get -y install thttpd 710 | WWW_BASE=/var/www 711 | elif which rpm &> /dev/null; then 712 | yum install -y thttpd 713 | chkconfig --add thttpd 714 | WWW_BASE=/var/www/thttpd/html 715 | fi 716 | 717 | cat > $WWW_BASE/index.html << END 718 | 719 | 720 | Hadoop EC2 Cluster 721 | 722 | 723 |

<h1>Hadoop EC2 Cluster</h1>

724 | To browse the cluster you need to have a proxy configured. 725 | Start the proxy with hadoop-ec2 proxy <cluster_name>, 726 | and point your browser to 727 | this Proxy 728 | Auto-Configuration (PAC) file. To manage multiple proxy configurations, 729 | you may wish to use 730 | FoxyProxy. 731 | 735 | 736 | 737 | END 738 | 739 | service thttpd start 740 | 741 | } 742 | 743 | function start_namenode() { 744 | 745 | # Format HDFS 746 | [ ! -e $FIRST_MOUNT/hadoop/hdfs ] && $AS_HADOOP "$HADOOP_HOME/bin/hadoop namenode -format" 747 | 748 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start namenode" 749 | 750 | #$AS_HADOOP "$HADOOP_HOME/bin/hadoop dfsadmin -safemode wait" 751 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -mkdir /user" 752 | # The following is questionable, as it allows a user to delete another user 753 | # It's needed to allow users to create their own user directories 754 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop fs -chmod +w /user" 755 | 756 | } 757 | 758 | function start_daemon() { 759 | $AS_HADOOP "$HADOOP_HOME/bin/hadoop-daemon.sh start $1" 760 | } 761 | 762 | # Launch the Zookeeper and the HBase master node - these must be started 763 | # before adding region servers 764 | function start_master() { 765 | #Start the zookeeper process first 766 | $AS_HADOOP "$HBASE_HOME/bin/hbase-daemon.sh start zookeeper" 767 | #Then start the master 768 | $AS_HADOOP "$HBASE_HOME/bin/hbase-daemon.sh start master" 769 | } 770 | 771 | # Launch a region server 772 | function start_region() { 773 | $AS_HADOOP "$HBASE_HOME/bin/hbase-daemon.sh start regionserver" 774 | } 775 | 776 | register_auto_shutdown 777 | install_user_packages 778 | install_hadoop 779 | install_hbase 780 | install_zookeeper 781 | configure_hadoop 782 | configure_hbase 783 | install_pig 784 | 785 | for role in $(echo "$ROLES" | tr "," "\n"); do 786 | case $role in 787 | nn) 788 | setup_web 789 | start_namenode 790 | start_master 791 | ;; 792 | snn) 793 | start_daemon secondarynamenode 794 | ;; 795 | jt) 796 | start_daemon jobtracker 797 | ;; 798 | dn) 799 | start_daemon datanode 800 | start_region 801 | ;; 802 | tt) 803 | start_daemon tasktracker 804 | if [ ! -z "$INSTALL_PROFILER" ]; then 805 | install_yourkit 806 | fi 807 | ;; 808 | esac 809 | done 810 | 811 | -------------------------------------------------------------------------------- /example_scripts/zookeeper-ec2-init-remote.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -x 2 | 3 | # Licensed to the Apache Software Foundation (ASF) under one or more 4 | # contributor license agreements. See the NOTICE file distributed with 5 | # this work for additional information regarding copyright ownership. 6 | # The ASF licenses this file to You under the Apache License, Version 2.0 7 | # (the "License"); you may not use this file except in compliance with 8 | # the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | ################################################################################ 19 | # Script that is run on each EC2 instance on boot. 
It is passed in the EC2 user 20 | # data, so should not exceed 16K in size after gzip compression. 21 | # 22 | # This script is executed by /etc/init.d/ec2-run-user-data, and output is 23 | # logged to /var/log/messages. 24 | ################################################################################ 25 | 26 | ################################################################################ 27 | # Initialize variables 28 | ################################################################################ 29 | 30 | # Substitute environment variables passed by the client 31 | export %ENV% 32 | 33 | ZK_VERSION=${ZK_VERSION:-3.2.2} 34 | ZOOKEEPER_HOME=/usr/local/zookeeper-$ZK_VERSION 35 | ZK_CONF_DIR=/etc/zookeeper/conf 36 | 37 | function register_auto_shutdown() { 38 | if [ ! -z "$AUTO_SHUTDOWN" ]; then 39 | shutdown -h +$AUTO_SHUTDOWN >/dev/null & 40 | fi 41 | } 42 | 43 | # Install a list of packages on debian or redhat as appropriate 44 | function install_packages() { 45 | if which dpkg &> /dev/null; then 46 | apt-get update 47 | apt-get -y install $@ 48 | elif which rpm &> /dev/null; then 49 | yum install -y $@ 50 | else 51 | echo "No package manager found." 52 | fi 53 | } 54 | 55 | # Install any user packages specified in the USER_PACKAGES environment variable 56 | function install_user_packages() { 57 | if [ ! -z "$USER_PACKAGES" ]; then 58 | install_packages $USER_PACKAGES 59 | fi 60 | } 61 | 62 | function install_zookeeper() { 63 | zk_tar_url=http://www.apache.org/dist/hadoop/zookeeper/zookeeper-$ZK_VERSION/zookeeper-$ZK_VERSION.tar.gz 64 | zk_tar_file=`basename $zk_tar_url` 65 | zk_tar_md5_file=`basename $zk_tar_url.md5` 66 | 67 | curl="curl --retry 3 --silent --show-error --fail" 68 | for i in `seq 1 3`; 69 | do 70 | $curl -O $zk_tar_url 71 | $curl -O $zk_tar_url.md5 72 | if md5sum -c $zk_tar_md5_file; then 73 | break; 74 | else 75 | rm -f $zk_tar_file $zk_tar_md5_file 76 | fi 77 | done 78 | 79 | if [ ! -e $zk_tar_file ]; then 80 | echo "Failed to download $zk_tar_url. Aborting." 
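# All three md5-verified download attempts in the retry loop above failed, so
# stop here rather than bring the node up without a ZooKeeper tarball.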
81 | exit 1 82 | fi 83 | 84 | tar zxf $zk_tar_file -C /usr/local 85 | rm -f $zk_tar_file $zk_tar_md5_file 86 | 87 | echo "export ZOOKEEPER_HOME=$ZOOKEEPER_HOME" >> ~root/.bashrc 88 | echo 'export PATH=$JAVA_HOME/bin:$ZOOKEEPER_HOME/bin:$PATH' >> ~root/.bashrc 89 | } 90 | 91 | function configure_zookeeper() { 92 | mkdir -p /mnt/zookeeper/logs 93 | ln -s /mnt/zookeeper/logs /var/log/zookeeper 94 | mkdir -p /var/log/zookeeper/txlog 95 | mkdir -p $ZK_CONF_DIR 96 | cp $ZOOKEEPER_HOME/conf/log4j.properties $ZK_CONF_DIR 97 | 98 | sed -i -e "s|log4j.rootLogger=INFO, CONSOLE|log4j.rootLogger=INFO, ROLLINGFILE|" \ 99 | -e "s|log4j.appender.ROLLINGFILE.File=zookeeper.log|log4j.appender.ROLLINGFILE.File=/var/log/zookeeper/zookeeper.log|" \ 100 | $ZK_CONF_DIR/log4j.properties 101 | 102 | # Ensure ZooKeeper starts on boot 103 | cat > /etc/rc.local < /dev/null 2>&1 & 105 | EOF 106 | 107 | } 108 | 109 | register_auto_shutdown 110 | install_user_packages 111 | install_zookeeper 112 | configure_zookeeper 113 | -------------------------------------------------------------------------------- /plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digitalreasoning/PyStratus/c7e25c9e7dcc5a98f8d317c0f9f0985fbf79ca59/plugins/__init__.py -------------------------------------------------------------------------------- /plugins/cassandra/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digitalreasoning/PyStratus/c7e25c9e7dcc5a98f8d317c0f9f0985fbf79ca59/plugins/cassandra/__init__.py -------------------------------------------------------------------------------- /plugins/cassandra/cli.plugin: -------------------------------------------------------------------------------- 1 | [Core] 2 | Name = cassandra 3 | Module = cli 4 | 5 | [Documentation] 6 | Author = Abe Music 7 | Website = http://github.com/digitalreasoning/PyStratus 8 | Description = A Cassandra CLI implementation for PyStratus 9 | 10 | -------------------------------------------------------------------------------- /plugins/cassandra/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | import urllib 4 | 5 | from optparse import make_option 6 | 7 | from cloud.plugin import CLIPlugin 8 | from cloud.plugin import BASIC_OPTIONS 9 | from cloud.service import InstanceTemplate 10 | from cloud.util import log_cluster_action 11 | from optparse import make_option 12 | from prettytable import PrettyTable 13 | from pprint import pprint 14 | 15 | # Add options here to override what's in the clusters.cfg file 16 | # TODO 17 | 18 | class CassandraServiceCLI(CLIPlugin): 19 | USAGE = """Cassandra service usage: CLUSTER COMMAND [OPTIONS] 20 | where COMMAND and [OPTIONS] may be one of: 21 | 22 | CASSANDRA COMMANDS 23 | ---------------------------------------------------------------------------------- 24 | start-cassandra starts the cassandra service on all nodes 25 | stop-cassandra stops the cassandra service on all nodes 26 | print-ring [INSTANCE_IDX] displays the cluster's ring information 27 | rebalance recalculates tokens evenly and moves nodes 28 | remove-down-nodes removes nodes that are down from the ring 29 | 30 | CLUSTER COMMANDS 31 | ---------------------------------------------------------------------------------- 32 | details list instances in CLUSTER 33 | launch-cluster NUM_NODES launch NUM_NODES Cassandra nodes 34 | expand-cluster 
NUM_NODES adds new nodes 35 | terminate-cluster terminate all instances in CLUSTER 36 | login log in to the master in CLUSTER over SSH 37 | 38 | STORAGE COMMANDS 39 | ---------------------------------------------------------------------------------- 40 | list-storage list storage volumes for CLUSTER 41 | create-storage NUM_INSTANCES create volumes for NUM_INSTANCES instances 42 | SPEC_FILE for CLUSTER, using SPEC_FILE 43 | delete-storage delete all storage volumes for CLUSTER 44 | """ 45 | 46 | def __init__(self): 47 | super(CassandraServiceCLI, self).__init__() 48 | 49 | #self._logger = logging.getLogger("CassandraServiceCLI") 50 | 51 | def execute_command(self, argv, options_dict): 52 | if len(argv) < 2: 53 | self.print_help() 54 | 55 | self._cluster_name = argv[0] 56 | self._command_name = argv[1] 57 | 58 | # strip off the cluster name and command from argv 59 | argv = argv[2:] 60 | 61 | # handle all known commands and error on an unknown command 62 | if self._command_name == "details": 63 | self.print_instances() 64 | 65 | elif self._command_name == "simple-details": 66 | self.simple_print_instances(argv, options_dict) 67 | 68 | elif self._command_name == "terminate-cluster": 69 | self.terminate_cluster(argv, options_dict) 70 | 71 | elif self._command_name == "launch-cluster": 72 | self.launch_cluster(argv, options_dict) 73 | 74 | elif self._command_name == "expand-cluster": 75 | self.expand_cluster(argv, options_dict) 76 | 77 | elif self._command_name == "replace-down-nodes": 78 | self.replace_down_nodes(argv, options_dict) 79 | 80 | elif self._command_name == "login": 81 | self.login(argv, options_dict) 82 | 83 | elif self._command_name == "run-command": 84 | self.run_command(argv, options_dict) 85 | 86 | elif self._command_name == "transfer-files": 87 | self.transfer_files(argv, options_dict) 88 | 89 | elif self._command_name == "create-storage": 90 | self.create_storage(argv, options_dict) 91 | 92 | elif self._command_name == "delete-storage": 93 | self.delete_storage(argv, options_dict) 94 | 95 | elif self._command_name == "list-storage": 96 | self.print_storage() 97 | 98 | elif self._command_name == "stop-cassandra": 99 | self.stop_cassandra(argv, options_dict) 100 | 101 | elif self._command_name == "start-cassandra": 102 | self.start_cassandra(argv, options_dict) 103 | 104 | elif self._command_name == "print-ring": 105 | self.print_ring(argv, options_dict) 106 | 107 | elif self._command_name == "hack-config-for-multi-region": 108 | self.hack_config_for_multi_region(argv, options_dict) 109 | 110 | elif self._command_name == "rebalance": 111 | self.rebalance(argv, options_dict) 112 | 113 | elif self._command_name == "remove-down-nodes": 114 | self.remove_down_nodes(argv, options_dict) 115 | else: 116 | self.print_help() 117 | 118 | def expand_cluster(self, argv, options_dict): 119 | expected_arguments = ["NUM_INSTANCES"] 120 | opt, args = self.parse_options(self._command_name, 121 | argv, 122 | expected_arguments=expected_arguments, 123 | unbounded_args=True) 124 | opt.update(options_dict) 125 | 126 | number_of_nodes = int(args[0]) 127 | instance_template = InstanceTemplate( 128 | (self.service.CASSANDRA_NODE,), 129 | number_of_nodes, 130 | opt.get('image_id'), 131 | opt.get('instance_type'), 132 | opt.get('key_name'), 133 | opt.get('public_key'), 134 | opt.get('user_data_file'), 135 | opt.get('availability_zone'), 136 | opt.get('user_packages'), 137 | opt.get('auto_shutdown'), 138 | opt.get('env'), 139 | opt.get('security_groups')) 140 | # 
instance_template.add_env_strings(["CLUSTER_SIZE=%d" % number_of_nodes]) 141 | 142 | print "Expanding cluster by %d instance(s)...please wait." % number_of_nodes 143 | 144 | self.service.expand_cluster(instance_template) 145 | 146 | def replace_down_nodes(self, argv, options_dict): 147 | opt, args = self.parse_options(self._command_name, 148 | argv) 149 | opt.update(options_dict) 150 | 151 | # test files 152 | for key in ['cassandra_config_file']: 153 | if opt.get(key) is not None: 154 | try: 155 | url = urllib.urlopen(opt.get(key)) 156 | data = url.read() 157 | except: 158 | raise 159 | print "The file defined by %s (%s) does not exist. Aborting." % (key, opt.get(key)) 160 | sys.exit(1) 161 | 162 | number_of_nodes = len(self.service.calc_down_nodes()) 163 | instance_template = InstanceTemplate( 164 | (self.service.CASSANDRA_NODE,), 165 | number_of_nodes, 166 | opt.get('image_id'), 167 | opt.get('instance_type'), 168 | opt.get('key_name'), 169 | opt.get('public_key'), 170 | opt.get('user_data_file'), 171 | opt.get('availability_zone'), 172 | opt.get('user_packages'), 173 | opt.get('auto_shutdown'), 174 | opt.get('env'), 175 | opt.get('security_groups')) 176 | # instance_template.add_env_strings(["CLUSTER_SIZE=%d" % number_of_nodes]) 177 | 178 | print "Replacing %d down instance(s)...please wait." % number_of_nodes 179 | 180 | self.service.replace_down_nodes(instance_template, 181 | opt.get('cassandra_config_file')) 182 | 183 | def launch_cluster(self, argv, options_dict): 184 | """ 185 | """ 186 | expected_arguments = ["NUM_INSTANCES"] 187 | opt, args = self.parse_options(self._command_name, 188 | argv, 189 | expected_arguments=expected_arguments) 190 | opt.update(options_dict) 191 | 192 | if self.service.get_instances() : 193 | print "This cluster is already running. It must be terminated prior to being launched again." 194 | sys.exit(1) 195 | 196 | number_of_nodes = int(args[0]) 197 | instance_template = InstanceTemplate( 198 | (self.service.CASSANDRA_NODE,), 199 | number_of_nodes, 200 | opt.get('image_id'), 201 | opt.get('instance_type'), 202 | opt.get('key_name'), 203 | opt.get('public_key'), 204 | opt.get('user_data_file'), 205 | opt.get('availability_zone'), 206 | opt.get('user_packages'), 207 | opt.get('auto_shutdown'), 208 | opt.get('env'), 209 | opt.get('security_groups')) 210 | instance_template.add_env_strings(["CLUSTER_SIZE=%d" % number_of_nodes]) 211 | 212 | print "Launching cluster with %d instance(s)...please wait." % number_of_nodes 213 | 214 | self.service.launch_cluster(instance_template, opt) 215 | 216 | 217 | log_cluster_action(opt.get('config_dir'), self._cluster_name, 218 | "launch-cluster", number_of_nodes, opt.get("instance_type"), 219 | None, "cassandra") 220 | 221 | def stop_cassandra(self, argv, options_dict): 222 | instances = self.service.get_instances() 223 | if not instances: 224 | print "No running instances. Aborting." 225 | sys.exit(1) 226 | 227 | print "Stopping Cassandra service on %d instance(s)...please wait." % len(instances) 228 | self.service.stop_cassandra(instances=instances) 229 | 230 | def start_cassandra(self, argv, options_dict): 231 | instances = self.service.get_instances() 232 | if not instances: 233 | print "No running instances. Aborting." 234 | sys.exit(1) 235 | 236 | print "Starting Cassandra service on %d instance(s)...please wait." 
% len(instances) 237 | self.service.start_cassandra(instances=instances) 238 | 239 | def print_ring(self, argv, options_dict): 240 | instances = self.service.get_instances() 241 | if not instances: 242 | print("No running instances. Aborting.") 243 | sys.exit(1) 244 | 245 | idx = 0 246 | if len(argv) > 0 : 247 | idx = int(argv[0]) 248 | 249 | print(self.service.print_ring(instances[idx])) 250 | 251 | def hack_config_for_multi_region(self, argv, options_dict): 252 | instances = self.service.get_instances() 253 | if not instances: 254 | print "No running instances. Aborting." 255 | sys.exit(1) 256 | 257 | opt_list = BASIC_OPTIONS + [make_option("--seeds", metavar="SEEDS", action="store", type="str", default="", help="explicit comma separated seed list")] 258 | opt, args = self.parse_options(self._command_name, argv, opt_list) 259 | 260 | self.service.hack_config_for_multi_region(options_dict.get('ssh_options'), opt['seeds']) 261 | 262 | def rebalance(self, argv, options_dict): 263 | instances = self.service.get_instances() 264 | if not instances: 265 | print "No running instances. Aborting." 266 | sys.exit(1) 267 | 268 | opt, args = self.parse_options(self._command_name, argv, [make_option("--offset", metavar="OFFSET", action="store", type=int, default=0, help="token offset")]) 269 | self.service.rebalance(offset=opt['offset']) 270 | 271 | def remove_down_nodes(self, argv, options_dict): 272 | instances = self.service.get_instances() 273 | if not instances: 274 | print "No running instances. Aborting." 275 | sys.exit(1) 276 | 277 | self.service.remove_down_nodes() 278 | 279 | def create_storage(self, argv, options_dict): 280 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS, 281 | ["NUM_INSTANCES", "SPEC_FILE"]) 282 | opt.update(options_dict) 283 | 284 | role = self.service.CASSANDRA_NODE 285 | number_of_instances = int(args[0]) 286 | spec_file = args[1] 287 | 288 | # FIXME 289 | # check_options_set(opt, ['availability_zone']) 290 | 291 | self.service.create_storage(role, 292 | number_of_instances, 293 | opt.get('availability_zone'), 294 | spec_file) 295 | self.print_storage() 296 | -------------------------------------------------------------------------------- /plugins/cassandra/service.plugin: -------------------------------------------------------------------------------- 1 | [Core] 2 | Name = cassandra 3 | Module = service 4 | 5 | [Documentation] 6 | Author = Abe Music 7 | Website = http://github.com/digitalreasoning/PyStratus 8 | Description = A Cassandra service implementation for PyStratus 9 | -------------------------------------------------------------------------------- /plugins/hadoop/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digitalreasoning/PyStratus/c7e25c9e7dcc5a98f8d317c0f9f0985fbf79ca59/plugins/hadoop/__init__.py -------------------------------------------------------------------------------- /plugins/hadoop/cli.plugin: -------------------------------------------------------------------------------- 1 | [Core] 2 | Name = hadoop 3 | Module = cli 4 | 5 | [Documentation] 6 | Author = Abe Music 7 | Website = http://github.com/digitalreasoning/PyStratus 8 | Description = A Hadoop CLI implementation for PyStratus 9 | 10 | -------------------------------------------------------------------------------- /plugins/hadoop/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import logging 4 | import urllib 5 | 
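# CLI plugin for the Hadoop service: execute_command() below parses
# "CLUSTER COMMAND [OPTIONS]" (see USAGE) and dispatches each COMMAND to the
# corresponding service method.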
6 | from cloud.plugin import CLIPlugin 7 | from cloud.plugin import BASIC_OPTIONS 8 | from cloud.service import InstanceTemplate 9 | from cloud.util import log_cluster_action 10 | from optparse import make_option 11 | from prettytable import PrettyTable 12 | 13 | class HadoopServiceCLI(CLIPlugin): 14 | USAGE = """Hadoop service usage: CLUSTER COMMAND [OPTIONS] 15 | where COMMAND and [OPTIONS] may be one of: 16 | 17 | HADOOP COMMANDS 18 | ---------------------------------------------------------------------------------- 19 | launch-master launch or find a master in CLUSTER 20 | launch-slaves NUM_SLAVES launch NUM_SLAVES slaves in CLUSTER 21 | terminate-dead-nodes find and terminate dead nodes in CLUSTER 22 | start-hadoop starts all processes on namenode and datanodes 23 | stop-hadoop stops all processes on namenode and datanodes 24 | send-config-files sends the given config files to each node and 25 | overwrites the existing file in the hadoop 26 | conf directory (BE CAREFUL!) 27 | get-config-files gets the given config files from the namenode 28 | and stores them in the cwd 29 | 30 | HBASE COMMANDS 31 | ---------------------------------------------------------------------------------- 32 | start-hbase starts processes on namenode and datanodes 33 | stop-hbase stops processes on namenode and datanodes 34 | send-hbase-config-files sends the given config files to each node and 35 | overwrites the existing file in the hadoop 36 | conf directory (BE CAREFUL!) 37 | get-hbase-config-files gets the given config files from the namenode 38 | and stores them in the cwd 39 | 40 | CLOUDBASE COMMANDS 41 | ---------------------------------------------------------------------------------- 42 | start-cloudbase starts processes on namenode and datanodes 43 | stop-cloudbase stops proceses on namenode and datanodes 44 | 45 | CLUSTER COMMANDS 46 | ---------------------------------------------------------------------------------- 47 | details list instances in CLUSTER 48 | launch-cluster NUM_SLAVES launch a master and NUM_SLAVES slaves in 49 | CLUSTER 50 | terminate-cluster terminate all instances in CLUSTER 51 | login log in to the master in CLUSTER over SSH 52 | proxy start a SOCKS proxy on localhost into the 53 | CLUSTER 54 | 55 | STORAGE COMMANDS 56 | ---------------------------------------------------------------------------------- 57 | list-storage list storage volumes for CLUSTER 58 | create-storage ROLE NUM_INSTANCES create volumes for NUM_INSTANCES instances of 59 | SPEC_FILE type ROLE for CLUSTER, using SPEC_FILE 60 | delete-storage delete all storage volumes for CLUSTER 61 | """ 62 | 63 | def __init__(self): 64 | super(HadoopServiceCLI, self).__init__() 65 | 66 | def execute_command(self, argv, options_dict): 67 | if len(argv) < 2: 68 | self.print_help() 69 | 70 | self._cluster_name = argv[0] 71 | self._command_name = argv[1] 72 | 73 | # strip off the cluster name and command from argv 74 | argv = argv[2:] 75 | 76 | # get spot configuration 77 | self._spot_config = { 78 | "spot_cluster": True if os.environ.get("SPOT_CLUSTER", options_dict.get("spot_cluster", "false")).lower() == "true" else False, 79 | "master_spot": True if options_dict.get("master_spot", "false").lower() == "true" else False, 80 | "max_price": options_dict.get("max_price", None), 81 | "launch_group": options_dict.get("launch_group", None), 82 | } 83 | 84 | # handle all known commands and error on an unknown command 85 | if self._command_name == "details": 86 | self.print_instances() 87 | 88 | elif self._command_name == 
"simple-details": 89 | self.simple_print_instances(argv, options_dict) 90 | 91 | elif self._command_name == "proxy": 92 | self.proxy(argv, options_dict) 93 | 94 | elif self._command_name == "terminate-cluster": 95 | self.terminate_cluster(argv, options_dict) 96 | 97 | elif self._command_name == "launch-cluster": 98 | self.launch_cluster(argv, options_dict) 99 | 100 | elif self._command_name == "terminate-dead-nodes": 101 | self.terminate_dead_nodes(argv, options_dict) 102 | 103 | elif self._command_name == "launch-master": 104 | self.launch_master(argv, options_dict) 105 | 106 | elif self._command_name == "launch-slaves": 107 | self.launch_slaves(argv, options_dict) 108 | 109 | elif self._command_name == "start-hadoop": 110 | self.start_hadoop(argv, options_dict) 111 | 112 | elif self._command_name == "stop-hadoop": 113 | self.stop_hadoop(argv, options_dict) 114 | 115 | elif self._command_name == "start-hbase": 116 | self.start_hbase(argv, options_dict) 117 | 118 | elif self._command_name == "stop-hbase": 119 | self.stop_hbase(argv, options_dict) 120 | 121 | elif self._command_name == "send-config-files": 122 | self.send_config_files(argv, options_dict) 123 | 124 | elif self._command_name == "get-config-files": 125 | self.get_config_files(argv, options_dict) 126 | 127 | elif self._command_name == "send-hbase-config-files": 128 | self.send_hbase_config_files(argv, options_dict) 129 | 130 | elif self._command_name == "get-hbase-config-files": 131 | self.get_hbase_config_files(argv, options_dict) 132 | 133 | elif self._command_name == "login": 134 | self.login(argv, options_dict) 135 | 136 | elif self._command_name == "run-command": 137 | self.run_command(argv, options_dict) 138 | 139 | elif self._command_name == "transfer-files": 140 | self.transfer_files(argv, options_dict) 141 | 142 | elif self._command_name == "create-storage": 143 | self.create_storage(argv, options_dict) 144 | 145 | elif self._command_name == "delete-storage": 146 | self.delete_storage(argv, options_dict) 147 | 148 | elif self._command_name == "list-storage": 149 | self.print_storage() 150 | 151 | elif self._command_name == "start-cloudbase": 152 | self.start_cloudbase(argv, options_dict) 153 | 154 | elif self._command_name == "stop-cloudbase": 155 | self.stop_cloudbase(argv, options_dict) 156 | 157 | else: 158 | self.print_help() 159 | 160 | def launch_cluster(self, argv, options_dict): 161 | """ 162 | """ 163 | 164 | expected_arguments = ["NUM_SLAVES"] 165 | opt, args = self.parse_options(self._command_name, 166 | argv, 167 | expected_arguments=expected_arguments) 168 | opt.update(options_dict) 169 | 170 | # if PROVIDER is set in the environment that takes precedence over 171 | # anything in the clusters.cfg; hbase is the default if nothing is set 172 | provider = os.environ.get("PROVIDER", opt.get("provider", "hbase")).lower() 173 | 174 | # default for spot clusters is for the master to NOT be spot; munging 175 | # some things around here if the opposite is specified 176 | spot_cluster_orig = self._spot_config["spot_cluster"] 177 | if spot_cluster_orig and self._spot_config["master_spot"]: 178 | self._spot_config["spot_cluster"] = True 179 | else: 180 | self._spot_config["spot_cluster"] = False 181 | 182 | number_of_slaves = int(args[0]) 183 | master_templates = [ 184 | InstanceTemplate( 185 | ( 186 | self.service.NAMENODE, 187 | self.service.SECONDARY_NAMENODE, 188 | self.service.JOBTRACKER 189 | ), 190 | 1, 191 | opt.get('image_id'), 192 | opt.get('instance_type'), 193 | opt.get('key_name'), 194 | 
opt.get('public_key'), 195 | opt.get('user_data_file'), 196 | opt.get('availability_zone'), 197 | opt.get('user_packages'), 198 | opt.get('auto_shutdown'), 199 | opt.get('env'), 200 | opt.get('security_groups'), 201 | self._spot_config) # don't want the master to be a spot instance 202 | ] 203 | for it in master_templates: 204 | it.add_env_strings([ 205 | "CLUSTER_SIZE=%d" % (number_of_slaves+1), 206 | "PROVIDER=%s" % (provider) 207 | ]) 208 | 209 | print "Using %s as the backend datastore" % (provider) 210 | 211 | print "Launching cluster with %d instance(s) - starting master...please wait." % (number_of_slaves+1) 212 | 213 | master = self.service.launch_cluster(master_templates, opt.get('client_cidr'), opt.get('config_dir')) 214 | 215 | if master is None: 216 | print "An error occurred starting the master node. Check the logs for more information." 217 | sys.exit(1) 218 | 219 | log_cluster_action(opt.get('config_dir'), self._cluster_name, 220 | "launch-cluster", 1, opt.get("instance_type"), 221 | provider, "hadoop") 222 | 223 | print "Master now running at %s - starting slaves" % master.public_dns_name 224 | 225 | self._spot_config["spot_cluster"] = spot_cluster_orig 226 | 227 | slave_templates = [ 228 | InstanceTemplate( 229 | ( 230 | self.service.DATANODE, 231 | self.service.TASKTRACKER 232 | ), 233 | number_of_slaves, 234 | opt.get('image_id'), 235 | opt.get('instance_type'), 236 | opt.get('key_name'), 237 | opt.get('public_key'), 238 | opt.get('user_data_file'), 239 | opt.get('availability_zone'), 240 | opt.get('user_packages'), 241 | opt.get('auto_shutdown'), 242 | opt.get('env'), 243 | opt.get('security_groups'), 244 | self._spot_config) 245 | ] 246 | 247 | for it in slave_templates: 248 | it.add_env_strings([ 249 | "CLUSTER_SIZE=%d" % (number_of_slaves+1), 250 | "NN_HOST=%s" % master.private_dns_name, 251 | "JT_HOST=%s" % master.private_dns_name, 252 | "ZOOKEEPER_QUORUM=%s" % master.private_dns_name, 253 | "PROVIDER=%s" % (provider) 254 | ]) 255 | 256 | print "Launching %d slave instance(s)...please wait." % (number_of_slaves) 257 | slave = self.service.launch_cluster(slave_templates, opt.get('client_cidr'), opt.get('config_dir')) 258 | 259 | if slave is None: 260 | print "An error occurred starting the slave nodes. Check the logs for more details" 261 | sys.exit(1) 262 | 263 | log_cluster_action(opt.get('config_dir'), self._cluster_name, 264 | "launch-cluster", number_of_slaves, opt.get("instance_type"), 265 | provider, "hadoop") 266 | 267 | #Once the cluster is up, if the provider is Cloudbase, we need to ensure that Cloudbase has been initialized 268 | #and launch the servers 269 | if provider == "cloudbase": 270 | 271 | #log in to the master and run a startup script 272 | print "Provider is cloudbase - starting cloudbase processes ... 
please wait" 273 | self.service.start_cloudbase(options_dict, 274 | options_dict.get("hadoop_user", "hadoop"), 275 | options_dict.get("ssh_user", "root")) 276 | 277 | print "Finished - browse the cluster at http://%s/" % master.public_dns_name 278 | 279 | self.logger.debug("Startup complete.") 280 | 281 | def launch_master(self, argv, options_dict): 282 | """Launch the master node of a CLUSTER.""" 283 | 284 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS) 285 | opt.update(options_dict) 286 | 287 | provider = opt.get("provider") 288 | if provider is None: 289 | provider = "hbase" 290 | else: 291 | provider.lower() 292 | 293 | # default for spot clusters is for the master to NOT be spot; munging 294 | # some things around here if the opposite is specified 295 | spot_cluster_orig = self._spot_config["spot_cluster"] 296 | if spot_cluster_orig and self._spot_config["master_spot"]: 297 | self._spot_config["spot_cluster"] = True 298 | else: 299 | self._spot_config["spot_cluster"] = False 300 | 301 | master_templates = [ 302 | InstanceTemplate( 303 | ( 304 | self.service.NAMENODE, 305 | self.service.SECONDARY_NAMENODE, 306 | self.service.JOBTRACKER 307 | ), 308 | 1, 309 | opt.get('image_id'), 310 | opt.get('instance_type'), 311 | opt.get('key_name'), 312 | opt.get('public_key'), 313 | opt.get('user_data_file'), 314 | opt.get('availability_zone'), 315 | opt.get('user_packages'), 316 | opt.get('auto_shutdown'), 317 | opt.get('env'), 318 | opt.get('security_groups'), 319 | self._spot_config) 320 | ] 321 | 322 | for it in master_templates: 323 | it.add_env_strings([ 324 | "PROVIDER=%s" % (provider) 325 | ]) 326 | 327 | print "Launching cluster master...please wait." 328 | jobtracker = self.service.launch_cluster(master_templates, 329 | opt.get('client_cidr'), 330 | opt.get('config_dir')) 331 | 332 | if jobtracker is None: 333 | print "An error occurred started the Hadoop service. Check the logs for more information." 
334 | sys.exit(1) 335 | 336 | print "Browse the cluster at http://%s/" % jobtracker.public_dns_name 337 | self.logger.debug("Startup complete.") 338 | 339 | def launch_slaves(self, argv, options_dict): 340 | """Launch slave/datanodes in CLUSTER.""" 341 | 342 | expected_arguments = ["NUM_SLAVES"] 343 | opt, args = self.parse_options(self._command_name, 344 | argv, 345 | expected_arguments=expected_arguments) 346 | opt.update(options_dict) 347 | 348 | provider = opt.get("provider") 349 | if provider is None: 350 | provider = "hbase" 351 | else: 352 | provider.lower() 353 | 354 | try: 355 | number_of_slaves = int(args[0]) 356 | except ValueError: 357 | print("Number of slaves must be an integer") 358 | return 359 | 360 | instance_templates = [ 361 | InstanceTemplate( 362 | ( 363 | self.service.DATANODE, 364 | self.service.TASKTRACKER 365 | ), 366 | number_of_slaves, 367 | opt.get('image_id'), 368 | opt.get('instance_type'), 369 | opt.get('key_name'), 370 | opt.get('public_key'), 371 | opt.get('user_data_file'), 372 | opt.get('availability_zone'), 373 | opt.get('user_packages'), 374 | opt.get('auto_shutdown'), 375 | opt.get('env'), 376 | opt.get('security_groups'), 377 | self._spot_config) 378 | ] 379 | 380 | # @todo - this is originally passed in when creating a cluster from 381 | # scratch, need to figure out what to do if we're growing a cluster 382 | #instance_template.add_env_strings([ 383 | # "CLUSTER_SIZE=%d" % (number_of_slaves+1) 384 | #]) 385 | 386 | print("Launching %s slave%s for %s" % (number_of_slaves, 387 | "" if number_of_slaves==1 else "s", self._cluster_name)) 388 | 389 | # this is needed to filter the jobtracker/namenode down into 390 | # hadoop-site.xml for the new nodes 391 | namenode = self.service.get_namenode() 392 | jobtracker = self.service.get_jobtracker() 393 | for instance_template in instance_templates: 394 | instance_template.add_env_strings([ 395 | "NN_HOST=%s" % namenode.public_dns_name, 396 | "JT_HOST=%s" % jobtracker.public_dns_name, 397 | "ZOOKEEPER_QUORUM=%s" % namenode.private_dns_name, 398 | "PROVIDER=%s" % (provider) 399 | ]) 400 | 401 | # I think this count can be wrong if run too soon after running 402 | # terminate_dead_nodes 403 | existing_tasktrackers = self.service.get_tasktrackers() 404 | num_tasktrackers = len(existing_tasktrackers) if existing_tasktrackers else 0 405 | self.service.launch_cluster(instance_templates, 406 | opt.get('client_cidr'), opt.get('config_dir'), 407 | num_existing_tasktrackers=num_tasktrackers) 408 | 409 | def start_cloudbase(self, argv, options_dict): 410 | """Start the various cloudbase processes on the namenode and slave nodes - initialize the cloudbase instance, if necessary""" 411 | 412 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS) 413 | opt.update(options_dict) 414 | 415 | self.service.start_cloudbase(options_dict, 416 | options_dict.get("hadoop_user", "hadoop"), 417 | options_dict.get("ssh_user", "root")) 418 | 419 | def stop_cloudbase(self, argv, options_dict): 420 | """Stop the various cloudbase processes on the namenode and slave 421 | nodes""" 422 | 423 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS) 424 | opt.update(options_dict) 425 | 426 | self.service.stop_cloudbase(options_dict) 427 | 428 | def start_hadoop(self, argv, options_dict): 429 | """Start the various processes on the namenode and slave nodes""" 430 | 431 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS) 432 | opt.update(options_dict) 433 | 434 | print "Starting 
hadoop..." 435 | self.service.start_hadoop(options_dict.get("hadoop_user", "hadoop")) 436 | 437 | def stop_hadoop(self, argv, options_dict): 438 | """Stop the various processes on the namenode and slave nodes""" 439 | 440 | x = "n" 441 | while True: 442 | try: 443 | x = raw_input("Are you sure you want to stop Hadoop? (Y/n) ").lower() 444 | if x in ["y", "n"]: 445 | break 446 | print "Value must be either y or n. Try again." 447 | except KeyboardInterrupt: 448 | x = "n" 449 | print "" 450 | break 451 | 452 | if x == "n": 453 | print "Quitting" 454 | sys.exit(1) 455 | 456 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS) 457 | opt.update(options_dict) 458 | 459 | print "Stopping hadoop..." 460 | self.service.stop_hadoop(options_dict.get("hadoop_user", "hadoop")) 461 | 462 | def start_hbase(self, argv, options_dict): 463 | """Start the various hbase processes on the namenode and slave nodes""" 464 | 465 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS) 466 | opt.update(options_dict) 467 | 468 | print "Starting hbase..." 469 | self.service.start_hbase(options_dict.get("hadoop_user", "hadoop")) 470 | 471 | def stop_hbase(self, argv, options_dict): 472 | """Stop the various hbase processes on the namenode and slave nodes""" 473 | 474 | x = "n" 475 | while True: 476 | try: 477 | x = raw_input("Are you sure you want to stop HBase? (Y/n) ").lower() 478 | if x in ["y", "n"]: 479 | break 480 | print "Value must be either y or n. Try again." 481 | except KeyboardInterrupt: 482 | x = "n" 483 | print "" 484 | break 485 | 486 | if x == "n": 487 | print "Quitting" 488 | sys.exit(1) 489 | 490 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS) 491 | opt.update(options_dict) 492 | 493 | print "Stopping hbase..." 494 | self.service.stop_hbase(options_dict.get("hadoop_user", "hadoop")) 495 | 496 | def get_config_files(self, argv, options_dict): 497 | """ 498 | Gets the given config files from the name node and writes them 499 | to the local directory. 500 | """ 501 | 502 | opt, args = self.parse_options(self._command_name, argv, expected_arguments=["FILE*"], unbounded_args=True) 503 | opt.update(options_dict) 504 | 505 | self.service.get_config_files(args, options_dict) 506 | 507 | def send_config_files(self, argv, options_dict): 508 | """ 509 | Sends the given config file to each node in the cluster, overwriting 510 | the file located in hadoop/conf directory. 511 | """ 512 | 513 | opt, args = self.parse_options(self._command_name, argv, expected_arguments=["FILE*"], unbounded_args=True) 514 | opt.update(options_dict) 515 | 516 | self.service.send_config_files(args, options_dict) 517 | 518 | def get_hbase_config_files(self, argv, options_dict): 519 | """ 520 | Gets the given config files from the hbase master node and 521 | writes them to the local directory. 522 | """ 523 | 524 | opt, args = self.parse_options(self._command_name, argv, expected_arguments=["FILE*"], unbounded_args=True) 525 | opt.update(options_dict) 526 | 527 | self.service.get_hbase_config_files(args, options_dict) 528 | 529 | def send_hbase_config_files(self, argv, options_dict): 530 | """ 531 | Sends the given config file to each node in the cluster, overwriting 532 | the file located in hadoop/conf directory. 
533 | """ 534 | 535 | opt, args = self.parse_options(self._command_name, argv, expected_arguments=["FILE*"], unbounded_args=True) 536 | opt.update(options_dict) 537 | 538 | self.service.send_hbase_config_files(args, options_dict) 539 | 540 | def terminate_dead_nodes(self, argv, options_dict): 541 | """Find and terminate dead nodes in CLUSTER.""" 542 | 543 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS) 544 | opt.update(options_dict) 545 | 546 | print("Looking for dead nodes in %s" % self._cluster_name) 547 | dead_nodes = self.service.find_dead_nodes(self._cluster_name, opt) 548 | if not dead_nodes: 549 | print("No dead nodes found") 550 | return 551 | 552 | print ("Found %s dead nodes" % len(dead_nodes)) 553 | self.service.terminate_nodes(dead_nodes, opt) 554 | 555 | def create_storage(self, argv, options_dict): 556 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS, 557 | ["ROLE", "NUM_INSTANCES", "SPEC_FILE"]) 558 | 559 | opt.update(options_dict) 560 | 561 | role = args[0] 562 | number_of_instances = int(args[1]) 563 | spec_file = args[2] 564 | 565 | valid_roles = (self.service.NAMENODE, self.service.DATANODE) 566 | if role not in valid_roles: 567 | raise RuntimeError("Role must be one of '%s' or '%s'" % valid_roles) 568 | 569 | self.service.create_storage(role, 570 | number_of_instances, 571 | opt.get('availability_zone'), 572 | spec_file) 573 | self.print_storage() 574 | 575 | def proxy(self, argv, options_dict): 576 | instances = self.service.get_instances() 577 | if not instances: 578 | "No running instances. Aborting." 579 | sys.exit(1) 580 | 581 | result = self.service.proxy(ssh_options=options_dict.get('ssh_options'), 582 | instances=instances) 583 | 584 | if result is None: 585 | print "Unable to create proxy. Check logs for more information." 586 | sys.exit(1) 587 | 588 | print "Proxy created..." 
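# Hypothetical session (cluster name and PID are illustrative only): the export line
# printed below can be pasted into the calling shell so the SOCKS proxy can later be
# cleaned up with `kill $HADOOP_CLOUD_PROXY_PID`.
#   $ stratus exec MY_HADOOP_CLUSTER proxy
#   Proxy created...
#   export HADOOP_CLOUD_PROXY_PID=12345;
#   echo Proxy pid 12345;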
589 | print """export HADOOP_CLOUD_PROXY_PID=%s; 590 | echo Proxy pid %s;""" % (result, result) 591 | -------------------------------------------------------------------------------- /plugins/hadoop/service.plugin: -------------------------------------------------------------------------------- 1 | [Core] 2 | Name = hadoop 3 | Module = service 4 | 5 | [Documentation] 6 | Author = Abe Music 7 | Website = http://github.com/digitalreasoning/PyStratus 8 | Description = A Hadoop service implementation for PyStratus 9 | -------------------------------------------------------------------------------- /plugins/hadoop_cassandra_hybrid/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/digitalreasoning/PyStratus/c7e25c9e7dcc5a98f8d317c0f9f0985fbf79ca59/plugins/hadoop_cassandra_hybrid/__init__.py -------------------------------------------------------------------------------- /plugins/hadoop_cassandra_hybrid/cli.plugin: -------------------------------------------------------------------------------- 1 | [Core] 2 | Name = hadoop_cassandra_hybrid 3 | Module = cli 4 | 5 | [Documentation] 6 | Author = Abe Music 7 | Website = http://github.com/digitalreasoning/PyStratus 8 | Description = A hybrid Hadoop/Cassandra CLI implementation for PyStratus 9 | 10 | -------------------------------------------------------------------------------- /plugins/hadoop_cassandra_hybrid/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | import urllib 4 | 5 | from cloud.plugin import CLIPlugin 6 | from cloud.plugin import BASIC_OPTIONS 7 | from cloud.service import InstanceTemplate 8 | from optparse import make_option 9 | from prettytable import PrettyTable 10 | 11 | class HadoopCassandraServiceCLI(CLIPlugin): 12 | USAGE = """Hadoop service usage: CLUSTER COMMAND [OPTIONS] 13 | where COMMAND and [OPTIONS] may be one of: 14 | 15 | HADOOP COMMANDS 16 | ---------------------------------------------------------------------------------- 17 | launch-master launch or find a master in CLUSTER 18 | launch-slaves NUM_SLAVES launch NUM_SLAVES slaves in CLUSTER 19 | 20 | CASSANDRA COMMANDS 21 | ---------------------------------------------------------------------------------- 22 | start-cassandra starts the cassandra service on all nodes 23 | stop-cassandra stops the cassandra service on all nodes 24 | print-ring displays the cluster's ring information 25 | 26 | CLUSTER COMMANDS 27 | ---------------------------------------------------------------------------------- 28 | details list instances in CLUSTER 29 | launch-cluster NUM_SLAVES launch a master and NUM_SLAVES slaves in 30 | CLUSTER 31 | terminate-cluster terminate all instances in CLUSTER 32 | login log in to the master in CLUSTER over SSH 33 | proxy start a SOCKS proxy on localhost into the 34 | CLUSTER 35 | 36 | STORAGE COMMANDS 37 | ---------------------------------------------------------------------------------- 38 | list-storage list storage volumes for CLUSTER 39 | create-storage ROLE NUM_INSTANCES create volumes for NUM_INSTANCES instances of 40 | SPEC_FILE type ROLE for CLUSTER, using SPEC_FILE 41 | delete-storage delete all storage volumes for CLUSTER 42 | """ 43 | 44 | def __init__(self): 45 | super(HadoopCassandraServiceCLI, self).__init__() 46 | 47 | def execute_command(self, argv, options_dict): 48 | if len(argv) < 2: 49 | self.print_help() 50 | 51 | self._cluster_name = argv[0] 52 | self._command_name = argv[1] 53 | 54 
| # strip off the cluster name and command from argv 55 | argv = argv[2:] 56 | 57 | # handle all known commands and error on an unknown command 58 | if self._command_name == "details": 59 | self.print_instances() 60 | 61 | elif self._command_name == "simple-details": 62 | self.simple_print_instances(argv, options_dict) 63 | 64 | elif self._command_name == "proxy": 65 | self.proxy(argv, options_dict) 66 | 67 | elif self._command_name == "terminate-cluster": 68 | self.terminate_cluster(argv, options_dict) 69 | 70 | elif self._command_name == "launch-cluster": 71 | self.launch_cluster(argv, options_dict) 72 | 73 | elif self._command_name == "login": 74 | self.login(argv, options_dict) 75 | 76 | elif self._command_name == "run-command": 77 | self.run_command(argv, options_dict) 78 | 79 | elif self._command_name == "transfer-files": 80 | self.transfer_files(argv, options_dict) 81 | 82 | elif self._command_name == "create-storage": 83 | self.create_storage(argv, options_dict) 84 | 85 | elif self._command_name == "delete-storage": 86 | self.delete_storage(argv, options_dict) 87 | 88 | elif self._command_name == "list-storage": 89 | self.print_storage() 90 | 91 | elif self._command_name == "stop-cassandra": 92 | self.stop_cassandra(argv, options_dict) 93 | 94 | elif self._command_name == "start-cassandra": 95 | self.start_cassandra(argv, options_dict) 96 | 97 | elif self._command_name == "print-ring": 98 | self.print_ring(argv, options_dict) 99 | 100 | else: 101 | self.print_help() 102 | 103 | def launch_cluster(self, argv, options_dict): 104 | """ 105 | """ 106 | 107 | expected_arguments = ["NUM_SLAVES"] 108 | opt, args = self.parse_options(self._command_name, 109 | argv, 110 | expected_arguments=expected_arguments) 111 | opt.update(options_dict) 112 | 113 | # check for the cassandra-specific files 114 | if opt.get('cassandra_config_file') is None: 115 | print "ERROR: No cassandra_config_file configured. Aborting." 116 | sys.exit(1) 117 | 118 | if opt.get('keyspace_definitions_file') is None: 119 | print "WARNING: No keyspace_definitions_file configured. You can ignore this for Cassandra v0.6.x" 120 | 121 | # test files 122 | for key in ['cassandra_config_file', 'keyspace_definitions_file']: 123 | if opt.get(key) is not None: 124 | try: 125 | url = urllib.urlopen(opt.get(key)) 126 | data = url.read() 127 | except: 128 | raise 129 | print "The file defined by %s (%s) does not exist. Aborting." 
% (key, opt.get(key)) 130 | sys.exit(1) 131 | 132 | number_of_slaves = int(args[0]) 133 | instance_templates = [ 134 | InstanceTemplate( 135 | ( 136 | self.service.NAMENODE, 137 | self.service.SECONDARY_NAMENODE, 138 | self.service.JOBTRACKER, 139 | self.service.HADOOP_CASSANDRA_NODE, 140 | ), 141 | 1, 142 | opt.get('image_id'), 143 | opt.get('instance_type'), 144 | opt.get('key_name'), 145 | opt.get('public_key'), 146 | opt.get('user_data_file'), 147 | opt.get('availability_zone'), 148 | opt.get('user_packages'), 149 | opt.get('auto_shutdown'), 150 | opt.get('env'), 151 | opt.get('security_groups')), 152 | InstanceTemplate( 153 | ( 154 | self.service.DATANODE, 155 | self.service.TASKTRACKER, 156 | self.service.CASSANDRA_NODE, 157 | ), 158 | number_of_slaves, 159 | opt.get('image_id'), 160 | opt.get('instance_type'), 161 | opt.get('key_name'), 162 | opt.get('public_key'), 163 | opt.get('user_data_file'), 164 | opt.get('availability_zone'), 165 | opt.get('user_packages'), 166 | opt.get('auto_shutdown'), 167 | opt.get('env'), 168 | opt.get('security_groups')) 169 | ] 170 | 171 | for it in instance_templates: 172 | it.add_env_strings([ 173 | "CLUSTER_SIZE=%d" % (number_of_slaves+1) 174 | ]) 175 | 176 | print "Launching cluster with %d instance(s)...please wait." % (number_of_slaves+1) 177 | jobtracker = self.service.launch_cluster(instance_templates, 178 | opt.get('client_cidr'), 179 | opt.get('config_dir'), 180 | opt.get('ssh_options'), 181 | opt.get('cassandra_config_file'), 182 | opt.get('keyspace_definitions_file')) 183 | 184 | if jobtracker is None: 185 | print "An error occurred started the Hadoop service. Check the logs for more information." 186 | sys.exit(1) 187 | 188 | print "Browse the cluster at http://%s/" % jobtracker.public_dns_name 189 | self.logger.debug("Startup complete.") 190 | 191 | def create_storage(self, argv, options_dict): 192 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS, 193 | ["ROLE", "NUM_INSTANCES", "SPEC_FILE"]) 194 | 195 | opt.update(options_dict) 196 | 197 | role = args[0] 198 | number_of_instances = int(args[1]) 199 | spec_file = args[2] 200 | 201 | valid_roles = (self.service.NAMENODE, self.service.DATANODE, self.service.CASSANDRA_NODE) 202 | if role not in valid_roles: 203 | raise RuntimeError("Role must be one of %s" % str(valid_roles)) 204 | 205 | self.service.create_storage(role, 206 | number_of_instances, 207 | opt.get('availability_zone'), 208 | spec_file) 209 | self.print_storage() 210 | 211 | def proxy(self, argv, options_dict): 212 | instances = self.service.get_instances() 213 | if not instances: 214 | "No running instances. Aborting." 215 | sys.exit(1) 216 | 217 | result = self.service.proxy(ssh_options=options_dict.get('ssh_options'), 218 | instances=instances) 219 | 220 | if result is None: 221 | print "Unable to create proxy. Check logs for more information." 222 | sys.exit(1) 223 | 224 | print "Proxy created..." 225 | print """export HADOOP_CLOUD_PROXY_PID=%s; 226 | echo Proxy pid %s;""" % (result, result) 227 | 228 | def stop_cassandra(self, argv, options_dict): 229 | instances = self.service.cluster.get_instances_in_role(self.service.DATANODE, "running") 230 | if not instances: 231 | print "No running instances. Aborting." 232 | sys.exit(1) 233 | 234 | print "Stopping Cassandra service on %d instance(s)...please wait." 
% len(instances) 235 | self.service.stop_cassandra(options_dict.get('ssh_options'), instances=instances) 236 | 237 | def start_cassandra(self, argv, options_dict): 238 | instances = self.service.cluster.get_instances_in_role(self.service.DATANODE, "running") 239 | if not instances: 240 | print "No running instances. Aborting." 241 | sys.exit(1) 242 | 243 | print "Starting Cassandra service on %d instance(s)...please wait." % len(instances) 244 | self.service.start_cassandra(options_dict.get('ssh_options'), instances=instances) 245 | 246 | def print_ring(self, argv, options_dict): 247 | instances = self.service.cluster.get_instances_in_role(self.service.DATANODE, "running") 248 | if not instances: 249 | print "No running instances. Aborting." 250 | sys.exit(1) 251 | 252 | self.service.print_ring(options_dict.get('ssh_options'), instances[0]) 253 | -------------------------------------------------------------------------------- /plugins/hadoop_cassandra_hybrid/service.plugin: -------------------------------------------------------------------------------- 1 | [Core] 2 | Name = hadoop_cassandra_hybrid 3 | Module = service 4 | 5 | [Documentation] 6 | Author = Abe Music 7 | Website = http://github.com/digitalreasoning/PyStratus 8 | Description = A hybrid Hadoop/Cassandra service implementation for PyStratus 9 | -------------------------------------------------------------------------------- /plugins/hadoop_cassandra_hybrid/service.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | 3 | import os 4 | import sys 5 | import time 6 | import subprocess 7 | import urllib 8 | import tempfile 9 | import socket 10 | import re 11 | 12 | from cloud.cluster import TimeoutException 13 | from cloud.service import InstanceTemplate 14 | from cloud.plugin import ServicePlugin 15 | from cloud.util import xstr 16 | from cloud.util import url_get 17 | 18 | from yaml import load as parse_yaml 19 | from yaml import dump as dump_yaml 20 | 21 | try: 22 | from cElementTree import parse as parse_xml 23 | from cElementTree import tostring as dump_xml 24 | from cElementTree import Element 25 | except: 26 | try: 27 | from xml.etree.cElementTree import parse as parse_xml 28 | from xml.etree.cElementTree import tostring as dump_xml 29 | from xml.etree.cElementTree import Element 30 | except: 31 | print "*"*80 32 | print "WARNING: cElementTree module does not exist. Defaulting to elementtree instead." 33 | print "It's recommended that you install the cElementTree module for faster XML parsing." 
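# The plain-elementtree fallback below also needs tostring, since dump_xml is used in
# _modify_config_file; this assumes the standalone elementtree package exports it
# under the same name as cElementTree:
from elementtree.ElementTree import tostring as dump_xml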
34 | print "*"*80 35 | from elementtree.ElementTree import parse as parse_xml 36 | from elementtree.ElementTree import parse as parse_xml 37 | from elementtree.ElementTree import Element 38 | 39 | class HadoopCassandraService(ServicePlugin): 40 | """ 41 | """ 42 | NAMENODE = "hybrid_nn" 43 | SECONDARY_NAMENODE = "hybrid_snn" 44 | JOBTRACKER = "hybrid_jt" 45 | DATANODE = "hybrid_dn" 46 | TASKTRACKER = "hybrid_tt" 47 | CASSANDRA_NODE = "hybrid_cn" 48 | HADOOP_CASSANDRA_NODE = "hcn" 49 | 50 | def __init__(self): 51 | super(HadoopCassandraService, self).__init__() 52 | 53 | def get_roles(self): 54 | return [self.NAMENODE] 55 | 56 | def get_instances(self): 57 | """ 58 | Return a list of tuples resembling (role_of_instance, instance) 59 | """ 60 | return self.cluster.get_instances_in_role(self.NAMENODE, "running") + \ 61 | self.cluster.get_instances_in_role(self.DATANODE, "running") 62 | 63 | def launch_cluster(self, instance_templates, client_cidr, config_dir, 64 | ssh_options, cassandra_config_file, 65 | cassandra_keyspace_file=None): 66 | 67 | number_of_tasktrackers = 0 68 | roles = [] 69 | for it in instance_templates: 70 | roles.extend(it.roles) 71 | if self.TASKTRACKER in it.roles: 72 | number_of_tasktrackers += it.number 73 | 74 | singleton_hosts = [] 75 | started_instance_ids = [] 76 | expected_instance_count = sum([it.number for it in instance_templates]) 77 | 78 | for instance_template in instance_templates: 79 | self.logger.debug("Launching %d instance(s) with role(s) %s..." % ( 80 | instance_template.number, 81 | str(instance_template.roles), 82 | )) 83 | self.logger.debug("Instance(s) will have extra environment variables: %s" % ( 84 | singleton_hosts, 85 | )) 86 | instance_template.add_env_strings(singleton_hosts) 87 | instance_ids = self._launch_instances(instance_template) 88 | 89 | if instance_template.number == 1: 90 | if len(instance_ids) != 1: 91 | logger.error("Expected a single '%s' instance, but found %s.", 92 | "".join(instance_template.roles), 93 | len(instance_ids)) 94 | return False 95 | else: 96 | # wait for the instances to start 97 | self.cluster.wait_for_instances(instance_ids) 98 | instance = self.get_instances()[0] 99 | 100 | for role in instance_template.roles: 101 | singleton_host_env = "%s_HOST=%s" % ( 102 | self._sanitize_role_name(role), 103 | instance.public_dns_name, 104 | ) 105 | singleton_hosts.append(singleton_host_env) 106 | 107 | started_instance_ids.extend(instance_ids) 108 | 109 | if len(started_instance_ids) != expected_instance_count: 110 | self.logger.warn("Total number of reported instance ids (%d) " \ 111 | "does not match total requested number (%d)" % \ 112 | (len(started_instance_ids), instance_template.number)) 113 | 114 | self.logger.debug("Waiting for %s instance(s) to start: %s" % \ 115 | (len(started_instance_ids), ", ".join(started_instance_ids))) 116 | time.sleep(1) 117 | 118 | try: 119 | self.cluster.wait_for_instances(started_instance_ids) 120 | except TimeoutException: 121 | self.logger.error("Timeout while waiting for %d instances to start." % \ 122 | len(started_instance_ids)) 123 | 124 | instances = self.get_instances() 125 | 126 | self.logger.debug("Instances started: %s" % (str(instances),)) 127 | 128 | self._create_client_hadoop_site_file(config_dir) 129 | self._authorize_client_ports(client_cidr) 130 | self._attach_storage(roles) 131 | try: 132 | self._wait_for_hadoop(number_of_tasktrackers) 133 | except TimeoutException: 134 | print "Timeout while waiting for Hadoop to start. Please check logs on" + \ 135 | " cluster." 
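# The Hadoop side of the hybrid cluster is now up (or the wait above timed out); the
# steps below rewrite the supplied Cassandra config per datanode (seed list from the
# first two instances, evenly spaced initial tokens) and then start Cassandra on them.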
136 | 137 | # cassandra specific instances and setup 138 | cassandra_instances = self.cluster.get_instances_in_role(self.DATANODE, "running") 139 | self._transfer_config_files(ssh_options, 140 | cassandra_config_file, 141 | cassandra_keyspace_file, 142 | instances=cassandra_instances) 143 | self.start_cassandra(ssh_options, 144 | create_keyspaces=(cassandra_keyspace_file is not None), 145 | instances=cassandra_instances) 146 | 147 | return self._get_jobtracker() 148 | 149 | def _sanitize_role_name(self, role): 150 | """ 151 | Replace characters in role name with ones allowed in bash variable names 152 | """ 153 | return role.replace('+', '_').upper() 154 | 155 | 156 | def _get_namenode(self): 157 | instances = self.cluster.get_instances_in_role(self.NAMENODE, "running") 158 | if not instances: 159 | return None 160 | return instances[0] 161 | 162 | def _get_jobtracker(self): 163 | instances = self.cluster.get_instances_in_role(self.JOBTRACKER, "running") 164 | if not instances: 165 | return None 166 | return instances[0] 167 | 168 | def _create_client_hadoop_site_file(self, config_dir): 169 | namenode = self._get_namenode() 170 | jobtracker = self._get_jobtracker() 171 | cluster_dir = os.path.join(config_dir, ".hadoop", self.cluster.name) 172 | aws_access_key_id = os.environ['AWS_ACCESS_KEY_ID'] 173 | aws_secret_access_key = os.environ['AWS_SECRET_ACCESS_KEY'] 174 | 175 | if not os.path.exists(cluster_dir): 176 | os.makedirs(cluster_dir) 177 | 178 | params = { 179 | 'namenode': self._get_namenode().public_dns_name, 180 | 'jobtracker': self._get_jobtracker().public_dns_name, 181 | 'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'], 182 | 'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'] 183 | } 184 | self.logger.debug("hadoop-site.xml params: %s" % str(params)) 185 | 186 | with open(os.path.join(cluster_dir, 'hadoop-site.xml'), 'w') as f: 187 | f.write(""" 188 | 189 | 190 | 191 | 192 | hadoop.job.ugi 193 | root,root 194 | 195 | 196 | fs.default.name 197 | hdfs://%(namenode)s:8020/ 198 | 199 | 200 | mapred.job.tracker 201 | %(jobtracker)s:8021 202 | 203 | 204 | hadoop.socks.server 205 | localhost:6666 206 | 207 | 208 | hadoop.rpc.socket.factory.class.default 209 | org.apache.hadoop.net.SocksSocketFactory 210 | 211 | 212 | fs.s3.awsAccessKeyId 213 | %(aws_access_key_id)s 214 | 215 | 216 | fs.s3.awsSecretAccessKey 217 | %(aws_secret_access_key)s 218 | 219 | 220 | fs.s3n.awsAccessKeyId 221 | %(aws_access_key_id)s 222 | 223 | 224 | fs.s3n.awsSecretAccessKey 225 | %(aws_secret_access_key)s 226 | 227 | """ % params) 228 | 229 | def _authorize_client_ports(self, client_cidrs=[]): 230 | if not client_cidrs: 231 | self.logger.debug("No client CIDRs specified, using local address.") 232 | client_ip = url_get('http://checkip.amazonaws.com/').strip() 233 | client_cidrs = ("%s/32" % client_ip,) 234 | self.logger.debug("Client CIDRs: %s", client_cidrs) 235 | 236 | namenode = self._get_namenode() 237 | jobtracker = self._get_jobtracker() 238 | 239 | for client_cidr in client_cidrs: 240 | # Allow access to port 80 on namenode from client 241 | self.cluster.authorize_role(self.NAMENODE, 80, 80, client_cidr) 242 | 243 | # Allow access to jobtracker UI on master from client 244 | # (so we can see when the cluster is ready) 245 | self.cluster.authorize_role(self.JOBTRACKER, 50030, 50030, client_cidr) 246 | 247 | # Allow access to namenode and jobtracker via public address from each other 248 | namenode_ip = socket.gethostbyname(namenode.public_dns_name) 249 | jobtracker_ip = 
socket.gethostbyname(jobtracker.public_dns_name) 250 | self.cluster.authorize_role(self.NAMENODE, 8020, 8020, "%s/32" % namenode_ip) 251 | self.cluster.authorize_role(self.NAMENODE, 8020, 8020, "%s/32" % jobtracker_ip) 252 | self.cluster.authorize_role(self.JOBTRACKER, 8021, 8021, "%s/32" % namenode_ip) 253 | self.cluster.authorize_role(self.JOBTRACKER, 8021, 8021, 254 | "%s/32" % jobtracker_ip) 255 | 256 | def _wait_for_hadoop(self, number, timeout=600): 257 | wait_time = 3 258 | start_time = time.time() 259 | jobtracker = self._get_jobtracker() 260 | if not jobtracker: 261 | return 262 | 263 | self.logger.debug("Waiting for jobtracker to start...") 264 | previous_running = 0 265 | while True: 266 | if (time.time() - start_time >= timeout): 267 | raise TimeoutException() 268 | try: 269 | actual_running = self._number_of_tasktrackers(jobtracker.public_dns_name, 1) 270 | break 271 | except IOError: 272 | pass 273 | self.logger.debug("Sleeping for %d seconds..." % wait_time) 274 | time.sleep(wait_time) 275 | if number > 0: 276 | self.logger.debug("Waiting for %d tasktrackers to start" % number) 277 | while actual_running < number: 278 | if (time.time() - start_time >= timeout): 279 | raise TimeoutException() 280 | try: 281 | actual_running = self._number_of_tasktrackers(jobtracker.public_dns_name, 5, 2) 282 | self.logger.debug("Sleeping for %d seconds..." % wait_time) 283 | time.sleep(wait_time) 284 | previous_running = actual_running 285 | except IOError: 286 | pass 287 | 288 | # The optional ?type=active is a difference between Hadoop 0.18 and 0.20 289 | _NUMBER_OF_TASK_TRACKERS = re.compile(r'(\d+)') 290 | 291 | def _number_of_tasktrackers(self, jt_hostname, timeout, retries=0): 292 | url = "http://%s:50030/jobtracker.jsp" % jt_hostname 293 | jt_page = url_get(url, timeout, retries) 294 | m = self._NUMBER_OF_TASK_TRACKERS.search(jt_page) 295 | if m: 296 | return int(m.group(1)) 297 | return 0 298 | 299 | def proxy(self, ssh_options, instances=None): 300 | if instances is None: 301 | return None 302 | 303 | namenode = self._get_namenode() 304 | if namenode is None: 305 | self.logger.error("No namenode running. Aborting.") 306 | return None 307 | 308 | options = '-o "ConnectTimeout 10" -o "ServerAliveInterval 60" ' \ 309 | '-N -D 6666' 310 | process = subprocess.Popen('ssh %s %s %s' % ( 311 | xstr(ssh_options), 312 | options, 313 | namenode.public_dns_name 314 | ), 315 | stdin=subprocess.PIPE, 316 | stdout=subprocess.PIPE, 317 | stderr=subprocess.PIPE, 318 | shell=True) 319 | 320 | return process.pid 321 | 322 | def _wait_for_cassandra_install(self, instance, ssh_options): 323 | """ 324 | Simply wait for the cassandra directory to be available so that we can begin configuring 325 | the service before starting it 326 | """ 327 | wait_time = 3 328 | command = "ls /usr/local/apache-cassandra" 329 | ssh_command = self._get_standard_ssh_command(instance, ssh_options, command) 330 | self.logger.debug(ssh_command) 331 | timeout = 600 332 | 333 | start_time = time.time() 334 | while True: 335 | if (time.time() - start_time >= timeout): 336 | raise TimeoutException() 337 | retcode = subprocess.call(ssh_command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) 338 | if retcode == 0: 339 | break 340 | self.logger.debug("Sleeping for %d seconds..." 
% wait_time) 341 | time.sleep(wait_time) 342 | 343 | def _transfer_config_files(self, ssh_options, config_file, keyspace_file=None, 344 | instances=None): 345 | """ 346 | """ 347 | if instances is None: 348 | instances = self.get_instances() 349 | 350 | self.logger.debug("Waiting for %d Cassandra instance(s) to install..." % len(instances)) 351 | for instance in instances: 352 | self._wait_for_cassandra_install(instance, ssh_options) 353 | 354 | self.logger.debug("Copying configuration files to %d Cassandra instances..." % len(instances)) 355 | 356 | seed_ips = [str(instance.private_dns_name) for instance in instances[:2]] 357 | tokens = self._get_evenly_spaced_tokens_for_n_instances(len(instances)) 358 | 359 | # for each instance, generate a config file from the original file and upload it to 360 | # the cluster node 361 | for i in range(len(instances)): 362 | local_file, remote_file = self._modify_config_file(instances[i], config_file, seed_ips, str(tokens[i])) 363 | 364 | # Upload modified config file 365 | scp_command = 'scp %s -r %s %s:/usr/local/apache-cassandra/conf/%s' % (xstr(ssh_options), 366 | local_file, instances[i].public_dns_name, remote_file) 367 | subprocess.call(scp_command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) 368 | 369 | # delete temporary file 370 | os.unlink(local_file) 371 | 372 | if keyspace_file: 373 | keyspace_data = urllib.urlopen(keyspace_file).read() 374 | fd, temp_keyspace_file = tempfile.mkstemp(prefix="keyspaces.txt_", text=True) 375 | os.write(fd, keyspace_data) 376 | os.close(fd) 377 | 378 | self.logger.debug("Copying keyspace definition file to first Cassandra instance...") 379 | 380 | # Upload keyspace definitions file 381 | scp_command = 'scp %s -r %s %s:/usr/local/apache-cassandra/conf/keyspaces.txt' % \ 382 | (xstr(ssh_options), temp_keyspace_file, instances[0].public_dns_name) 383 | subprocess.call(scp_command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) 384 | 385 | # remove temporary file 386 | os.unlink(temp_keyspace_file) 387 | 388 | def _modify_config_file(self, instance, config_file, seed_ips, token): 389 | # XML (0.6.x) 390 | if config_file.endswith(".xml"): 391 | remote_file = "storage-conf.xml" 392 | 393 | xml = parse_xml(urllib.urlopen(config_file)).getroot() 394 | 395 | # Seeds 396 | seeds = xml.find("Seeds") 397 | if seeds is not None: 398 | while seeds.getchildren(): 399 | seeds.remove(seeds.getchildren()[0]) 400 | else: 401 | seeds = Element("Seeds") 402 | xml.append(seeds) 403 | 404 | for seed_ip in seed_ips: 405 | seed = Element("Seed") 406 | seed.text = seed_ip 407 | seeds.append(seed) 408 | 409 | # Initial token 410 | initial_token = xml.find("InitialToken") 411 | if initial_token is None: 412 | initial_token = Element("InitialToken") 413 | xml.append(initial_token) 414 | initial_token.text = token 415 | 416 | # Logs 417 | commit_log_directory = xml.find("CommitLogDirectory") 418 | if commit_log_directory is None: 419 | commit_log_directory = Element("CommitLogDirectory") 420 | xml.append(commit_log_directory) 421 | commit_log_directory.text = "/mnt/cassandra-logs" 422 | 423 | # Data 424 | data_file_directories = xml.find("DataFileDirectories") 425 | if data_file_directories is not None: 426 | while data_file_directories.getchildren(): 427 | data_file_directories.remove(data_file_directories.getchildren()[0]) 428 | else: 429 | data_file_directories = Element("DataFileDirectories") 430 | xml.append(data_file_directories) 431 | data_file_directory = Element("DataFileDirectory") 432 | 
data_file_directory.text = "/mnt/cassandra-data" 433 | data_file_directories.append(data_file_directory) 434 | 435 | 436 | # listen address 437 | listen_address = xml.find("ListenAddress") 438 | if listen_address is None: 439 | listen_address = Element("ListenAddress") 440 | xml.append(listen_address) 441 | listen_address.text = "" 442 | 443 | # thrift address 444 | thrift_address = xml.find("ThriftAddress") 445 | if thrift_address is None: 446 | thrift_address = Element("ThriftAddress") 447 | xml.append(thrift_address) 448 | thrift_address.text = "" 449 | 450 | fd, temp_file = tempfile.mkstemp(prefix='storage-conf.xml_', text=True) 451 | os.write(fd, dump_xml(xml)) 452 | os.close(fd) 453 | 454 | # YAML (0.7.x) 455 | elif config_file.endswith(".yaml"): 456 | remote_file = "cassandra.yaml" 457 | 458 | yaml = parse_yaml(urllib.urlopen(config_file)) 459 | yaml['seeds'] = seed_ips 460 | yaml['initial_token'] = token 461 | yaml['data_file_directories'] = ['/mnt/cassandra-data'] 462 | yaml['commitlog_directory'] = '/mnt/cassandra-logs' 463 | yaml['listen_address'] = str(instance.private_dns_name) 464 | yaml['rpc_address'] = str(instance.public_dns_name) 465 | 466 | fd, temp_file = tempfile.mkstemp(prefix='cassandra.yaml_', text=True) 467 | os.write(fd, dump_yaml(yaml)) 468 | os.close(fd) 469 | else: 470 | raise Exception("Configuration file must be one of xml or yaml") 471 | 472 | return temp_file, remote_file 473 | 474 | def _get_evenly_spaced_tokens_for_n_instances(self, n): 475 | return [i*(2**127/n) for i in range(1,n+1)] 476 | 477 | def _create_keyspaces_from_definitions_file(self, instance, ssh_options): 478 | # TODO: Keyspaces could already exist...how do I check this? 479 | # TODO: Can it be an arbitrary node? 480 | 481 | self.logger.debug("Creating keyspaces using Thrift API via keyspaces_definitions_file...") 482 | 483 | # test for the keyspace file first 484 | command = "ls /usr/local/apache-cassandra/conf/keyspaces.txt" 485 | ssh_command = self._get_standard_ssh_command(instance, ssh_options, command) 486 | retcode = subprocess.call(ssh_command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) 487 | 488 | if retcode != 0: 489 | self.logger.warn("Unable to find /usr/local/apache-cassandra/conf/keyspaces.txt. Skipping keyspace generation.") 490 | return 491 | else: 492 | self.logger.debug("Found keyspaces.txt...Proceeding with keyspace generation.") 493 | 494 | command = "/usr/local/apache-cassandra/bin/cassandra-cli --host %s --batch " \ 495 | "< /usr/local/apache-cassandra/conf/keyspaces.txt" % instance.private_dns_name 496 | ssh_command = self._get_standard_ssh_command(instance, ssh_options, command) 497 | retcode = subprocess.call(ssh_command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) 498 | 499 | # TODO: do this or not? 500 | # remove keyspace file 501 | #command = "rm -rf /usr/local/apache-cassandra/conf/keyspaces.txt" 502 | #ssh_command = self._get_standard_ssh_command(instance, ssh_options, command) 503 | #subprocess.call(ssh_command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) 504 | 505 | def print_ring(self, ssh_options, instance=None): 506 | if instance is None: 507 | instance = self.get_instances()[0] 508 | 509 | print "\nRing configuration..." 510 | print "NOTE: May not be accurate if the cluster just started." 
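# The ring printed here should show the tokens assigned by
# _get_evenly_spaced_tokens_for_n_instances above, i.e. i * (2**127 / n) for i = 1..n.
# For example, a 4-node cluster is assigned:
#   42535295865117307932921825928971026432
#   85070591730234615865843651857942052864
#   127605887595351923798765477786913079296
#   170141183460469231731687303715884105728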
511 | command = "/usr/local/apache-cassandra/bin/nodetool -h localhost ring" 512 | ssh_command = self._get_standard_ssh_command(instance, ssh_options, command) 513 | subprocess.call(ssh_command, shell=True) 514 | 515 | def start_cassandra(self, ssh_options, create_keyspaces=False, instances=None): 516 | if instances is None: 517 | instances = self.get_instances() 518 | 519 | self.logger.debug("Starting Cassandra service on %d instance(s)..." % len(instances)) 520 | 521 | for instance in instances: 522 | # if checks to see if cassandra is already running 523 | command = "if [ ! -f /root/cassandra.pid ]; then `nohup /usr/local/apache-cassandra/bin/cassandra -p /root/cassandra.pid &> /root/cassandra.out &`; fi" 524 | ssh_command = self._get_standard_ssh_command(instance, ssh_options, command) 525 | retcode = subprocess.call(ssh_command, shell=True) 526 | 527 | if retcode != 0: 528 | self.logger.warn("Return code for starting Cassandra: %d" % retcode) 529 | 530 | # test connection 531 | self.logger.debug("Testing connection to each Cassandra instance...") 532 | 533 | timeout = 600 534 | temp_instances = instances[:] 535 | start_time = time.time() 536 | while len(temp_instances) > 0: 537 | if (time.time() - start_time >= timeout): 538 | raise TimeoutException() 539 | 540 | command = "/usr/local/apache-cassandra/bin/nodetool -h %s ring" % temp_instances[-1].private_dns_name 541 | ssh_command = self._get_standard_ssh_command(temp_instances[-1], ssh_options, command) 542 | retcode = subprocess.call(ssh_command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) 543 | 544 | if retcode == 0: 545 | temp_instances.pop() 546 | else: 547 | self.logger.warn("Return code for 'nodetool ring' on '%s': %d" % (temp_instances[-1].id, retcode)) 548 | 549 | if create_keyspaces: 550 | self._create_keyspaces_from_definitions_file(instances[0], ssh_options) 551 | else: 552 | self.logger.debug("create_keyspaces is False. Skipping keyspace generation.") 553 | 554 | # TODO: Do I need to wait for the keyspaces to propagate before printing the ring? 
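# Reaching this point means every instance answered `nodetool ring` within the
# timeout, so the cluster is reachable; if a keyspace definitions file was supplied,
# the keyspaces were just loaded on the first instance via cassandra-cli --batch.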
555 | # print ring after everything started 556 | self.print_ring(ssh_options, instances[0]) 557 | 558 | self.logger.debug("Startup complete.") 559 | 560 | def stop_cassandra(self, ssh_options, instances=None): 561 | if instances is None: 562 | instances = self.get_instances() 563 | 564 | for instance in instances: 565 | command = "kill `cat /root/cassandra.pid`" 566 | ssh_command = self._get_standard_ssh_command(instance, ssh_options, command) 567 | retcode = subprocess.call(ssh_command, shell=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) 568 | 569 | def login(self, instance, ssh_options): 570 | ssh_command = self._get_standard_ssh_command(instance, ssh_options) 571 | subprocess.call(ssh_command, shell=True) 572 | -------------------------------------------------------------------------------- /plugins/simple/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jbunting' 2 | -------------------------------------------------------------------------------- /plugins/simple/cli.plugin: -------------------------------------------------------------------------------- 1 | [Core] 2 | Name = simple 3 | Module = cli 4 | 5 | [Documentation] 6 | Author = Abe Music 7 | Website = http://github.com/digitalreasoning/PyStratus 8 | Description = A simple CLI implementation for PyStratus 9 | 10 | -------------------------------------------------------------------------------- /plugins/simple/cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | import urllib 5 | 6 | from cloud.plugin import CLIPlugin 7 | from cloud.plugin import BASIC_OPTIONS 8 | from cloud.service import InstanceTemplate 9 | from optparse import make_option 10 | from prettytable import PrettyTable 11 | from pprint import pprint 12 | 13 | # Add options here to override what's in the clusters.cfg file 14 | # TODO 15 | 16 | class SimpleServiceCLI(CLIPlugin): 17 | USAGE = """Simple service usage: CLUSTER COMMAND [OPTIONS] 18 | where COMMAND and [OPTIONS] may be one of: 19 | 20 | APPLICATION COMMANDS 21 | ---------------------------------------------------------------------------------- 22 | launch-load-balancer launch a load balancer for CLUSTER 23 | launch-nodes NUM_NODES launch NUM_NODES nodes in CLUSTER 24 | start-nodes start the nodes 25 | stop-nodes stop the nodes 26 | start-load-balancer start the load balancer 27 | stop-load-balancer stop the load balancer 28 | 29 | CLUSTER COMMANDS 30 | ---------------------------------------------------------------------------------- 31 | details list instances in CLUSTER 32 | launch-cluster NUM_NODES launch NUM_NODES Cassandra nodes 33 | expand-cluster NUM_NODES adds new nodes 34 | terminate-cluster terminate all instances in CLUSTER 35 | login log in to the master in CLUSTER over SSH 36 | 37 | STORAGE COMMANDS 38 | ---------------------------------------------------------------------------------- 39 | list-storage list storage volumes for CLUSTER 40 | create-storage NUM_INSTANCES create volumes for NUM_INSTANCES instances 41 | SPEC_FILE for CLUSTER, using SPEC_FILE 42 | delete-storage delete all storage volumes for CLUSTER 43 | """ 44 | # transfer FILE DESTINATION transfer a file to all nodes 45 | # execute COMMAND execute a command on all nodes 46 | 47 | def __init__(self): 48 | super(SimpleServiceCLI, self).__init__() 49 | 50 | #self._logger = logging.getLogger("CassandraServiceCLI") 51 | 52 | def execute_command(self, argv, options_dict): 53 | 
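# argv is expected to look like [CLUSTER, COMMAND, extra args...]; the first two
# entries are consumed here and the rest is passed through to the individual
# command handlers.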
if len(argv) < 2: 54 | self.print_help() 55 | 56 | self._cluster_name = argv[0] 57 | self._command_name = argv[1] 58 | 59 | # strip off the cluster name and command from argv 60 | argv = argv[2:] 61 | 62 | # get spot configuration 63 | self._spot_config = { 64 | "spot_cluster": True if os.environ.get("SPOT_CLUSTER", options_dict.get("spot_cluster", "false")).lower() == "true" else False, 65 | "max_price": options_dict.get("max_price", None), 66 | "launch_group": options_dict.get("launch_group", None), 67 | } 68 | 69 | # handle all known commands and error on an unknown command 70 | if self._command_name == "details": 71 | self.print_instances() 72 | 73 | elif self._command_name == "simple-details": 74 | self.simple_print_instances(argv, options_dict) 75 | 76 | elif self._command_name == "terminate-cluster": 77 | self.terminate_cluster(argv, options_dict) 78 | 79 | elif self._command_name == "launch-cluster": 80 | self.launch_cluster(argv, options_dict) 81 | 82 | elif self._command_name == "expand-cluster": 83 | self.expand_cluster(argv, options_dict) 84 | 85 | elif self._command_name == "login": 86 | self.login(argv, options_dict) 87 | 88 | elif self._command_name == "run-command": 89 | self.run_command(argv, options_dict) 90 | 91 | elif self._command_name == "transfer-files": 92 | self.transfer_files(argv, options_dict) 93 | 94 | elif self._command_name == "create-storage": 95 | self.create_storage(argv, options_dict) 96 | 97 | elif self._command_name == "delete-storage": 98 | self.delete_storage(argv, options_dict) 99 | 100 | elif self._command_name == "list-storage": 101 | self.print_storage() 102 | 103 | else: 104 | self.print_help() 105 | 106 | def expand_cluster(self, argv, options_dict): 107 | expected_arguments = ["NUM_INSTANCES"] 108 | opt, args = self.parse_options(self._command_name, 109 | argv, 110 | expected_arguments=expected_arguments, 111 | unbounded_args=True) 112 | opt.update(options_dict) 113 | 114 | number_of_nodes = int(args[0]) 115 | instance_template = InstanceTemplate( 116 | (self.service.SIMPLE_NODE,), 117 | number_of_nodes, 118 | opt.get('image_id'), 119 | opt.get('instance_type'), 120 | opt.get('key_name'), 121 | opt.get('public_key'), 122 | opt.get('user_data_file'), 123 | opt.get('availability_zone'), 124 | opt.get('user_packages'), 125 | opt.get('auto_shutdown'), 126 | opt.get('env'), 127 | opt.get('security_groups'), 128 | self._spot_config 129 | ) 130 | 131 | # instance_template.add_env_strings(["CLUSTER_SIZE=%d" % number_of_nodes]) 132 | 133 | print "Expanding cluster by %d instance(s)...please wait." 
% number_of_nodes 134 | 135 | self.service.expand_cluster(instance_template, 136 | opt.get('ssh_options'),opt.get('wait_dir', '/')) 137 | 138 | def launch_cluster(self, argv, options_dict): 139 | """ 140 | """ 141 | expected_arguments = ["NUM_INSTANCES"] 142 | opt, args = self.parse_options(self._command_name, 143 | argv, 144 | expected_arguments=expected_arguments) 145 | opt.update(options_dict) 146 | 147 | number_of_nodes = int(args[0]) 148 | instance_template = InstanceTemplate( 149 | (self.service.SIMPLE_NODE,), 150 | number_of_nodes, 151 | opt.get('image_id'), 152 | opt.get('instance_type'), 153 | opt.get('key_name'), 154 | opt.get('public_key'), 155 | opt.get('user_data_file'), 156 | opt.get('availability_zone'), 157 | opt.get('user_packages'), 158 | opt.get('auto_shutdown'), 159 | opt.get('env'), 160 | opt.get('security_groups'), 161 | self._spot_config 162 | ) 163 | 164 | instance_template.add_env_strings(["CLUSTER_SIZE=%d" % number_of_nodes]) 165 | 166 | print "Launching cluster with %d instance(s)...please wait." % number_of_nodes 167 | 168 | self.service.launch_cluster(instance_template, 169 | opt.get('ssh_options'),opt.get('wait_dir', '/')) 170 | 171 | def create_storage(self, argv, options_dict): 172 | opt, args = self.parse_options(self._command_name, argv, BASIC_OPTIONS, 173 | ["NUM_INSTANCES", "SPEC_FILE"]) 174 | opt.update(options_dict) 175 | 176 | role = self.service.SIMPLE_NODE 177 | number_of_instances = int(args[0]) 178 | spec_file = args[1] 179 | 180 | # FIXME 181 | # check_options_set(opt, ['availability_zone']) 182 | 183 | self.service.create_storage(role, 184 | number_of_instances, 185 | opt.get('availability_zone'), 186 | spec_file) 187 | self.print_storage() 188 | -------------------------------------------------------------------------------- /plugins/simple/service.plugin: -------------------------------------------------------------------------------- 1 | [Core] 2 | Name = simple 3 | Module = service 4 | 5 | [Documentation] 6 | Author = Abe Music 7 | Website = http://github.com/digitalreasoning/PyStratus 8 | Description = A simple service implementation for PyStratus 9 | -------------------------------------------------------------------------------- /plugins/simple/service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import subprocess 5 | import urllib 6 | import tempfile 7 | 8 | from cloud.cluster import TimeoutException 9 | from cloud.service import InstanceTemplate 10 | from cloud.plugin import ServicePlugin 11 | from cloud.util import xstr 12 | 13 | from yaml import load as parse_yaml 14 | from yaml import dump as dump_yaml 15 | 16 | try: 17 | from cElementTree import parse as parse_xml 18 | from cElementTree import tostring as dump_xml 19 | from cElementTree import Element 20 | except: 21 | try: 22 | from xml.etree.cElementTree import parse as parse_xml 23 | from xml.etree.cElementTree import tostring as dump_xml 24 | from xml.etree.cElementTree import Element 25 | except: 26 | print "*"*80 27 | print "WARNING: cElementTree module does not exist. Defaulting to elementtree instead." 28 | print "It's recommended that you install the cElementTree module for faster XML parsing." 
29 | print "*"*80 30 | from elementtree.ElementTree import parse as parse_xml 31 | from elementtree.ElementTree import parse as parse_xml 32 | from elementtree.ElementTree import Element 33 | 34 | class SimpleService(ServicePlugin): 35 | """ 36 | """ 37 | SIMPLE_NODE = "sn" 38 | 39 | def __init__(self): 40 | super(SimpleService, self).__init__() 41 | 42 | def get_roles(self): 43 | return [self.SIMPLE_NODE] 44 | 45 | def get_instances(self): 46 | return self.cluster.get_instances_in_role(self.SIMPLE_NODE, "running") 47 | 48 | def _wait_for_install(self, instance, ssh_options, wait_dir): 49 | """ 50 | Simply wait for the 'wait' directory to be available so that we can begin configuring 51 | the service before starting it 52 | """ 53 | wait_time = 3 54 | errcount = 0 55 | command = "ls %s" % wait_dir 56 | ssh_command = self._get_standard_ssh_command(instance, ssh_options, command) 57 | 58 | self.logger.info("Waiting for install with command %s" % ssh_command) 59 | while True: 60 | if errcount >= 10: 61 | raise TimeoutException("Maximum errors exceeded.") 62 | try: 63 | subprocess.check_output(ssh_command, shell=True, stderr=subprocess.STDOUT) 64 | break 65 | except subprocess.CalledProcessError, e: 66 | error = e.output.strip() 67 | retcode = e.returncode 68 | if retcode != 255: 69 | print error 70 | print "Return code: %d" % retcode 71 | elif retcode == 255 and "connection refused" in error.lower(): 72 | print "Connection refused error. Typically means SSH services have not been started yet. Retrying." 73 | errcount += 1 74 | else: 75 | print "SSH error. Cause: %s" % e.output.strip() 76 | print "Return code: %d" % retcode 77 | raise 78 | 79 | self.logger.debug("Sleeping for %d seconds..." % wait_time) 80 | time.sleep(wait_time) 81 | 82 | def expand_cluster(self, instance_template, ssh_options, wait_dir): 83 | instances = self.get_instances() 84 | 85 | instance_ids = self._launch_instances(instance_template) 86 | 87 | if len(instance_ids) != instance_template.number: 88 | self.logger.warn("Number of reported instance ids (%d) " \ 89 | "does not match requested number (%d)" % \ 90 | (len(instance_ids), instance_template.number)) 91 | self.logger.debug("Waiting for %s instance(s) to start: %s" % \ 92 | (instance_template.number, ", ".join(instance_ids))) 93 | time.sleep(1) 94 | 95 | try: 96 | self.cluster.wait_for_instances(instance_ids) 97 | self.logger.debug("%d instances started" % (instance_template.number,)) 98 | except TimeoutException: 99 | self.logger.error("Timeout while waiting for %s instance to start." % \ 100 | ",".join(instance_template.roles)) 101 | 102 | instances = self.get_instances() 103 | self.logger.debug("We have %d current instances...", len(instances)) 104 | new_instances = [instance for instance in instances if instance.id in instance_ids] 105 | if(len(new_instances) != len(instance_ids)) : 106 | raise Exception("Could only find %d new instances, expected %s" % (len(new_instances), str(instance_ids))) 107 | 108 | for instance in instances: 109 | self._wait_for_install(instance, ssh_options, wait_dir) 110 | self.logger.info("Instances started: %s" % (str(new_instances),)) 111 | 112 | self._attach_storage(instance_template.roles) 113 | 114 | 115 | def launch_cluster(self, instance_template, ssh_options, wait_dir): 116 | """ 117 | """ 118 | if self.get_instances() : 119 | raise Exception("This cluster is already running. 
It must be terminated prior to being launched again.") 120 | 121 | self.expand_cluster(instance_template, ssh_options, wait_dir) 122 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | from setuptools import setup, find_packages 17 | 18 | version = __import__('cloud').VERSION 19 | 20 | setup(name='stratus', 21 | version=version, 22 | description='Scripts for running various services on cloud providers', 23 | license = 'Apache License (2.0)', 24 | author = 'Abe Music - Digital Reasoning Systems, Inc.', 25 | author_email = 'abe.music@digitalreasoning.com', 26 | packages=['cloud', 'cloud.providers', 'cloud.plugins', 'cloud.plugins.cassandra', 'cloud.plugins.hadoop', 'cloud.plugins.hadoop_cassandra_hybrid', 'cloud.plugins.simple'], 27 | package_dir = {'cloud.plugins': 'plugins'}, 28 | scripts=['stratus'], 29 | include_package_data=True, 30 | package_data = {'': ['*.plugin']}, 31 | install_requires = ['boto==2.0','python-dateutil==1.5','simplejson','prettytable==0.5','yapsy==1.8','fabric','PyYAML'], 32 | ) 33 | --------------------------------------------------------------------------------