├── .gitignore
├── LICENSE
├── README.md
├── config.yml.example
├── emr_loader.py
├── environment.yml
└── scripts
    ├── bootstrap_actions.sh
    └── pyspark_quick_setup.sh

/.gitignore:
--------------------------------------------------------------------------------
# Created by https://www.gitignore.io/api/python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# Project specific
config.yml
.idea/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Dat Tran

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# EMR Bootstrap PySpark with Anaconda

This project helps you jump-start PySpark with Anaconda on AWS EMR.

## Getting Started
1. Create the conda environment: `conda env create -f environment.yml`
2. Fill in the required information (e.g. AWS access key, secret access key) in `config.yml.example` and rename it to `config.yml`.
3. Run `python emr_loader.py`.

## Requirements
- [Anaconda 3](https://www.continuum.io/downloads)
- [AWS Account](https://aws.amazon.com/)

## Copyright

See [LICENSE](LICENSE) for details.
Copyright (c) 2016 [Dat Tran](http://www.dat-tran.com/).
--------------------------------------------------------------------------------
/config.yml.example:
--------------------------------------------------------------------------------
emr:
  aws_access_key:
  aws_secret_access_key:
  region_name:
  cluster_name:
  instance_count:
  master_instance_type:
  slave_instance_type:
  key_name:
  subnet_id:
  log_uri: s3://
  software_version:
  script_bucket_name:
--------------------------------------------------------------------------------
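For orientation, here is a hypothetical filled-in `config.yml`. Every value below is a made-up placeholder (not a real credential, subnet, or bucket); the keys mirror `config.yml.example` above and are the ones `emr_loader.py` reads.

```yaml
# Hypothetical example values only - substitute your own account details.
emr:
  aws_access_key: AKIAXXXXXXXXXXXXXXXX          # placeholder access key id
  aws_secret_access_key: xxxxxxxxxxxxxxxxxxxx   # placeholder secret key
  region_name: eu-central-1                     # any EMR-supported region
  cluster_name: emr-bootstrap-pyspark           # free-form cluster name
  instance_count: 3                             # master + core nodes
  master_instance_type: m3.xlarge
  slave_instance_type: m3.xlarge
  key_name: my-ec2-keypair                      # name of an existing EC2 key pair
  subnet_id: subnet-0123abcd                    # subnet in the chosen region
  log_uri: s3://my-emr-logs/                    # bucket for EMR logs
  software_version: emr-5.0.0                   # EMR release label
  script_bucket_name: my-emr-bootstrap-scripts  # bucket for the bootstrap scripts
```

Note that `instance_count` is passed straight through to `InstanceCount` in `run_job_flow`, so it should be a plain integer, and `software_version` becomes the cluster's `ReleaseLabel`.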
/emr_loader.py:
--------------------------------------------------------------------------------
import boto3
import botocore
import yaml
import time
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)


class EMRLoader(object):
    def __init__(self, aws_access_key, aws_secret_access_key, region_name,
                 cluster_name, instance_count, master_instance_type, slave_instance_type,
                 key_name, subnet_id, log_uri, software_version, script_bucket_name):
        self.aws_access_key = aws_access_key
        self.aws_secret_access_key = aws_secret_access_key
        self.region_name = region_name
        self.cluster_name = cluster_name
        self.instance_count = instance_count
        self.master_instance_type = master_instance_type
        self.slave_instance_type = slave_instance_type
        self.key_name = key_name
        self.subnet_id = subnet_id
        self.log_uri = log_uri
        self.software_version = software_version
        self.script_bucket_name = script_bucket_name

    def boto_client(self, service):
        client = boto3.client(service,
                              aws_access_key_id=self.aws_access_key,
                              aws_secret_access_key=self.aws_secret_access_key,
                              region_name=self.region_name)
        return client

    def load_cluster(self):
        # Launch the EMR cluster with Spark installed and the conda bootstrap action attached.
        response = self.boto_client("emr").run_job_flow(
            Name=self.cluster_name,
            LogUri=self.log_uri,
            ReleaseLabel=self.software_version,
            Instances={
                'MasterInstanceType': self.master_instance_type,
                'SlaveInstanceType': self.slave_instance_type,
                'InstanceCount': self.instance_count,
                'KeepJobFlowAliveWhenNoSteps': True,
                'TerminationProtected': False,
                'Ec2KeyName': self.key_name,
                'Ec2SubnetId': self.subnet_id
            },
            Applications=[
                {
                    'Name': 'Spark'
                }
            ],
            BootstrapActions=[
                {
                    'Name': 'Install Conda',
                    'ScriptBootstrapAction': {
                        'Path': 's3://{script_bucket_name}/bootstrap_actions.sh'.format(
                            script_bucket_name=self.script_bucket_name),
                    }
                },
            ],
            VisibleToAllUsers=True,
            JobFlowRole='EMR_EC2_DefaultRole',
            ServiceRole='EMR_DefaultRole'
        )
        logger.info(response)
        return response

    def add_step(self, job_flow_id, master_dns):
        # Copy the PySpark setup script to the master node, then execute it there.
        response = self.boto_client("emr").add_job_flow_steps(
            JobFlowId=job_flow_id,
            Steps=[
                {
                    'Name': 'setup - copy files',
                    'ActionOnFailure': 'CANCEL_AND_WAIT',
                    'HadoopJarStep': {
                        'Jar': 'command-runner.jar',
                        'Args': ['aws', 's3', 'cp',
                                 's3://{script_bucket_name}/pyspark_quick_setup.sh'.format(
                                     script_bucket_name=self.script_bucket_name),
                                 '/home/hadoop/']
                    }
                },
                {
                    'Name': 'setup pyspark with conda',
                    'ActionOnFailure': 'CANCEL_AND_WAIT',
                    'HadoopJarStep': {
                        'Jar': 'command-runner.jar',
                        'Args': ['sudo', 'bash', '/home/hadoop/pyspark_quick_setup.sh', master_dns]
                    }
                }
            ]
        )
        logger.info(response)
        return response

    def create_bucket_on_s3(self, bucket_name):
        s3 = self.boto_client("s3")
        try:
            # head_bucket raises a ClientError if the bucket does not exist or is not accessible.
            s3.head_bucket(Bucket=bucket_name)
            logger.info("Bucket already exists.")
        except botocore.exceptions.ClientError as e:
            logger.info("Bucket does not exist: {error}. I will create it!".format(error=e))
            if self.region_name == "us-east-1":
                s3.create_bucket(Bucket=bucket_name)
            else:
                # Outside us-east-1, S3 requires an explicit LocationConstraint.
                s3.create_bucket(Bucket=bucket_name,
                                 CreateBucketConfiguration={'LocationConstraint': self.region_name})

    def upload_to_s3(self, file_name, bucket_name, key_name):
        s3 = self.boto_client("s3")
        logger.info(
            "Upload file '{file_name}' to bucket '{bucket_name}'".format(file_name=file_name, bucket_name=bucket_name))
        s3.upload_file(file_name, bucket_name, key_name)


def main():
    logger.info(
        "*******************************************************************************************************")
    logger.info("Load config and set up client.")
    with open("config.yml", "r") as file:
        config = yaml.safe_load(file)
    config_emr = config.get("emr")

    emr_loader = EMRLoader(
        aws_access_key=config_emr.get("aws_access_key"),
        aws_secret_access_key=config_emr.get("aws_secret_access_key"),
        region_name=config_emr.get("region_name"),
        cluster_name=config_emr.get("cluster_name"),
        instance_count=config_emr.get("instance_count"),
        master_instance_type=config_emr.get("master_instance_type"),
        slave_instance_type=config_emr.get("slave_instance_type"),
        key_name=config_emr.get("key_name"),
        subnet_id=config_emr.get("subnet_id"),
        log_uri=config_emr.get("log_uri"),
        software_version=config_emr.get("software_version"),
        script_bucket_name=config_emr.get("script_bucket_name")
    )

    logger.info(
        "*******************************************************************************************************")
    logger.info("Check if bucket exists, otherwise create it, and upload files to S3.")
    emr_loader.create_bucket_on_s3(bucket_name=config_emr.get("script_bucket_name"))
    emr_loader.upload_to_s3("scripts/bootstrap_actions.sh", bucket_name=config_emr.get("script_bucket_name"),
                            key_name="bootstrap_actions.sh")
    emr_loader.upload_to_s3("scripts/pyspark_quick_setup.sh", bucket_name=config_emr.get("script_bucket_name"),
                            key_name="pyspark_quick_setup.sh")

    logger.info(
        "*******************************************************************************************************")
    logger.info("Create cluster and run bootstrap.")
    emr_response = emr_loader.load_cluster()
    emr_client = emr_loader.boto_client("emr")

    while True:
        job_response = emr_client.describe_cluster(
            ClusterId=emr_response.get("JobFlowId")
        )
        time.sleep(10)
        if job_response.get("Cluster").get("MasterPublicDnsName") is not None:
            master_dns = job_response.get("Cluster").get("MasterPublicDnsName")

        step = True

        job_state = job_response.get("Cluster").get("Status").get("State")
        job_state_reason = job_response.get("Cluster").get("Status").get("StateChangeReason").get("Message")

        if job_state in ["WAITING", "TERMINATED", "TERMINATED_WITH_ERRORS"]:
            # Only run the steps if the cluster came up cleanly and is waiting for work.
            step = (job_state == "WAITING")
            logger.info(
                "Script stops with state: {job_state} "
                "and reason: {job_state_reason}".format(job_state=job_state, job_state_reason=job_state_reason))
            break
        else:
            logger.info(job_response)

    if step:
        logger.info(
            "*******************************************************************************************************")
        logger.info("Run steps.")
        add_step_response = emr_loader.add_step(emr_response.get("JobFlowId"), master_dns)

        while True:
            list_steps_response = emr_client.list_steps(ClusterId=emr_response.get("JobFlowId"),
                                                        StepStates=["COMPLETED"])
            time.sleep(10)
            if len(list_steps_response.get("Steps")) == len(
                    add_step_response.get("StepIds")):  # make sure that all steps are completed
                break
            else:
                logger.info(emr_client.list_steps(ClusterId=emr_response.get("JobFlowId")))
    else:
        logger.info("Cannot run steps.")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
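The `while` loop in `main()` hand-rolls the wait for the cluster to come up. boto3 also ships EMR waiters that perform the same polling; the following is a minimal sketch of that alternative, assuming the variables created in `main()` (`emr_loader`, `emr_response`) and a boto3/botocore version recent enough to include the `cluster_running` and `step_complete` waiters (the 2016 versions pinned in `environment.yml` may not).

```python
# Sketch only: wait for the cluster using boto3's built-in EMR waiters instead of a manual loop.
# Assumes `emr_loader` and `emr_response` exist exactly as created in main() above.
emr_client = emr_loader.boto_client("emr")
cluster_id = emr_response.get("JobFlowId")

# Blocks until the cluster reaches RUNNING or WAITING; raises WaiterError on terminal failure states.
emr_client.get_waiter("cluster_running").wait(ClusterId=cluster_id)

master_dns = emr_client.describe_cluster(ClusterId=cluster_id)["Cluster"]["MasterPublicDnsName"]
add_step_response = emr_loader.add_step(cluster_id, master_dns)

# The step_complete waiter can likewise replace the list_steps polling loop.
for step_id in add_step_response.get("StepIds"):
    emr_client.get_waiter("step_complete").wait(ClusterId=cluster_id, StepId=step_id)
```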
/environment.yml:
--------------------------------------------------------------------------------
name: emr-bootstrap-pyspark
channels:
- spacy
- conda-forge
- defaults
dependencies:
- boto3=1.4.0=py35_0
- conda-forge::botocore=1.4.49=py35_0
- conda-forge::ca-certificates=2016.8.31=0
- conda-forge::certifi=2016.8.31=py35_0
- conda-forge::docutils=0.12=py35_0
- conda-forge::jmespath=0.9.0=py35_0
- conda-forge::ncurses=5.9=9
- conda-forge::openssl=1.0.2h=2
- conda-forge::pip=8.1.2=py35_0
- conda-forge::python=3.5.2=2
- conda-forge::python-dateutil=2.5.3=py35_0
- conda-forge::pyyaml=3.11=py35_0
- conda-forge::readline=6.2=0
- conda-forge::s3transfer=0.1.8=py35_0
- conda-forge::setuptools=27.1.2=py35_0
- conda-forge::six=1.10.0=py35_0
- conda-forge::sqlite=3.13.0=1
- conda-forge::tk=8.5.19=0
- conda-forge::wheel=0.29.0=py35_0
- conda-forge::xz=5.2.2=0
- conda-forge::yaml=0.1.6=0
- conda-forge::zlib=1.2.8=3
- pip:
  - botocore==1.4.49
  - certifi==2016.8.31
  - docutils==0.12
  - jmespath==0.9.0
  - pip==8.1.2
  - python-dateutil==2.5.3
  - pyyaml==3.11
  - s3transfer==0.1.8
  - setuptools==27.1.2.post20161105
  - six==1.10.0
  - wheel==0.29.0
prefix: /Users/datitran/anaconda/envs/emr-bootstrap-pyspark
--------------------------------------------------------------------------------
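To reproduce this environment and run the loader end to end, the README steps condense to the following, assuming Anaconda 3 is installed and `config.yml` has been filled in (`source activate` is the conda 4.x-era activation command matching the pinned versions above):

```bash
# Recreate the pinned environment, activate it, and launch the cluster setup.
conda env create -f environment.yml
source activate emr-bootstrap-pyspark
python emr_loader.py
```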
/scripts/bootstrap_actions.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# install conda
wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \
    && /bin/bash ~/miniconda.sh -b -p $HOME/conda

echo -e '\nexport PATH=$HOME/conda/bin:$PATH' >> $HOME/.bashrc && source $HOME/.bashrc

# install packages
conda install -y ipython jupyter
--------------------------------------------------------------------------------
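Bootstrap actions run as the `hadoop` user on every node, which is why Miniconda ends up in `/home/hadoop/conda`, the path that `pyspark_quick_setup.sh` below relies on. A quick, hypothetical sanity check once the cluster reaches the WAITING state — `<key.pem>` and `<master_dns>` are placeholders for your EC2 key file and the cluster's `MasterPublicDnsName`, and the master's security group must allow inbound SSH:

```bash
# Hypothetical check that the bootstrap action installed conda where the setup step expects it.
ssh -i <key.pem> hadoop@<master_dns> "/home/hadoop/conda/bin/python --version"
```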
/scripts/pyspark_quick_setup.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# bind conda to spark
echo -e "\nexport PYSPARK_PYTHON=/home/hadoop/conda/bin/python" >> /etc/spark/conf/spark-env.sh
echo "export PYSPARK_DRIVER_PYTHON=/home/hadoop/conda/bin/jupyter" >> /etc/spark/conf/spark-env.sh
echo "export PYSPARK_DRIVER_PYTHON_OPTS='notebook --no-browser --ip=$1'" >> /etc/spark/conf/spark-env.sh
--------------------------------------------------------------------------------
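With `PYSPARK_DRIVER_PYTHON` pointed at Jupyter and `PYSPARK_DRIVER_PYTHON_OPTS` set to `notebook --no-browser --ip=<master_dns>`, running `pyspark` on the master node starts a notebook server instead of the plain shell. One hypothetical way to reach it from your own machine, assuming the same `<key.pem>`/`<master_dns>` placeholders as above and Jupyter's default port 8888:

```bash
# Sketch only: forward the notebook port from the EMR master node to localhost.
# <key.pem> and <master_dns> are placeholders; 8888 is Jupyter's default port.
ssh -i <key.pem> -N -L 8888:<master_dns>:8888 hadoop@<master_dns>

# In a second SSH session on the master node, start the notebook-backed PySpark shell:
#   pyspark
# then open http://localhost:8888 in a local browser.
```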