├── .gitignore
├── LICENSE
├── README.md
├── config.yml.example
├── emr_loader.py
├── environment.yml
└── scripts
    ├── bootstrap_actions.sh
    └── pyspark_quick_setup.sh

/.gitignore:
--------------------------------------------------------------------------------
# Created by https://www.gitignore.io/api/python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv/
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# Project specific
config.yml
.idea/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Dat Tran

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# EMR Bootstrap PySpark with Anaconda

This project helps you jump-start PySpark with Anaconda on AWS EMR.

## Getting Started
1. Create the conda environment: `conda env create -f environment.yml`
2. Fill in the required information (e.g. AWS access key, secret access key) in `config.yml.example` and rename it to `config.yml`.
3. Run `python emr_loader.py`.

## Requirements
- [Anaconda 3](https://www.continuum.io/downloads)
- [AWS Account](https://aws.amazon.com/)

## Copyright

See [LICENSE](LICENSE) for details.
Copyright (c) 2016 [Dat Tran](http://www.dat-tran.com/).
--------------------------------------------------------------------------------
/config.yml.example:
--------------------------------------------------------------------------------
emr:
  aws_access_key:
  aws_secret_access_key:
  region_name:
  cluster_name:
  instance_count:
  master_instance_type:
  slave_instance_type:
  key_name:
  subnet_id:
  log_uri: s3://
  software_version:
  script_bucket_name:
--------------------------------------------------------------------------------
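For orientation, here is a hypothetical filled-in `config.yml`. Every value below is a made-up placeholder (not a real credential, subnet, or bucket); the keys mirror `config.yml.example` above and are the ones `emr_loader.py` reads.

```yaml
# Hypothetical example values only - substitute your own account details.
emr:
  aws_access_key: AKIAXXXXXXXXXXXXXXXX          # placeholder access key id
  aws_secret_access_key: xxxxxxxxxxxxxxxxxxxx   # placeholder secret key
  region_name: eu-central-1                     # any EMR-supported region
  cluster_name: emr-bootstrap-pyspark           # free-form cluster name
  instance_count: 3                             # master + core nodes
  master_instance_type: m3.xlarge
  slave_instance_type: m3.xlarge
  key_name: my-ec2-keypair                      # name of an existing EC2 key pair
  subnet_id: subnet-0123abcd                    # subnet in the chosen region
  log_uri: s3://my-emr-logs/                    # bucket for EMR logs
  software_version: emr-5.0.0                   # EMR release label
  script_bucket_name: my-emr-bootstrap-scripts  # bucket for the bootstrap scripts
```

Note that `instance_count` is passed straight through to `InstanceCount` in `run_job_flow`, so it should be a plain integer, and `software_version` becomes the cluster's `ReleaseLabel`.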
/emr_loader.py:
--------------------------------------------------------------------------------
import boto3
import botocore
import yaml
import time
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)


class EMRLoader(object):
    def __init__(self, aws_access_key, aws_secret_access_key, region_name,
                 cluster_name, instance_count, master_instance_type, slave_instance_type,
                 key_name, subnet_id, log_uri, software_version, script_bucket_name):
        self.aws_access_key = aws_access_key
        self.aws_secret_access_key = aws_secret_access_key
        self.region_name = region_name
        self.cluster_name = cluster_name
        self.instance_count = instance_count
        self.master_instance_type = master_instance_type
        self.slave_instance_type = slave_instance_type
        self.key_name = key_name
        self.subnet_id = subnet_id
        self.log_uri = log_uri
        self.software_version = software_version
        self.script_bucket_name = script_bucket_name

    def boto_client(self, service):
        client = boto3.client(service,
                              aws_access_key_id=self.aws_access_key,
                              aws_secret_access_key=self.aws_secret_access_key,
                              region_name=self.region_name)
        return client

    def load_cluster(self):
        # Launch the EMR cluster with Spark installed and the conda bootstrap action attached.
        response = self.boto_client("emr").run_job_flow(
            Name=self.cluster_name,
            LogUri=self.log_uri,
            ReleaseLabel=self.software_version,
            Instances={
                'MasterInstanceType': self.master_instance_type,
                'SlaveInstanceType': self.slave_instance_type,
                'InstanceCount': self.instance_count,
                'KeepJobFlowAliveWhenNoSteps': True,
                'TerminationProtected': False,
                'Ec2KeyName': self.key_name,
                'Ec2SubnetId': self.subnet_id
            },
            Applications=[
                {
                    'Name': 'Spark'
                }
            ],
            BootstrapActions=[
                {
                    'Name': 'Install Conda',
                    'ScriptBootstrapAction': {
                        'Path': 's3://{script_bucket_name}/bootstrap_actions.sh'.format(
                            script_bucket_name=self.script_bucket_name),
                    }
                },
            ],
            VisibleToAllUsers=True,
            JobFlowRole='EMR_EC2_DefaultRole',
            ServiceRole='EMR_DefaultRole'
        )
        logger.info(response)
        return response

    def add_step(self, job_flow_id, master_dns):
        # Copy the PySpark setup script to the master node, then execute it there.
        response = self.boto_client("emr").add_job_flow_steps(
            JobFlowId=job_flow_id,
            Steps=[
                {
                    'Name': 'setup - copy files',
                    'ActionOnFailure': 'CANCEL_AND_WAIT',
                    'HadoopJarStep': {
                        'Jar': 'command-runner.jar',
                        'Args': ['aws', 's3', 'cp',
                                 's3://{script_bucket_name}/pyspark_quick_setup.sh'.format(
                                     script_bucket_name=self.script_bucket_name),
                                 '/home/hadoop/']
                    }
                },
                {
                    'Name': 'setup pyspark with conda',
                    'ActionOnFailure': 'CANCEL_AND_WAIT',
                    'HadoopJarStep': {
                        'Jar': 'command-runner.jar',
                        'Args': ['sudo', 'bash', '/home/hadoop/pyspark_quick_setup.sh', master_dns]
                    }
                }
            ]
        )
        logger.info(response)
        return response

    def create_bucket_on_s3(self, bucket_name):
        s3 = self.boto_client("s3")
        try:
            # head_bucket raises a ClientError if the bucket does not exist or is not accessible.
            s3.head_bucket(Bucket=bucket_name)
            logger.info("Bucket already exists.")
        except botocore.exceptions.ClientError as e:
            logger.info("Bucket does not exist: {error}. I will create it!".format(error=e))
            if self.region_name == "us-east-1":
                s3.create_bucket(Bucket=bucket_name)
            else:
                # Outside us-east-1, S3 requires an explicit LocationConstraint.
                s3.create_bucket(Bucket=bucket_name,
                                 CreateBucketConfiguration={'LocationConstraint': self.region_name})

    def upload_to_s3(self, file_name, bucket_name, key_name):
        s3 = self.boto_client("s3")
        logger.info(
            "Upload file '{file_name}' to bucket '{bucket_name}'".format(file_name=file_name, bucket_name=bucket_name))
        s3.upload_file(file_name, bucket_name, key_name)


def main():
    logger.info(
        "*******************************************************************************************************")
    logger.info("Load config and set up client.")
    with open("config.yml", "r") as file:
        config = yaml.safe_load(file)
    config_emr = config.get("emr")

    emr_loader = EMRLoader(
        aws_access_key=config_emr.get("aws_access_key"),
        aws_secret_access_key=config_emr.get("aws_secret_access_key"),
        region_name=config_emr.get("region_name"),
        cluster_name=config_emr.get("cluster_name"),
        instance_count=config_emr.get("instance_count"),
        master_instance_type=config_emr.get("master_instance_type"),
        slave_instance_type=config_emr.get("slave_instance_type"),
        key_name=config_emr.get("key_name"),
        subnet_id=config_emr.get("subnet_id"),
        log_uri=config_emr.get("log_uri"),
        software_version=config_emr.get("software_version"),
        script_bucket_name=config_emr.get("script_bucket_name")
    )

    logger.info(
        "*******************************************************************************************************")
    logger.info("Check if bucket exists, otherwise create it, and upload files to S3.")
    emr_loader.create_bucket_on_s3(bucket_name=config_emr.get("script_bucket_name"))
    emr_loader.upload_to_s3("scripts/bootstrap_actions.sh", bucket_name=config_emr.get("script_bucket_name"),
                            key_name="bootstrap_actions.sh")
    emr_loader.upload_to_s3("scripts/pyspark_quick_setup.sh", bucket_name=config_emr.get("script_bucket_name"),
                            key_name="pyspark_quick_setup.sh")

    logger.info(
        "*******************************************************************************************************")
    logger.info("Create cluster and run bootstrap.")
    emr_response = emr_loader.load_cluster()
    emr_client = emr_loader.boto_client("emr")

    while True:
        job_response = emr_client.describe_cluster(
            ClusterId=emr_response.get("JobFlowId")
        )
        time.sleep(10)
        if job_response.get("Cluster").get("MasterPublicDnsName") is not None:
            master_dns = job_response.get("Cluster").get("MasterPublicDnsName")

        step = True

        job_state = job_response.get("Cluster").get("Status").get("State")
        job_state_reason = job_response.get("Cluster").get("Status").get("StateChangeReason").get("Message")

        if job_state in ["WAITING", "TERMINATED", "TERMINATED_WITH_ERRORS"]:
            # Only run the steps if the cluster came up cleanly and is waiting for work.
            step = (job_state == "WAITING")
            logger.info(
                "Script stops with state: {job_state} "
                "and reason: {job_state_reason}".format(job_state=job_state, job_state_reason=job_state_reason))
            break
        else:
            logger.info(job_response)

    if step:
        logger.info(
            "*******************************************************************************************************")
        logger.info("Run steps.")
        add_step_response = emr_loader.add_step(emr_response.get("JobFlowId"), master_dns)

        while True:
            list_steps_response = emr_client.list_steps(ClusterId=emr_response.get("JobFlowId"),
                                                        StepStates=["COMPLETED"])
            time.sleep(10)
            if len(list_steps_response.get("Steps")) == len(
                    add_step_response.get("StepIds")):  # make sure that all steps are completed
                break
            else:
                logger.info(emr_client.list_steps(ClusterId=emr_response.get("JobFlowId")))
    else:
        logger.info("Cannot run steps.")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
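The `while` loop in `main()` hand-rolls the wait for the cluster to come up. boto3 also ships EMR waiters that perform the same polling; the following is a minimal sketch of that alternative, assuming the variables created in `main()` (`emr_loader`, `emr_response`) and a boto3/botocore version recent enough to include the `cluster_running` and `step_complete` waiters (the 2016 versions pinned in `environment.yml` may not).

```python
# Sketch only: wait for the cluster using boto3's built-in EMR waiters instead of a manual loop.
# Assumes `emr_loader` and `emr_response` exist exactly as created in main() above.
emr_client = emr_loader.boto_client("emr")
cluster_id = emr_response.get("JobFlowId")

# Blocks until the cluster reaches RUNNING or WAITING; raises WaiterError on terminal failure states.
emr_client.get_waiter("cluster_running").wait(ClusterId=cluster_id)

master_dns = emr_client.describe_cluster(ClusterId=cluster_id)["Cluster"]["MasterPublicDnsName"]
add_step_response = emr_loader.add_step(cluster_id, master_dns)

# The step_complete waiter can likewise replace the list_steps polling loop.
for step_id in add_step_response.get("StepIds"):
    emr_client.get_waiter("step_complete").wait(ClusterId=cluster_id, StepId=step_id)
```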
/environment.yml:
--------------------------------------------------------------------------------
name: emr-bootstrap-pyspark
channels:
- spacy
- conda-forge
- defaults
dependencies:
- boto3=1.4.0=py35_0
- conda-forge::botocore=1.4.49=py35_0
- conda-forge::ca-certificates=2016.8.31=0
- conda-forge::certifi=2016.8.31=py35_0
- conda-forge::docutils=0.12=py35_0
- conda-forge::jmespath=0.9.0=py35_0
- conda-forge::ncurses=5.9=9
- conda-forge::openssl=1.0.2h=2
- conda-forge::pip=8.1.2=py35_0
- conda-forge::python=3.5.2=2
- conda-forge::python-dateutil=2.5.3=py35_0
- conda-forge::pyyaml=3.11=py35_0
- conda-forge::readline=6.2=0
- conda-forge::s3transfer=0.1.8=py35_0
- conda-forge::setuptools=27.1.2=py35_0
- conda-forge::six=1.10.0=py35_0
- conda-forge::sqlite=3.13.0=1
- conda-forge::tk=8.5.19=0
- conda-forge::wheel=0.29.0=py35_0
- conda-forge::xz=5.2.2=0
- conda-forge::yaml=0.1.6=0
- conda-forge::zlib=1.2.8=3
- pip:
  - botocore==1.4.49
  - certifi==2016.8.31
  - docutils==0.12
  - jmespath==0.9.0
  - pip==8.1.2
  - python-dateutil==2.5.3
  - pyyaml==3.11
  - s3transfer==0.1.8
  - setuptools==27.1.2.post20161105
  - six==1.10.0
  - wheel==0.29.0
prefix: /Users/datitran/anaconda/envs/emr-bootstrap-pyspark
--------------------------------------------------------------------------------
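To reproduce this environment and run the loader end to end, the README steps condense to the following, assuming Anaconda 3 is installed and `config.yml` has been filled in (`source activate` is the conda 4.x-era activation command matching the pinned versions above):

```bash
# Recreate the pinned environment, activate it, and launch the cluster setup.
conda env create -f environment.yml
source activate emr-bootstrap-pyspark
python emr_loader.py
```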
/scripts/bootstrap_actions.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# install conda
wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \
    && /bin/bash ~/miniconda.sh -b -p $HOME/conda

echo -e '\nexport PATH=$HOME/conda/bin:$PATH' >> $HOME/.bashrc && source $HOME/.bashrc

# install packages
conda install -y ipython jupyter
--------------------------------------------------------------------------------
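Bootstrap actions run as the `hadoop` user on every node, which is why Miniconda ends up in `/home/hadoop/conda`, the path that `pyspark_quick_setup.sh` below relies on. A quick, hypothetical sanity check once the cluster reaches the WAITING state — `<key.pem>` and `<master_dns>` are placeholders for your EC2 key file and the cluster's `MasterPublicDnsName`, and the master's security group must allow inbound SSH:

```bash
# Hypothetical check that the bootstrap action installed conda where the setup step expects it.
ssh -i <key.pem> hadoop@<master_dns> "/home/hadoop/conda/bin/python --version"
```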
/scripts/pyspark_quick_setup.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

# bind conda to spark
echo -e "\nexport PYSPARK_PYTHON=/home/hadoop/conda/bin/python" >> /etc/spark/conf/spark-env.sh
echo "export PYSPARK_DRIVER_PYTHON=/home/hadoop/conda/bin/jupyter" >> /etc/spark/conf/spark-env.sh
echo "export PYSPARK_DRIVER_PYTHON_OPTS='notebook --no-browser --ip=$1'" >> /etc/spark/conf/spark-env.sh
--------------------------------------------------------------------------------
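With `PYSPARK_DRIVER_PYTHON` pointed at Jupyter and `PYSPARK_DRIVER_PYTHON_OPTS` set to `notebook --no-browser --ip=<master_dns>`, running `pyspark` on the master node starts a notebook server instead of the plain shell. One hypothetical way to reach it from your own machine, assuming the same `<key.pem>`/`<master_dns>` placeholders as above and Jupyter's default port 8888:

```bash
# Sketch only: forward the notebook port from the EMR master node to localhost.
# <key.pem> and <master_dns> are placeholders; 8888 is Jupyter's default port.
ssh -i <key.pem> -N -L 8888:<master_dns>:8888 hadoop@<master_dns>

# In a second SSH session on the master node, start the notebook-backed PySpark shell:
#   pyspark
# then open http://localhost:8888 in a local browser.
```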