├── .gitignore ├── Pipfile ├── Pipfile.lock ├── README.md ├── build_dependencies.sh ├── configs └── etl_config.json ├── dependencies ├── __init__.py ├── logging.py └── spark.py ├── jobs └── etl_job.py ├── packages.zip └── tests ├── __init__.py ├── test_data ├── employees │ ├── ._SUCCESS.crc │ ├── .part-00000-9abf32a3-db43-42e1-9639-363ef11c0d1c-c000.snappy.parquet.crc │ ├── _SUCCESS │ └── part-00000-9abf32a3-db43-42e1-9639-363ef11c0d1c-c000.snappy.parquet └── employees_report │ ├── ._SUCCESS.crc │ ├── .part-00000-4a609ba3-0404-48bb-bb22-2fec3e2f1e68-c000.snappy.parquet.crc │ ├── _SUCCESS │ └── part-00000-4a609ba3-0404-48bb-bb22-2fec3e2f1e68-c000.snappy.parquet └── test_etl_job.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | */__pycache__/* 3 | *?metastore_db/* 4 | */spark_warehouse/* 5 | .mypy_cache/ 6 | .vscode/* 7 | .venv 8 | venv/* 9 | loaded_data/* 10 | derby.log 11 | .DS_Store 12 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | 8 | [dev-packages] 9 | pyspark = "==2.4.0" 10 | ipython = "*" 11 | "flake8" = "*" 12 | 13 | [requires] 14 | python_version = "3.6" 15 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "6233634aaea8fcac205da57dd6cf6994a6cd2163c027a93883786699f221f912" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": {}, 19 | "develop": { 20 | "backcall": { 21 | "hashes": [ 22 | "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4", 23 | "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2" 24 | ], 25 | "version": "==0.1.0" 26 | }, 27 | "decorator": { 28 | "hashes": [ 29 | "sha256:33cd704aea07b4c28b3eb2c97d288a06918275dac0ecebdaf1bc8a48d98adb9e", 30 | "sha256:cabb249f4710888a2fc0e13e9a16c343d932033718ff62e1e9bc93a9d3a9122b" 31 | ], 32 | "version": "==4.3.2" 33 | }, 34 | "entrypoints": { 35 | "hashes": [ 36 | "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", 37 | "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451" 38 | ], 39 | "version": "==0.3" 40 | }, 41 | "flake8": { 42 | "hashes": [ 43 | "sha256:0323db2e3a72faa2c4cdd61ea87594b9cb343fc4dfa5c24d6b43059d7ba29d0e", 44 | "sha256:a7951ade4814e5e5364bdce1e73862cbacf2bbb9b509a9bf8c130a0414cf0722" 45 | ], 46 | "index": "pypi", 47 | "version": "==3.7.2" 48 | }, 49 | "ipython": { 50 | "hashes": [ 51 | "sha256:6a9496209b76463f1dec126ab928919aaf1f55b38beb9219af3fe202f6bbdd12", 52 | "sha256:f69932b1e806b38a7818d9a1e918e5821b685715040b48e59c657b3c7961b742" 53 | ], 54 | "index": "pypi", 55 | "version": "==7.2.0" 56 | }, 57 | "ipython-genutils": { 58 | "hashes": [ 59 | "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8", 60 | "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8" 61 | ], 62 | "version": "==0.2.0" 63 | }, 64 | "jedi": { 65 | "hashes": [ 66 | "sha256:571702b5bd167911fe9036e5039ba67f820d6502832285cde8c881ab2b2149fd", 67 | 
"sha256:c8481b5e59d34a5c7c42e98f6625e633f6ef59353abea6437472c7ec2093f191" 68 | ], 69 | "version": "==0.13.2" 70 | }, 71 | "mccabe": { 72 | "hashes": [ 73 | "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", 74 | "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" 75 | ], 76 | "version": "==0.6.1" 77 | }, 78 | "parso": { 79 | "hashes": [ 80 | "sha256:4b8f9ed80c3a4a3191aa3261505d868aa552dd25649cb13a7d73b6b7315edf2d", 81 | "sha256:5a120be2e8863993b597f1c0437efca799e90e0793c98ae5d4e34ebd00140e31" 82 | ], 83 | "version": "==0.3.2" 84 | }, 85 | "pexpect": { 86 | "hashes": [ 87 | "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba", 88 | "sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b" 89 | ], 90 | "markers": "sys_platform != 'win32'", 91 | "version": "==4.6.0" 92 | }, 93 | "pickleshare": { 94 | "hashes": [ 95 | "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca", 96 | "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56" 97 | ], 98 | "version": "==0.7.5" 99 | }, 100 | "prompt-toolkit": { 101 | "hashes": [ 102 | "sha256:88002cc618cacfda8760c4539e76c3b3f148ecdb7035a3d422c7ecdc90c2a3ba", 103 | "sha256:c6655a12e9b08edb8cf5aeab4815fd1e1bdea4ad73d3bbf269cf2e0c4eb75d5e", 104 | "sha256:df5835fb8f417aa55e5cafadbaeb0cf630a1e824aad16989f9f0493e679ec010" 105 | ], 106 | "version": "==2.0.8" 107 | }, 108 | "ptyprocess": { 109 | "hashes": [ 110 | "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0", 111 | "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f" 112 | ], 113 | "version": "==0.6.0" 114 | }, 115 | "py4j": { 116 | "hashes": [ 117 | "sha256:721189616b3a7d28212dfb2e7c6a1dd5147b03105f1fc37ff2432acd0e863fa5", 118 | "sha256:a950fe7de1bfd247a0a4dddb9118f332d22a89e01e0699135ea8038c15ee1293" 119 | ], 120 | "version": "==0.10.7" 121 | }, 122 | "pycodestyle": { 123 | "hashes": [ 124 | "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56", 125 | "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c" 126 | ], 127 | "version": "==2.5.0" 128 | }, 129 | "pyflakes": { 130 | "hashes": [ 131 | "sha256:5e8c00e30c464c99e0b501dc160b13a14af7f27d4dffb529c556e30a159e231d", 132 | "sha256:f277f9ca3e55de669fba45b7393a1449009cff5a37d1af10ebb76c52765269cd" 133 | ], 134 | "version": "==2.1.0" 135 | }, 136 | "pygments": { 137 | "hashes": [ 138 | "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a", 139 | "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d" 140 | ], 141 | "version": "==2.3.1" 142 | }, 143 | "pyspark": { 144 | "hashes": [ 145 | "sha256:c9d7b7c5e91b13488b657e364ff392a80b2e374b182138e5ec8702a1822bffdc" 146 | ], 147 | "index": "pypi", 148 | "version": "==2.4.0" 149 | }, 150 | "six": { 151 | "hashes": [ 152 | "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c", 153 | "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73" 154 | ], 155 | "version": "==1.12.0" 156 | }, 157 | "traitlets": { 158 | "hashes": [ 159 | "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835", 160 | "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9" 161 | ], 162 | "version": "==4.3.2" 163 | }, 164 | "wcwidth": { 165 | "hashes": [ 166 | "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", 167 | "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" 168 | ], 169 | "version": 
"==0.1.7" 170 | } 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySpark Example Project 2 | 3 | This document is designed to be read in parallel with the code in the `pyspark-template-project` repository. Together, these constitute what we consider to be a 'best practices' approach to writing ETL jobs using Apache Spark and its Python ('PySpark') APIs. This project addresses the following topics: 4 | 5 | - how to structure ETL code in such a way that it can be easily tested and debugged; 6 | - how to pass configuration parameters to a PySpark job; 7 | - how to handle dependencies on other modules and packages; and, 8 | - what constitutes a 'meaningful' test for an ETL job. 9 | 10 | ## ETL Project Structure 11 | 12 | The basic project structure is as follows: 13 | 14 | ```bash 15 | root/ 16 | |-- configs/ 17 | | |-- etl_config.json 18 | |-- dependencies/ 19 | | |-- logging.py 20 | | |-- spark.py 21 | |-- jobs/ 22 | | |-- etl_job.py 23 | |-- tests/ 24 | | |-- test_data/ 25 | | |-- | -- employees/ 26 | | |-- | -- employees_report/ 27 | | |-- test_etl_job.py 28 | | build_dependencies.sh 29 | | packages.zip 30 | | Pipfile 31 | | Pipfile.lock 32 | ``` 33 | 34 | The main Python module containing the ETL job (which will be sent to the Spark cluster), is `jobs/etl_job.py`. Any external configuration parameters required by `etl_job.py` are stored in JSON format in `configs/etl_config.json`. Additional modules that support this job can be kept in the `dependencies` folder (more on this later). In the project's root we include `build_dependencies.sh`, which is a bash script for building these dependencies into a zip-file to be sent to the cluster (`packages.zip`). Unit test modules are kept in the `tests` folder and small chunks of representative input and output data, to be used with the tests, are kept in `tests/test_data` folder. 35 | 36 | ## Structure of an ETL Job 37 | 38 | In order to facilitate easy debugging and testing, we recommend that the 'Transformation' step be isolated from the 'Extract' and 'Load' steps, into its own function - taking input data arguments in the form of DataFrames and returning the transformed data as a single DataFrame. Then, the code that surrounds the use of the transformation function in the `main()` job function, is concerned with Extracting the data, passing it to the transformation function and then Loading (or writing) the results to their ultimate destination. Testing is simplified, as mock or test data can be passed to the transformation function and the results explicitly verified, which would not be possible if all of the ETL code resided in `main()` and referenced production data sources and destinations. 39 | 40 | More generally, transformation functions should be designed to be _idempotent_. This is a technical way of saying that the repeated application of the transformation function should have no impact on the fundamental state of output data, until the moment the input data changes. One of the key advantages of idempotent ETL jobs, is that they can be set to run repeatedly (e.g. by using `cron` to trigger the `spark-submit` command above, on a pre-defined schedule), rather than having to factor-in potential dependencies on other ETL jobs completing successfully. 
41 | 42 | ## Passing Configuration Parameters to the ETL Job 43 | 44 | Although it is possible to pass arguments to `etl_job.py`, as you would for any generic Python module running as a 'main' program - by specifying them after the module's filename and then parsing these command line arguments - this can get very complicated, very quickly, especially when there are a lot of parameters (e.g. credentials for multiple databases, table names, SQL snippets, etc.). This also makes debugging the code from within a Python interpreter extremely awkward, as you don't have access to the command line arguments that would ordinarily be passed to the code when calling it from the command line. 45 | 46 | A much more effective solution is to send Spark a separate file - e.g. using the `--files configs/etl_config.json` flag with `spark-submit` - containing the configuration in JSON format, which can be parsed into a Python dictionary in one line of code with `json.loads(config_file_contents)`. Testing the code from within a Python interactive console session is also greatly simplified, as all one has to do to access configuration parameters for testing is to copy and paste the contents of the file - e.g., 47 | 48 | ```python 49 | import json 50 | 51 | config = json.loads("""{"field": "value"}""") 52 | ``` 53 | 54 | For the exact details of how the configuration file is located, opened and parsed, please see the `start_spark()` function in `dependencies/spark.py` (also discussed further below), which, in addition to parsing the configuration file sent to Spark (and returning it as a Python dictionary), also launches the Spark driver program (the application) on the cluster and retrieves the Spark logger at the same time. 55 | 56 | ## Packaging ETL Job Dependencies 57 | 58 | In this project, functions that can be used across different ETL jobs are kept in a module called `dependencies` and referenced in specific job modules using, for example, 59 | 60 | ```python 61 | from dependencies.spark import start_spark 62 | ``` 63 | 64 | This package, together with any additional dependencies referenced within it, must be copied to each Spark node for all jobs that use `dependencies` to run. This can be achieved in one of several ways: 65 | 66 | 1. send all dependencies as a `zip` archive together with the job, using the `--py-files` flag with `spark-submit`; 67 | 2. formally package and upload `dependencies` to somewhere like PyPI (or a private package index) and then run `pip3 install dependencies` on each node; or, 68 | 3. a combination of manually copying new modules (e.g. `dependencies`) to the Python path of each node and using `pip3 install` for additional dependencies (e.g. for `requests`). 69 | 70 | Option (1) is by far the easiest and most flexible approach, so we will make use of this for now. To make this task easier, especially when modules such as `dependencies` have additional dependencies (e.g. the `requests` package), we have provided the `build_dependencies.sh` bash script for automating the production of `packages.zip`, given a list of dependencies documented in `Pipfile` and managed by the `pipenv` Python application (discussed below).
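Rebuilding the archive is then a single command, run from the project's root directory (the script requires `pipenv` to be installed - see the Pipenv section below),

```bash
bash build_dependencies.sh
```

The script exports the dependencies pinned in `Pipfile.lock` to a temporary `requirements.txt`, installs them into a temporary `packages` directory, zips that directory into `packages.zip` (adding the local `dependencies` module to the archive as well), and then removes the temporary files.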
71 | 72 | ## Running the ETL job 73 | 74 | Assuming that the `$SPARK_HOME` environment variable points to your local Spark installation folder, then the ETL job can be run from the project's root directory using the following command from the terminal, 75 | 76 | ```bash 77 | $SPARK_HOME/bin/spark-submit \ 78 | --master local[*] \ 79 | --packages 'com.somesparkjar.dependency:1.0.0' \ 80 | --py-files packages.zip \ 81 | --files configs/etl_config.json \ 82 | jobs/etl_job.py 83 | ``` 84 | 85 | Briefly, the options supplied serve the following purposes: 86 | 87 | - `--master local[*]` - the address of the Spark cluster to start the job on. If you have a Spark cluster in operation (either in single-executor mode locally, or something larger in the cloud) and want to send the job there, then modify this with the appropriate Spark IP - e.g. `spark://the-clusters-ip-address:7077`; 88 | - `--packages 'com.somesparkjar.dependency:1.0.0,...'` - Maven coordinates for any JAR dependencies required by the job (e.g. JDBC driver for connecting to a relational database); 89 | - `--files configs/etl_config.json` - the (optional) path to any config file that may be required by the ETL job; 90 | - `--py-files packages.zip` - archive containing Python dependencies (modules) referenced by the job; and, 91 | - `jobs/etl_job.py` - the Python module file containing the ETL job to execute. 92 | 93 | Full details of all possible options can be found [here](http://spark.apache.org/docs/latest/submitting-applications.html). Note, that we have left some options to be defined within the job (which is actually a Spark application) - e.g. `spark.cores.max` and `spark.executor.memory` are defined in the Python script as it is felt that the job should explicitly contain the requests for the required cluster resources. 94 | 95 | ## Debugging Spark Jobs Using `start_spark` 96 | 97 | It is not practical to test and debug Spark jobs by sending them to a cluster using `spark-submit` and examining stack traces for clues on what went wrong. A more productive workflow is to use an interactive console session (e.g. IPython) or a debugger (e.g. the `pdb` package in the Python standard library or the Python debugger in Visual Studio Code). In practice, however, it can be hard to test and debug Spark jobs in this way, as they implicitly rely on arguments that are sent to `spark-submit`, which are not available in a console or debug session. 98 | 99 | We wrote the `start_spark` function - found in `dependencies/spark.py` - to facilitate the development of Spark jobs that are aware of the context in which they are being executed - i.e. as `spark-submit` jobs or within an IPython console, etc. The expected location of the Spark and job configuration parameters required by the job, is contingent on which execution context has been detected. The docstring for `start_spark` gives the precise details, 100 | 101 | ```python 102 | def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], 103 | files=[], spark_config={}): 104 | """Start Spark session, get Spark logger and load config files. 105 | 106 | Start a Spark session on the worker node and register the Spark 107 | application with the cluster. Note, that only the app_name argument 108 | will apply when this is called from a script sent to spark-submit. 109 | All other arguments exist solely for testing the script from within 110 | an interactive Python console. 
111 | 112 | This function also looks for a file ending in 'config.json' that 113 | can be sent with the Spark job. If it is found, it is opened, 114 | the contents parsed (assuming it contains valid JSON for the ETL job 115 | configuration) into a dict of ETL job configuration parameters, 116 | which are returned as the last element in the tuple returned by 117 | this function. If the file cannot be found then the return tuple 118 | only contains the Spark session and Spark logger objects and None 119 | for config. 120 | 121 | The function checks the enclosing environment to see if it is being 122 | run from inside an interactive console session or from an 123 | environment which has a `DEBUG` environment variable set (e.g. 124 | setting `DEBUG=1` as an environment variable as part of a debug 125 | configuration within an IDE such as Visual Studio Code or PyCharm. 126 | In this scenario, the function uses all available function arguments 127 | to start a PySpark driver from the local PySpark package as opposed 128 | to using the spark-submit and Spark cluster defaults. This will also 129 | use local module imports, as opposed to those in the zip archive 130 | sent to spark via the --py-files flag in spark-submit. 131 | 132 | :param app_name: Name of Spark app. 133 | :param master: Cluster connection details (defaults to local[*]). 134 | :param jar_packages: List of Spark JAR package names. 135 | :param files: List of files to send to Spark cluster (master and 136 | workers). 137 | :param spark_config: Dictionary of config key-value pairs. 138 | :return: A tuple of references to the Spark session, logger and 139 | config dict (only if available). 140 | """ 141 | 142 | # ... 143 | 144 | return spark_sess, spark_logger, config_dict 145 | ``` 146 | 147 | For example, the following code snippet, 148 | 149 | ```python 150 | spark, log, config = start_spark( 151 | app_name='my_etl_job', 152 | jar_packages=['com.somesparkjar.dependency:1.0.0'], 153 | files=['configs/etl_config.json']) 154 | ``` 155 | 156 | Will use the arguments provided to `start_spark` to setup the Spark job if executed from an interactive console session or debugger, but will look for the same arguments sent via `spark-submit` if that is how the job has been executed. 157 | 158 | ## Automated Testing 159 | 160 | In order to test with Spark, we use the `pyspark` Python package, which is bundled with the Spark JARs required to programmatically start-up and tear-down a local Spark instance, on a per-test-suite basis (we recommend using the `setUp` and `tearDown` methods in `unittest.TestCase` to do this once per test-suite). Note, that using `pyspark` to run Spark is an alternative way of developing with Spark as opposed to using the PySpark shell or `spark-submit`. 161 | 162 | Given that we have chosen to structure our ETL jobs in such a way as to isolate the 'Transformation' step into its own function (see 'Structure of an ETL job' above), we are free to feed it a small slice of 'real-world' production data that has been persisted locally - e.g. in `tests/test_data` or some easily accessible network directory - and check it against known results (e.g. computed manually or interactively within a Python interactive console session). 163 | 164 | To execute the example unit test for this project run, 165 | 166 | ```bash 167 | pipenv run python -m unittest tests/test_*.py 168 | ``` 169 | 170 | If you're wondering what the `pipenv` command is, then read the next section. 
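For reference, the overall shape of a test suite that manages its own Spark session in this way - `tests/test_etl_job.py` being the concrete example - is sketched below in simplified form,

```python
import unittest

from dependencies.spark import start_spark


class SparkETLTests(unittest.TestCase):
    """Skeleton test suite for the transformation step."""

    def setUp(self):
        # start a local Spark session before each test
        self.spark, *_ = start_spark()

    def tearDown(self):
        # stop the session to release local resources
        self.spark.stop()

    def test_transform_data(self):
        # read test input from tests/test_data, apply the transformation
        # and compare the result against the persisted expected output
        ...
```

Keeping the Spark lifecycle in `setUp`/`tearDown` means each test method gets a working session without any reliance on `spark-submit`.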
171 | 172 | ## Managing Project Dependencies using Pipenv 173 | 174 | We use [pipenv](https://docs.pipenv.org) for managing project dependencies and Python environments (i.e. virtual environments). All direct package dependencies (e.g. NumPy may be used in a User Defined Function), as well as all the packages used during development (e.g. PySpark, flake8 for code linting, IPython for interactive console sessions, etc.), are described in the `Pipfile`. Their **precise** downstream dependencies are described in `Pipfile.lock`. 175 | 176 | ### Installing Pipenv 177 | 178 | To get started with Pipenv, first of all install it - assuming that there is a global version of Python available on your system and on the PATH, this can be achieved by running the following command, 179 | 180 | ```bash 181 | pip3 install pipenv 182 | ``` 183 | 184 | Pipenv is also available to install from many non-Python package managers. For example, on OS X it can be installed using the [Homebrew](https://brew.sh) package manager, with the following terminal command, 185 | 186 | ```bash 187 | brew install pipenv 188 | ``` 189 | 190 | For more information, including advanced configuration options, see the [official pipenv documentation](https://docs.pipenv.org). 191 | 192 | ### Installing this Project's Dependencies 193 | 194 | Make sure that you're in the project's root directory (the same one in which the `Pipfile` resides), and then run, 195 | 196 | ```bash 197 | pipenv install --dev 198 | ``` 199 | 200 | This will install all of the direct project dependencies as well as the development dependencies (the latter a consequence of the `--dev` flag). 201 | 202 | ### Running Python and IPython from the Project's Virtual Environment 203 | 204 | In order to continue development in a Python environment that precisely mimics the one the project was initially developed with, use Pipenv from the command line as follows, 205 | 206 | ```bash 207 | pipenv run python3 208 | ``` 209 | 210 | The `python3` command could just as well be `ipython`, for example, 211 | 212 | ```bash 213 | pipenv run ipython 214 | ``` 215 | 216 | This will fire up an IPython console session *where the default Python 3 kernel includes all of the direct and development project dependencies* - this is our preference. 217 | 218 | ### Pipenv Shells 219 | 220 | Prepending `pipenv` to every command you want to run within the context of your Pipenv-managed virtual environment can get very tedious. This can be avoided by entering into a Pipenv-managed shell, 221 | 222 | ```bash 223 | pipenv shell 224 | ``` 225 | 226 | This is equivalent to 'activating' the virtual environment; any command will now be executed within the virtual environment. Use `exit` to leave the shell session. 227 | 228 | ### Automatic Loading of Environment Variables 229 | 230 | Pipenv will automatically pick up and load any environment variables declared in the `.env` file, located in the package's root directory. For example, adding, 231 | 232 | ```bash 233 | SPARK_HOME=applications/spark-2.3.1/bin 234 | DEBUG=1 235 | ``` 236 | 237 | Will enable access to these variables within any Python program - e.g. via a call to `os.environ['SPARK_HOME']`. Note, that if any security credentials are placed here, then this file **must** be removed from source control - i.e. add `.env` to the `.gitignore` file to prevent potential security risks.
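For example, with the `.env` file above in place, any `pipenv run` or `pipenv shell` session exposes these values as ordinary environment variables,

```python
import os

os.environ['SPARK_HOME']  # -> 'applications/spark-2.3.1/bin'
'DEBUG' in os.environ     # -> True, so start_spark() will use its own arguments
```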
-------------------------------------------------------------------------------- /build_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # check to see if pipenv is installed 4 | if [ -x "$(which pipenv)" ] 5 | then 6 | # check that Pipfile.lock exists in root directory 7 | if [ ! -e Pipfile.lock ] 8 | then 9 | echo 'ERROR - cannot find Pipfile.lock' 10 | exit 1 11 | fi 12 | 13 | # use Pipenv to create a requirements.txt file 14 | echo '... creating requirements.txt from Pipfile.lock' 15 | pipenv lock -r > requirements.txt 16 | 17 | # install packages to a temporary directory and zip it 18 | touch requirements.txt # safeguard in case there are no packages 19 | pip3 install -r requirements.txt --target ./packages 20 | 21 | # check to see if there are any external dependencies 22 | # if not then create an empty file to seed zip with 23 | if [ -z "$(ls -A packages)" ] 24 | then 25 | touch packages/empty.txt 26 | fi 27 | 28 | # zip dependencies 29 | if [ ! -d packages ] 30 | then 31 | echo 'ERROR - pip failed to import dependencies' 32 | exit 1 33 | fi 34 | 35 | cd packages 36 | zip -9mrv packages.zip . 37 | mv packages.zip .. 38 | cd .. 39 | 40 | # remove temporary directory and requirements.txt 41 | rm -rf packages 42 | rm requirements.txt 43 | 44 | # add local modules 45 | echo '... adding all modules from local utils package' 46 | zip -ru9 packages.zip dependencies -x dependencies/__pycache__/\* 47 | 48 | exit 0 49 | else 50 | echo 'ERROR - pipenv is not installed --> run `pip3 install pipenv` to load pipenv into global site packages or install via a system package manager.' 51 | exit 1 52 | fi 53 | -------------------------------------------------------------------------------- /configs/etl_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "steps_per_floor": 21 3 | } -------------------------------------------------------------------------------- /dependencies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexIoannides/pyspark-example-project/eeee0c2b9af79fdd7c5d86fe56466c147b487e26/dependencies/__init__.py -------------------------------------------------------------------------------- /dependencies/logging.py: -------------------------------------------------------------------------------- 1 | """ 2 | logging 3 | ~~~~~~~ 4 | 5 | This module contains a class that wraps the log4j object instantiated 6 | by the active SparkContext, enabling Log4j logging for PySpark. 7 | """ 8 | 9 | 10 | class Log4j(object): 11 | """Wrapper class for Log4j JVM object. 12 | 13 | :param spark: SparkSession object. 14 | """ 15 | 16 | def __init__(self, spark): 17 | # get spark app details with which to prefix all messages 18 | conf = spark.sparkContext.getConf() 19 | app_id = conf.get('spark.app.id') 20 | app_name = conf.get('spark.app.name') 21 | 22 | log4j = spark._jvm.org.apache.log4j 23 | message_prefix = '<' + app_name + ' ' + app_id + '>' 24 | self.logger = log4j.LogManager.getLogger(message_prefix) 25 | 26 | def error(self, message): 27 | """Log an error. 28 | 29 | :param: Error message to write to log 30 | :return: None 31 | """ 32 | self.logger.error(message) 33 | return None 34 | 35 | def warn(self, message): 36 | """Log a warning.
37 | 38 | :param: Warning message to write to log 39 | :return: None 40 | """ 41 | self.logger.warn(message) 42 | return None 43 | 44 | def info(self, message): 45 | """Log information. 46 | 47 | :param: Information message to write to log 48 | :return: None 49 | """ 50 | self.logger.info(message) 51 | return None 52 | -------------------------------------------------------------------------------- /dependencies/spark.py: -------------------------------------------------------------------------------- 1 | """ 2 | spark.py 3 | ~~~~~~~~ 4 | 5 | Module containing helper function for use with Apache Spark 6 | """ 7 | 8 | import __main__ 9 | 10 | from os import environ, listdir, path 11 | import json 12 | from pyspark import SparkFiles 13 | from pyspark.sql import SparkSession 14 | 15 | from dependencies import logging 16 | 17 | 18 | def start_spark(app_name='my_spark_app', master='local[*]', jar_packages=[], 19 | files=[], spark_config={}): 20 | """Start Spark session, get Spark logger and load config files. 21 | 22 | Start a Spark session on the worker node and register the Spark 23 | application with the cluster. Note, that only the app_name argument 24 | will apply when this is called from a script sent to spark-submit. 25 | All other arguments exist solely for testing the script from within 26 | an interactive Python console. 27 | 28 | This function also looks for a file ending in 'config.json' that 29 | can be sent with the Spark job. If it is found, it is opened, 30 | the contents parsed (assuming it contains valid JSON for the ETL job 31 | configuration) into a dict of ETL job configuration parameters, 32 | which are returned as the last element in the tuple returned by 33 | this function. If the file cannot be found then the return tuple 34 | only contains the Spark session and Spark logger objects and None 35 | for config. 36 | 37 | The function checks the enclosing environment to see if it is being 38 | run from inside an interactive console session or from an 39 | environment which has a `DEBUG` environment variable set (e.g. 40 | setting `DEBUG=1` as an environment variable as part of a debug 41 | configuration within an IDE such as Visual Studio Code or PyCharm. 42 | In this scenario, the function uses all available function arguments 43 | to start a PySpark driver from the local PySpark package as opposed 44 | to using the spark-submit and Spark cluster defaults. This will also 45 | use local module imports, as opposed to those in the zip archive 46 | sent to spark via the --py-files flag in spark-submit. 47 | 48 | :param app_name: Name of Spark app. 49 | :param master: Cluster connection details (defaults to local[*]). 50 | :param jar_packages: List of Spark JAR package names. 51 | :param files: List of files to send to Spark cluster (master and 52 | workers). 53 | :param spark_config: Dictionary of config key-value pairs. 54 | :return: A tuple of references to the Spark session, logger and 55 | config dict (only if available). 
56 | """ 57 | 58 | # detect execution environment 59 | flag_repl = not(hasattr(__main__, '__file__')) 60 | flag_debug = 'DEBUG' in environ.keys() 61 | 62 | if not (flag_repl or flag_debug): 63 | # get Spark session factory 64 | spark_builder = ( 65 | SparkSession 66 | .builder 67 | .appName(app_name)) 68 | else: 69 | # get Spark session factory 70 | spark_builder = ( 71 | SparkSession 72 | .builder 73 | .master(master) 74 | .appName(app_name)) 75 | 76 | # create Spark JAR packages string 77 | spark_jars_packages = ','.join(list(jar_packages)) 78 | spark_builder.config('spark.jars.packages', spark_jars_packages) 79 | 80 | spark_files = ','.join(list(files)) 81 | spark_builder.config('spark.files', spark_files) 82 | 83 | # add other config params 84 | for key, val in spark_config.items(): 85 | spark_builder.config(key, val) 86 | 87 | # create session and retrieve Spark logger object 88 | spark_sess = spark_builder.getOrCreate() 89 | spark_logger = logging.Log4j(spark_sess) 90 | 91 | # get config file if sent to cluster with --files 92 | spark_files_dir = SparkFiles.getRootDirectory() 93 | config_files = [filename 94 | for filename in listdir(spark_files_dir) 95 | if filename.endswith('config.json')] 96 | 97 | if config_files: 98 | path_to_config_file = path.join(spark_files_dir, config_files[0]) 99 | with open(path_to_config_file, 'r') as config_file: 100 | config_dict = json.load(config_file) 101 | spark_logger.warn('loaded config from ' + config_files[0]) 102 | else: 103 | spark_logger.warn('no config file found') 104 | config_dict = None 105 | 106 | return spark_sess, spark_logger, config_dict 107 | -------------------------------------------------------------------------------- /jobs/etl_job.py: -------------------------------------------------------------------------------- 1 | """ 2 | etl_job.py 3 | ~~~~~~~~~~ 4 | 5 | This Python module contains an example Apache Spark ETL job definition 6 | that implements best practices for production ETL jobs. It can be 7 | submitted to a Spark cluster (or locally) using the 'spark-submit' 8 | command found in the '/bin' directory of all Spark distributions 9 | (necessary for running any Spark job, locally or otherwise). For 10 | example, this example script can be executed as follows, 11 | 12 | $SPARK_HOME/bin/spark-submit \ 13 | --master spark://localhost:7077 \ 14 | --py-files packages.zip \ 15 | --files configs/etl_config.json \ 16 | jobs/etl_job.py 17 | 18 | where packages.zip contains Python modules required by ETL job (in 19 | this example it contains a class to provide access to Spark's logger), 20 | which need to be made available to each executor process on every node 21 | in the cluster; etl_config.json is a text file sent to the cluster, 22 | containing a JSON object with all of the configuration parameters 23 | required by the ETL job; and, etl_job.py contains the Spark application 24 | to be executed by a driver process on the Spark master node. 25 | 26 | For more details on submitting Spark applications, please see here: 27 | http://spark.apache.org/docs/latest/submitting-applications.html 28 | 29 | Our chosen approach for structuring jobs is to separate the individual 30 | 'units' of ETL - the Extract, Transform and Load parts - into dedicated 31 | functions, such that the key Transform steps can be covered by tests 32 | and jobs or called from within another environment (e.g. a Jupyter or 33 | Zeppelin notebook). 
34 | """ 35 | 36 | from pyspark.sql import Row 37 | from pyspark.sql.functions import col, concat_ws, lit 38 | 39 | from dependencies.spark import start_spark 40 | 41 | 42 | def main(): 43 | """Main ETL script definition. 44 | 45 | :return: None 46 | """ 47 | # start Spark application and get Spark session, logger and config 48 | spark, log, config = start_spark( 49 | app_name='my_etl_job', 50 | files=['configs/etl_config.json']) 51 | 52 | # log that main ETL job is starting 53 | log.warn('etl_job is up-and-running') 54 | 55 | # execute ETL pipeline 56 | data = extract_data(spark) 57 | data_transformed = transform_data(data, config['steps_per_floor']) 58 | load_data(data_transformed) 59 | 60 | # log the success and terminate Spark application 61 | log.warn('etl_job is finished') 62 | spark.stop() 63 | return None 64 | 65 | 66 | def extract_data(spark): 67 | """Load data from Parquet file format. 68 | 69 | :param spark: Spark session object. 70 | :return: Spark DataFrame. 71 | """ 72 | df = ( 73 | spark 74 | .read 75 | .parquet('tests/test_data/employees')) 76 | 77 | return df 78 | 79 | 80 | def transform_data(df, steps_per_floor_): 81 | """Transform original dataset. 82 | 83 | :param df: Input DataFrame. 84 | :param steps_per_floor_: The number of steps per-floor at 43 Tanner 85 | Street. 86 | :return: Transformed DataFrame. 87 | """ 88 | df_transformed = ( 89 | df 90 | .select( 91 | col('id'), 92 | concat_ws( 93 | ' ', 94 | col('first_name'), 95 | col('second_name')).alias('name'), 96 | (col('floor') * lit(steps_per_floor_)).alias('steps_to_desk'))) 97 | 98 | return df_transformed 99 | 100 | 101 | def load_data(df): 102 | """Collect data locally and write to CSV. 103 | 104 | :param df: DataFrame to write. 105 | :return: None 106 | """ 107 | (df 108 | .coalesce(1) 109 | .write 110 | .csv('loaded_data', mode='overwrite', header=True)) 111 | return None 112 | 113 | 114 | def create_test_data(spark, config): 115 | """Create test data. 116 | 117 | This function creates both pre- and post-transformation data 118 | saved as Parquet files in tests/test_data. This will be used for 119 | unit tests as well as to load as part of the example ETL job.
120 | :return: None 121 | """ 122 | # create example data from scratch 123 | local_records = [ 124 | Row(id=1, first_name='Dan', second_name='Germain', floor=1), 125 | Row(id=2, first_name='Dan', second_name='Sommerville', floor=1), 126 | Row(id=3, first_name='Alex', second_name='Ioannides', floor=2), 127 | Row(id=4, first_name='Ken', second_name='Lai', floor=2), 128 | Row(id=5, first_name='Stu', second_name='White', floor=3), 129 | Row(id=6, first_name='Mark', second_name='Sweeting', floor=3), 130 | Row(id=7, first_name='Phil', second_name='Bird', floor=4), 131 | Row(id=8, first_name='Kim', second_name='Suter', floor=4) 132 | ] 133 | 134 | df = spark.createDataFrame(local_records) 135 | 136 | # write to Parquet file format 137 | (df 138 | .coalesce(1) 139 | .write 140 | .parquet('tests/test_data/employees', mode='overwrite')) 141 | 142 | # create transformed version of data 143 | df_tf = transform_data(df, config['steps_per_floor']) 144 | 145 | # write transformed version of data to Parquet 146 | (df_tf 147 | .coalesce(1) 148 | .write 149 | .parquet('tests/test_data/employees_report', mode='overwrite')) 150 | 151 | return None 152 | 153 | 154 | # entry point for PySpark ETL application 155 | if __name__ == '__main__': 156 | main() 157 | -------------------------------------------------------------------------------- /packages.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexIoannides/pyspark-example-project/eeee0c2b9af79fdd7c5d86fe56466c147b487e26/packages.zip -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexIoannides/pyspark-example-project/eeee0c2b9af79fdd7c5d86fe56466c147b487e26/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_data/employees/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /tests/test_data/employees/.part-00000-9abf32a3-db43-42e1-9639-363ef11c0d1c-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexIoannides/pyspark-example-project/eeee0c2b9af79fdd7c5d86fe56466c147b487e26/tests/test_data/employees/.part-00000-9abf32a3-db43-42e1-9639-363ef11c0d1c-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /tests/test_data/employees/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexIoannides/pyspark-example-project/eeee0c2b9af79fdd7c5d86fe56466c147b487e26/tests/test_data/employees/_SUCCESS -------------------------------------------------------------------------------- /tests/test_data/employees/part-00000-9abf32a3-db43-42e1-9639-363ef11c0d1c-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexIoannides/pyspark-example-project/eeee0c2b9af79fdd7c5d86fe56466c147b487e26/tests/test_data/employees/part-00000-9abf32a3-db43-42e1-9639-363ef11c0d1c-c000.snappy.parquet -------------------------------------------------------------------------------- /tests/test_data/employees_report/._SUCCESS.crc: 
-------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /tests/test_data/employees_report/.part-00000-4a609ba3-0404-48bb-bb22-2fec3e2f1e68-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexIoannides/pyspark-example-project/eeee0c2b9af79fdd7c5d86fe56466c147b487e26/tests/test_data/employees_report/.part-00000-4a609ba3-0404-48bb-bb22-2fec3e2f1e68-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /tests/test_data/employees_report/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexIoannides/pyspark-example-project/eeee0c2b9af79fdd7c5d86fe56466c147b487e26/tests/test_data/employees_report/_SUCCESS -------------------------------------------------------------------------------- /tests/test_data/employees_report/part-00000-4a609ba3-0404-48bb-bb22-2fec3e2f1e68-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexIoannides/pyspark-example-project/eeee0c2b9af79fdd7c5d86fe56466c147b487e26/tests/test_data/employees_report/part-00000-4a609ba3-0404-48bb-bb22-2fec3e2f1e68-c000.snappy.parquet -------------------------------------------------------------------------------- /tests/test_etl_job.py: -------------------------------------------------------------------------------- 1 | """ 2 | test_etl_job.py 3 | ~~~~~~~~~~~~~~~ 4 | 5 | This module contains unit tests for the transformation steps of the ETL 6 | job defined in etl_job.py. It makes use of a local version of PySpark 7 | that is bundled with the PySpark package. 8 | """ 9 | import unittest 10 | 11 | import json 12 | 13 | from pyspark.sql.functions import mean 14 | 15 | from dependencies.spark import start_spark 16 | from jobs.etl_job import transform_data 17 | 18 | 19 | class SparkETLTests(unittest.TestCase): 20 | """Test suite for transformation in etl_job.py 21 | """ 22 | 23 | def setUp(self): 24 | """Start Spark, define config and path to test data 25 | """ 26 | self.config = json.loads("""{"steps_per_floor": 21}""") 27 | self.spark, *_ = start_spark() 28 | self.test_data_path = 'tests/test_data/' 29 | 30 | def tearDown(self): 31 | """Stop Spark 32 | """ 33 | self.spark.stop() 34 | 35 | def test_transform_data(self): 36 | """Test data transformer. 37 | 38 | Using small chunks of input data and expected output data, we 39 | test the transformation step to make sure it's working as 40 | expected. 
41 | """ 42 | # assemble 43 | input_data = ( 44 | self.spark 45 | .read 46 | .parquet(self.test_data_path + 'employees')) 47 | 48 | expected_data = ( 49 | self.spark 50 | .read 51 | .parquet(self.test_data_path + 'employees_report')) 52 | 53 | expected_cols = len(expected_data.columns) 54 | expected_rows = expected_data.count() 55 | expected_avg_steps = ( 56 | expected_data 57 | .agg(mean('steps_to_desk').alias('avg_steps_to_desk')) 58 | .collect()[0] 59 | ['avg_steps_to_desk']) 60 | 61 | # act 62 | data_transformed = transform_data(input_data, self.config['steps_per_floor']) 63 | 64 | cols = len(data_transformed.columns) 65 | rows = data_transformed.count() 66 | avg_steps = ( 67 | data_transformed 68 | .agg(mean('steps_to_desk').alias('avg_steps_to_desk')) 69 | .collect()[0] 70 | ['avg_steps_to_desk']) 71 | 72 | # assert 73 | self.assertEqual(expected_cols, cols) 74 | self.assertEqual(expected_rows, rows) 75 | self.assertEqual(expected_avg_steps, avg_steps) 76 | self.assertTrue(all([col in expected_data.columns 77 | for col in data_transformed.columns])) 78 | 79 | 80 | if __name__ == '__main__': 81 | unittest.main() 82 | --------------------------------------------------------------------------------