├── .gitignore
├── LICENSE
├── README.md
├── app
│   ├── Makefile
│   ├── config.json
│   ├── jobs
│   │   ├── __init__.py
│   │   ├── movie_genres.py
│   │   └── movies.py
│   ├── main.py
│   ├── requirements.txt
│   ├── shared
│   │   ├── __init__.py
│   │   └── udfs.py
│   └── tests
│       ├── pytest.ini
│       ├── test_movies.py
│       └── test_udfs.py
├── get_data.py
└── requirements.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 pchrabka

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pySpark-pyData

This is an example PySpark application created for a PyData Warsaw 2019 talk.

The application uses the MovieLens dataset as its source data. The data can be
downloaded from https://grouplens.org/datasets/movielens/ or with the
`get_data.py` script included in this repository.

App development steps:

- [v1.0](https://github.com/pchrabka/pySpark-pyData/tree/v1.0) - Initial version
- [v2.0](https://github.com/pchrabka/pySpark-pyData/tree/v2.0) - Added config file
- [v3.0](https://github.com/pchrabka/pySpark-pyData/tree/v3.0) - Added main.py
- [v4.0](https://github.com/pchrabka/pySpark-pyData/tree/v4.0) - Added Makefile
- [v5.0](https://github.com/pchrabka/pySpark-pyData/tree/v5.0) - Added UDFs
- [v6.0](https://github.com/pchrabka/pySpark-pyData/tree/v6.0) - Added third-party dependency
- [v7.0](https://github.com/pchrabka/pySpark-pyData/tree/v7.0) - Added tests
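To see the moving parts end to end: `make build` in `app/` assembles a `dist/` directory containing `main.py`, `config.json`, the zipped `jobs` and `shared` packages, and the third-party libs, which can then be submitted with something like `spark-submit --py-files jobs.zip,shared.zip,libs.zip main.py --job movies` from inside `dist/` (cluster-specific flags vary). For quick local experiments the job modules can also be driven directly; a minimal sketch, assuming it runs from inside `app/` with the MovieLens data available at the `source_data_path` configured in `config.json`:

```python
import json

from pyspark.sql import SparkSession

from jobs import movies  # or jobs.movie_genres

# The same steps main.py performs, inlined for a single job.
with open("config.json") as config_file:
    config = json.load(config_file)

spark = (
    SparkSession.builder.master("local[*]")
    .appName(config.get("app_name"))
    .getOrCreate()
)
movies.run_job(spark, config)
```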
--------------------------------------------------------------------------------
/app/Makefile:
--------------------------------------------------------------------------------
build:
	rm -rf ./dist && mkdir ./dist
	cp ./main.py ./dist
	cp ./config.json ./dist
	zip -r dist/jobs.zip jobs
	zip -r dist/shared.zip shared
	docker run --rm -v $(PWD):/foo -w /foo lambci/lambda:build-python3.7 \
		pip install -r requirements.txt -t ./dist/libs
	cd ./dist/libs && zip -r -D ../libs.zip .

--------------------------------------------------------------------------------
/app/config.json:
--------------------------------------------------------------------------------
{
  "app_name": "MoviesETL",
  "source_data_path": "data/ml-latest-small",
  "output_data_path": "data/output"
}

--------------------------------------------------------------------------------
/app/jobs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pchrabka/PySpark-PyData/d71064804982db2353543b0051d802d9765a0680/app/jobs/__init__.py

--------------------------------------------------------------------------------
/app/jobs/movie_genres.py:
--------------------------------------------------------------------------------
from pyspark.sql.functions import col, split, explode


def _extract_data(spark, config):
    """ Load data from csv file """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .load(f"{config.get('source_data_path')}/movies.csv")
    )


def _transform_data(raw_df):
    """ Transform raw dataframe """
    return raw_df.select(
        col("movieId"), explode(split(col("genres"), "\\|")).alias("genre")
    )


def _load_data(config, transformed_df):
    """ Save data to parquet file """
    transformed_df.write.mode("overwrite").parquet(
        f"{config.get('output_data_path')}/movie_genres"
    )


def run_job(spark, config):
    """ Run movie_genres job """
    _load_data(config, _transform_data(_extract_data(spark, config)))

--------------------------------------------------------------------------------
/app/jobs/movies.py:
--------------------------------------------------------------------------------
from pyspark.sql.functions import col
from shared.udfs import get_movie_title_udf, get_movie_year_udf


def _extract_data(spark, config):
    """ Load data from csv file """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .load(f"{config.get('source_data_path')}/movies.csv")
    )


def _transform_data(raw_df):
    """ Transform raw dataframe """
    return raw_df.select(
        col("movieId"),
        get_movie_title_udf("title").alias("title"),
        get_movie_year_udf("title").alias("year"),
    )


def _load_data(config, transformed_df):
    """ Save data to parquet file """
    transformed_df.write.mode("overwrite").parquet(
        f"{config.get('output_data_path')}/movies"
    )


def run_job(spark, config):
    """ Run movies job """
    _load_data(config, _transform_data(_extract_data(spark, config)))

--------------------------------------------------------------------------------
/app/main.py:
--------------------------------------------------------------------------------
import json
import importlib
import argparse
from pyspark.sql import SparkSession


def _parse_arguments():
    """ Parse arguments provided by the spark-submit command """
    parser = argparse.ArgumentParser()
    parser.add_argument("--job", required=True)
    return parser.parse_args()


def main():
    """ Main function executed by the spark-submit command """
    args = _parse_arguments()

    with open("config.json", "r") as config_file:
        config = json.load(config_file)

    spark = SparkSession.builder.appName(config.get("app_name")).getOrCreate()

    job_module = importlib.import_module(f"jobs.{args.job}")
    job_module.run_job(spark, config)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/app/requirements.txt:
--------------------------------------------------------------------------------
parse==1.12.0

--------------------------------------------------------------------------------
/app/shared/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pchrabka/PySpark-PyData/d71064804982db2353543b0051d802d9765a0680/app/shared/__init__.py

--------------------------------------------------------------------------------
/app/shared/udfs.py:
--------------------------------------------------------------------------------
from parse import parse
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

TITLE_COLUMN_TEMPLATE = "{title} ({year})"


def _get_movie_title(title_column):
    parsed = parse(TITLE_COLUMN_TEMPLATE, title_column)
    return parsed.named.get("title") if parsed else None


def _get_movie_year(title_column):
    parsed = parse(TITLE_COLUMN_TEMPLATE, title_column)
    year = parsed.named.get("year") if parsed else None
    return int(year) if year and year.isdigit() else None


get_movie_title_udf = udf(_get_movie_title, StringType())
get_movie_year_udf = udf(_get_movie_year, IntegerType())
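Both UDFs lean on the third-party `parse` library (the v6.0 dependency). A quick illustration of how the template behaves, assuming only `parse` is installed:

```python
from parse import parse

TITLE_COLUMN_TEMPLATE = "{title} ({year})"

# A well-formed title splits into named fields; note that "year" comes back
# as a string, which is why _get_movie_year converts it with int().
result = parse(TITLE_COLUMN_TEMPLATE, "Toy Story (1995)")
print(result.named)  # {'title': 'Toy Story', 'year': '1995'}

# A title without a "(year)" suffix does not match the template, so parse()
# returns None and both UDFs yield None for that row.
print(parse(TITLE_COLUMN_TEMPLATE, "Cosmos"))  # None
```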
--------------------------------------------------------------------------------
/app/tests/pytest.ini:
--------------------------------------------------------------------------------
[pytest]
filterwarnings =
    ignore:Using or importing the ABCs from 'collections' instead of from 'collections.abc'
    ignore:.*the imp module is deprecated in favour of importlib.*
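Note that the `spark_session` fixture used by the tests below comes from the `pytest-spark` plugin, and `mocker` comes from `pytest-mock` (both pinned in the root requirements.txt); neither is defined in this repository. A rough hand-rolled equivalent of what `pytest-spark` provides, for illustration only:

```python
import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    # One local Spark session shared across the whole test run.
    spark = (
        SparkSession.builder.master("local[*]")
        .appName("pyspark-tests")
        .getOrCreate()
    )
    yield spark
    spark.stop()
```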
--------------------------------------------------------------------------------
/app/tests/test_movies.py:
--------------------------------------------------------------------------------
import os
import shutil
import pandas as pd
from jobs import movies


class TestMoviesJob:
    def test_transform_data(self, spark_session):
        test_data = spark_session.createDataFrame(
            [(1, "Toy Story (1995)", "Adventure"), (160646, "Goat (2016)", "Drama")],
            ["movieId", "title", "genres"],
        )

        expected_data = spark_session.createDataFrame(
            [(1, "Toy Story", 1995), (160646, "Goat", 2016)],
            ["movieId", "title", "year"],
        ).toPandas()

        real_data = movies._transform_data(test_data).toPandas()

        pd.testing.assert_frame_equal(real_data, expected_data, check_dtype=False)

    def test_run_job(self, spark_session, mocker):
        test_config = {"output_data_path": "test_data_output"}
        # Clean up output from previous runs; ignore_errors keeps the first
        # run (when the directory does not exist yet) from failing.
        shutil.rmtree(test_config.get("output_data_path"), ignore_errors=True)
        test_data = spark_session.createDataFrame(
            [(1, "Toy Story (1995)", "Adventure"), (160646, "Goat (2016)", "Drama")],
            ["movieId", "title", "genres"],
        )
        mocker.patch.object(movies, "_extract_data")
        movies._extract_data.return_value = test_data
        movies.run_job(spark_session, test_config)
        assert os.path.exists(test_config.get("output_data_path"))

--------------------------------------------------------------------------------
/app/tests/test_udfs.py:
--------------------------------------------------------------------------------
from shared import udfs


class TestUDFs:
    def test_get_movie_title_value(self):
        test_value = "Toy Story (1995)"
        expected_value = "Toy Story"
        assert udfs._get_movie_title(test_value) == expected_value

    def test_get_movie_year_value(self):
        test_value = "Toy Story (1995)"
        expected_value = 1995
        assert udfs._get_movie_year(test_value) == expected_value

--------------------------------------------------------------------------------
/get_data.py:
--------------------------------------------------------------------------------
import requests
import zipfile
import io

zip_file_url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
output_path = "data/"

response = requests.get(zip_file_url)
response.raise_for_status()  # fail loudly if the download did not succeed

zip_file = zipfile.ZipFile(io.BytesIO(response.content))
zip_file.extractall(output_path)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas==0.25.3
parse==1.12.0
pytest==5.3.0
pytest-cov==2.8.1
pytest-mock==1.12.1
pytest-spark==0.5.2
requests==2.22.0
--------------------------------------------------------------------------------
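After a job has been run, its parquet output can be inspected directly; a minimal sketch, assuming the `movies` job has completed so that `data/output/movies` exists relative to the working directory:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("inspect").getOrCreate()

# Each job writes its result as parquet under output_data_path from config.json.
spark.read.parquet("data/output/movies").show(5)
```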