├── .gitignore
├── LICENSE
├── README.md
├── app
│   ├── Makefile
│   ├── config.json
│   ├── jobs
│   │   ├── __init__.py
│   │   ├── movie_genres.py
│   │   └── movies.py
│   ├── main.py
│   ├── requirements.txt
│   ├── shared
│   │   ├── __init__.py
│   │   └── udfs.py
│   └── tests
│       ├── pytest.ini
│       ├── test_movies.py
│       └── test_udfs.py
├── get_data.py
└── requirements.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 pchrabka

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pySpark-pyData

This is an example PySpark application created for a PyData Warsaw 2019 talk.

The application uses the MovieLens dataset as its source data. The data can be
downloaded from https://grouplens.org/datasets/movielens/ or with the
`get_data.py` script included in this repository.

App development steps:

- [v1.0](https://github.com/pchrabka/pySpark-pyData/tree/v1.0) - Initial version
- [v2.0](https://github.com/pchrabka/pySpark-pyData/tree/v2.0) - Added config file
- [v3.0](https://github.com/pchrabka/pySpark-pyData/tree/v3.0) - Added main.py
- [v4.0](https://github.com/pchrabka/pySpark-pyData/tree/v4.0) - Added Makefile
- [v5.0](https://github.com/pchrabka/pySpark-pyData/tree/v5.0) - Added UDFs
- [v6.0](https://github.com/pchrabka/pySpark-pyData/tree/v6.0) - Added third-party dependency
- [v7.0](https://github.com/pchrabka/pySpark-pyData/tree/v7.0) - Added tests
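To see the moving parts end to end: `make build` in `app/` assembles a `dist/` directory containing `main.py`, `config.json`, the zipped `jobs` and `shared` packages, and the third-party libs, which can then be submitted with something like `spark-submit --py-files jobs.zip,shared.zip,libs.zip main.py --job movies` from inside `dist/` (cluster-specific flags vary). For quick local experiments the job modules can also be driven directly; a minimal sketch, assuming it runs from inside `app/` with the MovieLens data available at the `source_data_path` configured in `config.json`:

```python
import json

from pyspark.sql import SparkSession

from jobs import movies  # or jobs.movie_genres

# The same steps main.py performs, inlined for a single job.
with open("config.json") as config_file:
    config = json.load(config_file)

spark = (
    SparkSession.builder.master("local[*]")
    .appName(config.get("app_name"))
    .getOrCreate()
)
movies.run_job(spark, config)
```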
--------------------------------------------------------------------------------
/app/Makefile:
--------------------------------------------------------------------------------
build:
	rm -rf ./dist && mkdir ./dist
	cp ./main.py ./dist
	cp ./config.json ./dist
	zip -r dist/jobs.zip jobs
	zip -r dist/shared.zip shared
	docker run --rm -v $(PWD):/foo -w /foo lambci/lambda:build-python3.7 \
		pip install -r requirements.txt -t ./dist/libs
	cd ./dist/libs && zip -r -D ../libs.zip .

--------------------------------------------------------------------------------
/app/config.json:
--------------------------------------------------------------------------------
{
  "app_name": "MoviesETL",
  "source_data_path": "data/ml-latest-small",
  "output_data_path": "data/output"
}

--------------------------------------------------------------------------------
/app/jobs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pchrabka/PySpark-PyData/d71064804982db2353543b0051d802d9765a0680/app/jobs/__init__.py

--------------------------------------------------------------------------------
/app/jobs/movie_genres.py:
--------------------------------------------------------------------------------
from pyspark.sql.functions import col, split, explode


def _extract_data(spark, config):
    """ Load data from csv file """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .load(f"{config.get('source_data_path')}/movies.csv")
    )


def _transform_data(raw_df):
    """ Transform raw dataframe """
    return raw_df.select(
        col("movieId"), explode(split(col("genres"), "\\|")).alias("genre")
    )


def _load_data(config, transformed_df):
    """ Save data to parquet file """
    transformed_df.write.mode("overwrite").parquet(
        f"{config.get('output_data_path')}/movie_genres"
    )


def run_job(spark, config):
    """ Run movie_genres job """
    _load_data(config, _transform_data(_extract_data(spark, config)))

--------------------------------------------------------------------------------
/app/jobs/movies.py:
--------------------------------------------------------------------------------
from pyspark.sql.functions import col
from shared.udfs import get_movie_title_udf, get_movie_year_udf


def _extract_data(spark, config):
    """ Load data from csv file """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .load(f"{config.get('source_data_path')}/movies.csv")
    )


def _transform_data(raw_df):
    """ Transform raw dataframe """
    return raw_df.select(
        col("movieId"),
        get_movie_title_udf("title").alias("title"),
        get_movie_year_udf("title").alias("year"),
    )


def _load_data(config, transformed_df):
    """ Save data to parquet file """
    transformed_df.write.mode("overwrite").parquet(
        f"{config.get('output_data_path')}/movies"
    )


def run_job(spark, config):
    """ Run movies job """
    _load_data(config, _transform_data(_extract_data(spark, config)))

--------------------------------------------------------------------------------
/app/main.py:
--------------------------------------------------------------------------------
import json
import importlib
import argparse
from pyspark.sql import SparkSession


def _parse_arguments():
    """ Parse arguments provided by the spark-submit command """
    parser = argparse.ArgumentParser()
    parser.add_argument("--job", required=True)
    return parser.parse_args()


def main():
    """ Main function executed by the spark-submit command """
    args = _parse_arguments()

    with open("config.json", "r") as config_file:
        config = json.load(config_file)

    spark = SparkSession.builder.appName(config.get("app_name")).getOrCreate()

    job_module = importlib.import_module(f"jobs.{args.job}")
    job_module.run_job(spark, config)


if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/app/requirements.txt:
--------------------------------------------------------------------------------
parse==1.12.0

--------------------------------------------------------------------------------
/app/shared/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pchrabka/PySpark-PyData/d71064804982db2353543b0051d802d9765a0680/app/shared/__init__.py

--------------------------------------------------------------------------------
/app/shared/udfs.py:
--------------------------------------------------------------------------------
from parse import parse
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

TITLE_COLUMN_TEMPLATE = "{title} ({year})"


def _get_movie_title(title_column):
    parsed = parse(TITLE_COLUMN_TEMPLATE, title_column)
    return parsed.named.get("title") if parsed else None


def _get_movie_year(title_column):
    parsed = parse(TITLE_COLUMN_TEMPLATE, title_column)
    year = parsed.named.get("year") if parsed else None
    return int(year) if year and year.isdigit() else None


get_movie_title_udf = udf(_get_movie_title, StringType())
get_movie_year_udf = udf(_get_movie_year, IntegerType())
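Both UDFs lean on the third-party `parse` library (the v6.0 dependency). A quick illustration of how the template behaves, assuming only `parse` is installed:

```python
from parse import parse

TITLE_COLUMN_TEMPLATE = "{title} ({year})"

# A well-formed title splits into named fields; note that "year" comes back
# as a string, which is why _get_movie_year converts it with int().
result = parse(TITLE_COLUMN_TEMPLATE, "Toy Story (1995)")
print(result.named)  # {'title': 'Toy Story', 'year': '1995'}

# A title without a "(year)" suffix does not match the template, so parse()
# returns None and both UDFs yield None for that row.
print(parse(TITLE_COLUMN_TEMPLATE, "Cosmos"))  # None
```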
--------------------------------------------------------------------------------
/app/tests/pytest.ini:
--------------------------------------------------------------------------------
[pytest]
filterwarnings =
    ignore:Using or importing the ABCs from 'collections' instead of from 'collections.abc'
    ignore:.*the imp module is deprecated in favour of importlib.*
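Note that the `spark_session` fixture used by the tests below comes from the `pytest-spark` plugin, and `mocker` comes from `pytest-mock` (both pinned in the root requirements.txt); neither is defined in this repository. A rough hand-rolled equivalent of what `pytest-spark` provides, for illustration only:

```python
import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark_session():
    # One local Spark session shared across the whole test run.
    spark = (
        SparkSession.builder.master("local[*]")
        .appName("pyspark-tests")
        .getOrCreate()
    )
    yield spark
    spark.stop()
```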
--------------------------------------------------------------------------------
/app/tests/test_movies.py:
--------------------------------------------------------------------------------
import os
import shutil
import pandas as pd
from jobs import movies


class TestMoviesJob:
    def test_transform_data(self, spark_session):
        test_data = spark_session.createDataFrame(
            [(1, "Toy Story (1995)", "Adventure"), (160646, "Goat (2016)", "Drama")],
            ["movieId", "title", "genres"],
        )

        expected_data = spark_session.createDataFrame(
            [(1, "Toy Story", 1995), (160646, "Goat", 2016)],
            ["movieId", "title", "year"],
        ).toPandas()

        real_data = movies._transform_data(test_data).toPandas()

        pd.testing.assert_frame_equal(real_data, expected_data, check_dtype=False)

    def test_run_job(self, spark_session, mocker):
        test_config = {"output_data_path": "test_data_output"}
        # Clean up output from previous runs; ignore_errors keeps the first
        # run (when the directory does not exist yet) from failing.
        shutil.rmtree(test_config.get("output_data_path"), ignore_errors=True)
        test_data = spark_session.createDataFrame(
            [(1, "Toy Story (1995)", "Adventure"), (160646, "Goat (2016)", "Drama")],
            ["movieId", "title", "genres"],
        )
        mocker.patch.object(movies, "_extract_data")
        movies._extract_data.return_value = test_data
        movies.run_job(spark_session, test_config)
        assert os.path.exists(test_config.get("output_data_path"))

--------------------------------------------------------------------------------
/app/tests/test_udfs.py:
--------------------------------------------------------------------------------
from shared import udfs


class TestUDFs:
    def test_get_movie_title_value(self):
        test_value = "Toy Story (1995)"
        expected_value = "Toy Story"
        assert udfs._get_movie_title(test_value) == expected_value

    def test_get_movie_year_value(self):
        test_value = "Toy Story (1995)"
        expected_value = 1995
        assert udfs._get_movie_year(test_value) == expected_value

--------------------------------------------------------------------------------
/get_data.py:
--------------------------------------------------------------------------------
import requests
import zipfile
import io

zip_file_url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
output_path = "data/"

response = requests.get(zip_file_url)
response.raise_for_status()  # fail loudly if the download did not succeed

zip_file = zipfile.ZipFile(io.BytesIO(response.content))
zip_file.extractall(output_path)

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
pandas==0.25.3
parse==1.12.0
pytest==5.3.0
pytest-cov==2.8.1
pytest-mock==1.12.1
pytest-spark==0.5.2
requests==2.22.0
--------------------------------------------------------------------------------
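After a job has been run, its parquet output can be inspected directly; a minimal sketch, assuming the `movies` job has completed so that `data/output/movies` exists relative to the working directory:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("inspect").getOrCreate()

# Each job writes its result as parquet under output_data_path from config.json.
spark.read.parquet("data/output/movies").show(5)
```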