├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── README.md ├── docker-compose.yaml ├── git-hooks ├── copy_hooks.sh └── hooks │ └── pre-commit ├── requirements.txt ├── src └── main.py └── tests └── test_main.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.2.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/pre-commit/mirrors-mypy 9 | rev: 'v0.942' # Use the sha you want to point at 10 | hooks: 11 | - id: mypy 12 | - repo: https://github.com/pre-commit/mirrors-pylint 13 | rev: '' # Use the sha / tag you want to point at 14 | hooks: 15 | - id: pylint -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:18.04 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y default-jdk scala wget vim software-properties-common python3.8 python3-pip curl unzip libpq-dev build-essential libssl-dev libffi-dev python3-dev&& \ 5 | apt-get clean 6 | 7 | RUN wget 
https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz && \ 8 | tar xvf spark-3.0.1-bin-hadoop3.2.tgz && \ 9 | mv spark-3.0.1-bin-hadoop3.2/ /usr/local/spark && \ 10 | ln -s /usr/local/spark spark 11 | 12 | WORKDIR app 13 | COPY . /app 14 | RUN pip3 install cython==0.29.21 numpy==1.18.5 && pip3 install pytest && pip3 install -r requirements.txt 15 | ENV PYSPARK_PYTHON=python3 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dataEngineeringTemplate 2 | Template for Data Engineering and Data Pipeline projects 3 | 4 | ## Project Overview 5 | This is a high-level description of the project and what it is trying to accomplish. 6 | 1. Add your requirements to the `requirements.txt` file for Python pip packages. 7 | 2. Add any necessary installations to the Dockerfile. 8 | 9 | ## Architecture 10 | This is a high-level description of the tool(s) and the decisions around why those tool(s) were chosen. 11 | 12 | ## Testing 13 | These are instructions on how to test this repo. All tests are located inside the `tests` folder. We are using `pytest`. 14 | Run the following steps. 15 | 16 | 1. `docker build --tag my-project .` 17 | 2. `docker-compose up test` 18 | 19 | Add your unit tests to files inside the `tests` folder ... name your files `test_somename.py` 20 | 21 | ## Data Flow 22 | High-level description of data source(s) and sink(s), as well as the general pattern and data flow through the pipeline. 23 | Discuss any assumptions made. 24 | 25 | ## Hooks 26 | If you have your own hooks, you can add them to git-hooks. 27 | 28 | Use this command to add them to the appropriate folder then commit. 29 | 30 | `sh git-hooks/copy_hooks.sh` 31 | 32 | Whatever is copied by `git-hooks/copy_hooks.sh` will replace any hooks set up using pre-commit. 
-------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | test: 4 | environment: 5 | - PYTHONPATH=./src 6 | image: "my-project" 7 | volumes: 8 | - .:/app 9 | command: python3 -m pytest 10 | -------------------------------------------------------------------------------- /git-hooks/copy_hooks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copy every hook in git-hooks/hooks into this repository's .git/hooks and make them executable. 3 | repo_dir=$(git rev-parse --show-toplevel) 4 | cp "$repo_dir"/git-hooks/hooks/* "$repo_dir/.git/hooks" 5 | chmod +x "$repo_dir"/.git/hooks/* -------------------------------------------------------------------------------- /git-hooks/hooks/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Refuse commits made directly on master/main, then require the repository to pass `black --check`. 3 | current_branch=`git rev-parse --abbrev-ref HEAD` 4 | if [[ $current_branch =~ ^(master|main)$ ]]; then 5 | message="Please don't commit directly to $current_branch." 6 | echo -e "\033[1;31mERROR: $message\033[0m"; 7 | exit 1 8 | fi 9 | 10 | repo_dir=`git rev-parse --show-toplevel` 11 | 12 | message="[Policy] Checking code in $repo_dir with black..." 13 | echo -e "\033[1;34mInfo: $message\033[0m" 14 | 15 | black -t py39 --check "$repo_dir" 16 | 17 | if [ $? -ne 0 ]; then # any non-zero status fails: 1 = files would be reformatted, 123 = black internal error 18 | message="[Policy] Black check failed, please use black to format your code." 19 | echo -e "\033[1;31mERROR: $message\033[0m"; 20 | exit 1 21 | else 22 | message="[Policy] Passed black checking." 
23 | echo -e "\033[1;32mOK: $message\033[0m" 24 | exit 0 25 | fi -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pyspark==3.2.1 2 | pandas==1.0.5 3 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | 4 | def main(arg1: int = 1) -> int: 5 | # Main entry point for the pipeline. 6 | return arg1 + 1 7 | 8 | 9 | if __name__ == '__main__': 10 | x = main() 11 | -------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from src import main 3 | 4 | 5 | def test_main() 6 | x = main.main() 7 | assert x = 2 8 | 9 | --------------------------------------------------------------------------------