├── {{cookiecutter.project_name}} ├── config │ ├── airflow_database.env │ ├── airflow_container.env │ ├── jupyter.env │ ├── shared_database.env │ ├── superset_database.env │ ├── minio.env │ ├── postgres.env │ └── superset_container.env ├── shared │ ├── requirements.txt │ ├── dags │ │ └── example_dag.py │ └── notebooks │ │ ├── example-minio-connection.ipynb │ │ └── example-postgres-connection.ipynb ├── services │ ├── jupyter │ │ └── Dockerfile │ ├── apistar │ │ ├── api │ │ │ ├── app.py │ │ │ └── custom_routes.py │ │ └── Dockerfile │ └── postgres │ │ ├── 01_shared_create_db.sh │ │ ├── 03_superset_create_db.sh │ │ ├── 05_airflow_create_db.sh │ │ ├── 04_airflow_create_user.sh │ │ ├── 02_superset_create_user.sh │ │ └── 00_shared_create_user.sh ├── README.md └── docker-compose.yml ├── architecture.png ├── cookiecutter.json ├── LICENSE └── README.md /{{cookiecutter.project_name}}/config/airflow_database.env: -------------------------------------------------------------------------------- 1 | AIRFLOW_PASSWORD=airflow 2 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/config/airflow_container.env: -------------------------------------------------------------------------------- 1 | LOAD_EX=n 2 | EXECUTOR=Local 3 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/config/jupyter.env: -------------------------------------------------------------------------------- 1 | JUPYTER_PASSWORD={{cookiecutter.jupyter_password}} 2 | -------------------------------------------------------------------------------- /architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgoerner/data-science-stack-cookiecutter/HEAD/architecture.png -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/config/shared_database.env: 
-------------------------------------------------------------------------------- 1 | SHARED_PASSWORD={{cookiecutter.shared_db_password}} 2 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/config/superset_database.env: -------------------------------------------------------------------------------- 1 | SUPERSET_PASSWORD={{cookiecutter.superset_db_password}} 2 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/shared/requirements.txt: -------------------------------------------------------------------------------- 1 | minio==4.0.0 2 | numpy==1.12.1 3 | pandas==0.19.2 4 | scikit-learn==0.19.1 5 | scipy==0.19.1 6 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/config/minio.env: -------------------------------------------------------------------------------- 1 | MINIO_ACCESS_KEY={{cookiecutter.minio_access_key}} 2 | MINIO_SECRET_KEY={{cookiecutter.minio_secret_key}} 3 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/config/postgres.env: -------------------------------------------------------------------------------- 1 | POSTGRES_PASSWORD={{cookiecutter.postgres_db_password}} 2 | SHARED_PASSWORD={{cookiecutter.shared_db_password}} 3 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/config/superset_container.env: -------------------------------------------------------------------------------- 1 | ADMIN_PWD={{cookiecutter.superset_admin_password}} 2 | SUP_META_DB_URI=postgres://superset:{{cookiecutter.superset_db_password}}@postgres/superset 3 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/services/jupyter/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM jupyter/scipy-notebook 2 | 3 | USER root 4 | 5 | RUN pip install --proxy=${http_proxy}\ 6 | psycopg2-binary\ 7 | minio 8 | 9 | USER jovyan 10 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/services/apistar/api/app.py: -------------------------------------------------------------------------------- 1 | from apistar import App, Include 2 | 3 | import custom_routes 4 | 5 | routes = [ 6 | Include("/api", name="api", routes=custom_routes.routes), 7 | ] 8 | 9 | app = App(routes) 10 | 11 | if __name__ == "__main__": 12 | app.serve("127.0.0.1", 9090, debug=True) 13 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/services/postgres/01_shared_create_db.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e # exit immediately if a command exits with a non-zero status. 
3 | 4 | POSTGRES="psql --username postgres" 5 | 6 | # create the shared database 7 | echo "Creating database: shared" 8 | $POSTGRES < 3 | 4 | WORKDIR /usr/src/app 5 | 6 | RUN apk add --update \ 7 | postgresql-dev \ 8 | build-base \ 9 | openblas-dev \ 10 | gfortran \ 11 | && rm -rf /var/cache/apk/* 12 | 13 | RUN pip install --no-cache-dir --proxy=${http_proxy}\ 14 | numpy===1.14.3 15 | 16 | RUN pip install --no-cache-dir --proxy=${http_proxy}\ 17 | apistar==0.5.41 \ 18 | gunicorn==19.8.1 \ 19 | dill===0.2.7.1 \ 20 | minio===4.0.0 \ 21 | psycopg2-binary===2.7.4 \ 22 | scipy \ 23 | scikit-learn 24 | 25 | RUN pip install --no-cache-dir --no-deps --proxy=${http_proxy}\ 26 | pandas 27 | 28 | 29 | EXPOSE 8000 30 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/shared/dags/example_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import logging 3 | import os 4 | 5 | from airflow import DAG 6 | from airflow.operators.python_operator import PythonOperator 7 | 8 | ### TASK ### 9 | def log_hello_world(): 10 | # log a greeting message 11 | logging.info("Hello wonderful world!") 12 | 13 | ### DAG ### 14 | default_args={ 15 | "owner":"airflow", 16 | "depends_on_past":"false", 17 | "start_date":datetime.today()-timedelta(days=1), 18 | } 19 | 20 | dag = DAG( 21 | dag_id="dag_hello_world", 22 | default_args=default_args, 23 | schedule_interval=timedelta(seconds=30)) 24 | 25 | task = PythonOperator( 26 | task_id="task_hello_world", 27 | python_callable=log_hello_world, 28 | dag=dag, 29 | ) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Joshua Görner 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated
documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Science Stack - Cookiecutter 2 | [![Maintainers Wanted](https://img.shields.io/badge/maintainers-wanted-red.svg)](https://github.com/pickhardt/maintainers-wanted) 3 | 4 | Cookiecutter to launch an awesome Data Science toolstack in Docker. 
5 | 6 | # See it in action 7 | [![asciicast](https://asciinema.org/a/CcZ9duIdP0pBfZWxw5Nt1tFVZ.png)](https://asciinema.org/a/CcZ9duIdP0pBfZWxw5Nt1tFVZ) 8 | 9 | # Overall Architecture 10 | ![architecture](./architecture.png) 11 | 12 | # Used Variables 13 | The following table gives an overview of the parameters that cookiecutter asks for (and why) 14 | 15 | | Name | Description | Injected in Services | 16 | | --- | --- | --- | 17 | | **project_name** | *Name of your project* | - | 18 | | **jupyter_password** | *Password to protect your Jupyter service* | Jupyter | 19 | | **postgres_db_password** | *Password of standard postgres user* | Postgres | 20 | | **shared_db_password** | *Password for shared database* | Airflow
Jupyter
Postgres | 21 | | **superset_db_password** | *Password for superset database* | Postgres
Superset | 22 | | **superset_admin_password** | *Password for superset admin user* | Superset | 23 | | **minio_access_key** | *Access key for Minio store* | Airflow
Apistar
Jupyter
Minio | 24 | | **minio_secret_key** | *Secret key for Minio store* | Airflow
Apistar
Jupyter
Minio | 25 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/shared/notebooks/example-minio-connection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# About\n", 8 | "This notebook shows the connection to the minio service" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Setup" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import os\n", 25 | "\n", 26 | "from minio import Minio\n", 27 | "\n", 28 | "# create a connection to the object store\n", 29 | "minio_client = Minio(\n", 30 | " endpoint=\"minio:9000\",\n", 31 | " access_key=os.environ[\"MINIO_ACCESS_KEY\"],\n", 32 | " secret_key=os.environ[\"MINIO_SECRET_KEY\"],\n", 33 | " secure=False\n", 34 | ")\n", 35 | "\n", 36 | "# write a sample file\n", 37 | "with open(\"sample.txt\", \"w\") as f:\n", 38 | " f.write(\"This is just a sample text\")" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "### Write Data" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "'208ee96fcb544b4f59f19a7d4c10fd9d'" 57 | ] 58 | }, 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "# create a minio bucket\n", 66 | "minio_client.make_bucket(\"sample-bucket\")\n", 67 | "\n", 68 | "# write the object to minio\n", 69 | "minio_client.fput_object(\n", 70 | " bucket_name=\"sample-bucket\", \n", 71 | " object_name=\"sample-file.txt\", \n", 72 | " file_path=\"./sample.txt\"\n", 73 | ")" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "### 
Read Data" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 3, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/plain": [ 91 | "'This is just a sample text'" 92 | ] 93 | }, 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "output_type": "execute_result" 97 | } 98 | ], 99 | "source": [ 100 | "minio_client.get_object(\"sample-bucket\", \"sample-file.txt\").data.decode()" 101 | ] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.6.5" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 2 125 | } 126 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | # All available services 4 | services: 5 | 6 | # Computation 7 | jupyter: 8 | container_name: "{{cookiecutter.project_name}}_jupyter" 9 | restart: "always" 10 | build: services/jupyter 11 | env_file: 12 | - ./config/jupyter.env 13 | - ./config/minio.env 14 | - ./config/shared_database.env 15 | volumes: 16 | - ./shared/notebooks/:/home/jovyan/work/notebooks 17 | - ./shared/dags/:/home/jovyan/work/dags 18 | - ./services/apistar/api/:/home/jovyan/work/api 19 | ports: 20 | - 8888:8888 21 | entrypoint: sh -c 'start-notebook.sh --NotebookApp.token=$$JUPYTER_PASSWORD' 22 | 23 | # Visualization 24 | superset: 25 | container_name: "{{cookiecutter.project_name}}_superset" 26 | restart: "always" 27 | image: tylerfowler/superset:0.24 28 | depends_on: 29 | - postgres 30 | env_file: 31 | - 
./config/superset_container.env 32 | - ./config/superset_database.env 33 | ports: 34 | - 8088:8088 35 | 36 | # Misc Storage 37 | postgres: 38 | container_name: "{{cookiecutter.project_name}}_postgres" 39 | restart: "always" 40 | image: postgres 41 | env_file: 42 | - ./config/postgres.env 43 | - ./config/superset_database.env 44 | - ./config/airflow_database.env 45 | - ./config/shared_database.env 46 | volumes: 47 | - postgres_volume:/var/lib/postgresql/data/ 48 | - ./services/postgres/:/docker-entrypoint-initdb.d/ 49 | ports: 50 | - 5432:5432 51 | 52 | # Scheduling 53 | airflow: 54 | container_name: "{{cookiecutter.project_name}}_airflow" 55 | restart: "always" 56 | image: puckel/docker-airflow:1.9.0-2 57 | depends_on: 58 | - postgres 59 | env_file: 60 | - ./config/airflow_container.env 61 | - ./config/minio.env 62 | - ./config/shared_database.env 63 | volumes: 64 | - ./shared/requirements.txt:/requirements.txt 65 | - ./shared/dags/:/usr/local/airflow/dags 66 | ports: 67 | - 8080:8080 68 | command: webserver 69 | 70 | # Model Storage 71 | minio: 72 | container_name: "{{cookiecutter.project_name}}_minio" 73 | restart: "always" 74 | image: minio/minio 75 | env_file: 76 | - ./config/minio.env 77 | volumes: 78 | - minio_volume:/data 79 | ports: 80 | - 9000:9000 81 | command: server /data 82 | 83 | # API 84 | apistar: 85 | container_name: "{{cookiecutter.project_name}}_apistar" 86 | restart: "always" 87 | build: services/apistar 88 | env_file: 89 | - ./config/minio.env 90 | volumes: 91 | - ./services/apistar/api:/usr/src/app 92 | ports: 93 | - 8000:8000 94 | command: gunicorn app:app -b 0.0.0.0:8000 95 | 96 | # Admin Overview 97 | portainer: 98 | container_name: "{{cookiecutter.project_name}}_portainer" 99 | restart: "always" 100 | image: portainer/portainer 101 | volumes: 102 | - /var/run/docker.sock:/var/run/docker.sock 103 | - portainer_volume:/data 104 | ports: 105 | - 9090:9000 106 | command: -H unix:///var/run/docker.sock 107 | 108 | # Volumes to persist data 109 
| volumes: 110 | postgres_volume: 111 | minio_volume: 112 | portainer_volume: 113 | -------------------------------------------------------------------------------- /{{cookiecutter.project_name}}/shared/notebooks/example-postgres-connection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# About\n", 8 | "This notebook shows the connection to the postgres service" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Setup" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import os\n", 25 | "\n", 26 | "import pandas as pd\n", 27 | "from sqlalchemy import create_engine\n", 28 | "\n", 29 | "# set up postgres connection\n", 30 | "pwd = os.environ[\"SHARED_PASSWORD\"]\n", 31 | "con = create_engine(\n", 32 | " \"postgres://shared:{pwd}@postgres/shared\".format(**locals()))\n", 33 | "\n", 34 | "# create a sample dataframe\n", 35 | "df_sample = pd.DataFrame(\n", 36 | " [\n", 37 | " [1, 2, 3], \n", 38 | " [4, 5, 6]\n", 39 | " ], \n", 40 | " columns=[\"a\", \"b\", \"c\"])" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Write Data" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "df_sample.to_sql(name=\"sample\", con=con, index=False)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Read Data" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/html": [ 74 | "
\n", 75 | "\n", 88 | "\n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | "
abc
0123
1456
\n", 112 | "
" 113 | ], 114 | "text/plain": [ 115 | " a b c\n", 116 | "0 1 2 3\n", 117 | "1 4 5 6" 118 | ] 119 | }, 120 | "execution_count": 3, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "pd.read_sql(sql=\"SELECT * FROM sample\", con=con)" 127 | ] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "Python 3", 133 | "language": "python", 134 | "name": "python3" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 3 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython3", 146 | "version": "3.6.5" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 2 151 | } 152 | --------------------------------------------------------------------------------