├── .dvc ├── config ├── .gitignore └── plots │ ├── default.json │ ├── smooth.json │ ├── confusion.json │ ├── scatter.json │ ├── confusion_normalized.json │ └── linear.json ├── conftest.py ├── notebooks ├── .gitkeep └── relative_path_imports.ipynb ├── src ├── __init__.py ├── __main__.py ├── greenhouse_clock.py ├── eda_monitoring.py ├── data_sourcing.py ├── performance_monitoring.py ├── main.py ├── api.py ├── data_preprocessing.py ├── data_splitting.py ├── modeling.py └── feature_engineering.py ├── tests ├── .gitkeep ├── test_data_sourcing.py ├── test_feature_engineering.py └── test_data_splitting.py ├── data └── .gitignore ├── .flake8 ├── images ├── Greenhouse_logo.png ├── greenhouse_architecture_v01.png ├── greenhouse_architecture_v02.png ├── greenhouse_architecture_v03.png ├── greenhouse_github_card_v02.png ├── greenhouse_architecture_gitops.png └── Screenshot_from_2021-03-07_18-31-36_VS-Code.png ├── examples ├── palmer_penguins │ ├── flow │ │ └── prefect_flow.pdf │ ├── src │ │ ├── greenhouse_clock.py │ │ ├── eda_monitoring.py │ │ ├── data_sourcing.py │ │ ├── data_preprocessing.py │ │ ├── data_splitting.py │ │ ├── api.py │ │ ├── performance_monitoring.py │ │ ├── feature_engineering.py │ │ ├── modeling.py │ │ └── main.py │ ├── requirements.txt │ ├── Dockerfile │ └── tests │ │ └── test_data_sourcing.py └── vanilla │ ├── src │ ├── greenhouse_clock.py │ ├── data_sourcing.py │ ├── performance_monitoring.py │ ├── main.py │ ├── api.py │ ├── data_preprocessing.py │ ├── data_splitting.py │ ├── modeling.py │ └── feature_engineering.py │ ├── requirements.txt │ ├── Dockerfile │ └── tests │ └── test_data_sourcing.py ├── .dvcignore ├── .dockerignore ├── dvc.yaml ├── .vscode └── settings.json ├── requirements.txt ├── .devcontainer.json ├── version.toml ├── Dockerfile ├── dvc.lock ├── .pre-commit-config.yaml ├── docker-compose.yml ├── .gitignore ├── CONTRIBUTING.md ├── Makefile ├── README.md └── LICENSE /.dvc/config: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__main__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | /test_dvc.json 2 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /tmp 3 | /cache 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | 
max-line-length = 87 3 | extend-ignore = "E203" -------------------------------------------------------------------------------- /images/Greenhouse_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/Greenhouse_logo.png -------------------------------------------------------------------------------- /src/greenhouse_clock.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def get_time(format="%Y%m%d%H%M%S"): 5 | 6 | return time.strftime(format) 7 | -------------------------------------------------------------------------------- /images/greenhouse_architecture_v01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/greenhouse_architecture_v01.png -------------------------------------------------------------------------------- /images/greenhouse_architecture_v02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/greenhouse_architecture_v02.png -------------------------------------------------------------------------------- /images/greenhouse_architecture_v03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/greenhouse_architecture_v03.png -------------------------------------------------------------------------------- /images/greenhouse_github_card_v02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/greenhouse_github_card_v02.png -------------------------------------------------------------------------------- /images/greenhouse_architecture_gitops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/greenhouse_architecture_gitops.png -------------------------------------------------------------------------------- /examples/palmer_penguins/flow/prefect_flow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/examples/palmer_penguins/flow/prefect_flow.pdf -------------------------------------------------------------------------------- /examples/vanilla/src/greenhouse_clock.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def get_time(format="%Y%m%d%H%M%S"): 5 | 6 | return time.strftime(format) 7 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/greenhouse_clock.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def get_time(format="%Y%m%d%H%M%S"): 5 | 6 | return time.strftime(format) 7 | -------------------------------------------------------------------------------- /.dvcignore: -------------------------------------------------------------------------------- 1 | # Add patterns of files dvc should ignore, which could improve 2 | # the performance. 
Learn more at 3 | # https://dvc.org/doc/user-guide/dvcignore 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Docker 2 | Dockerfile 3 | .dockerignore 4 | 5 | # Git 6 | .git 7 | .gitignore 8 | .gitattributes 9 | 10 | # Images 11 | images 12 | .png 13 | -------------------------------------------------------------------------------- /images/Screenshot_from_2021-03-07_18-31-36_VS-Code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/Screenshot_from_2021-03-07_18-31-36_VS-Code.png -------------------------------------------------------------------------------- /dvc.yaml: -------------------------------------------------------------------------------- 1 | stages: 2 | build-run: 3 | cmd: make build && make run 4 | deps: 5 | - src/modeling.py 6 | metrics: 7 | - monitor/metadata_valid.json: 8 | cache: true 9 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.linting.enabled": true, 3 | "python.linting.flake8Enabled": true, 4 | "python.linting.pylintEnabled": false, 5 | "python.pythonPath": "/usr/local/bin/python" 6 | } 7 | -------------------------------------------------------------------------------- /examples/vanilla/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.65.1 2 | feature-engine==1.0.2 3 | jupyterlab==3.0.9 4 | numpy==1.20.1 5 | pandas==1.2.2 6 | pandera==0.6.2 7 | pydantic==1.8.2 8 | pytest==6.2.2 9 | uvicorn==0.13.4 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.65.1 2 | feature-engine==1.0.2 3 | jupyterlab==3.0.9 4 | numpy==1.20.1 5 | pandas==1.2.2 6 | pandas-profiling==2.11.0 7 | pandera==0.6.2 8 | pdoc==7.0.3 9 | prefect[viz]==0.14.12 10 | pydantic==1.8.2 11 | pytest==6.2.2 12 | uvicorn==0.13.4 13 | -------------------------------------------------------------------------------- /.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "dockerFile": "Dockerfile", 3 | "extensions": [ 4 | "ms-python.python", 5 | "dracula-theme.theme-dracula", 6 | "bungcip.better-toml", 7 | "tomoki1207.pdf" 8 | ], 9 | "name": "py-greenhouse", 10 | "shutdownAction": "stopContainer" 11 | } 12 | -------------------------------------------------------------------------------- /version.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "greenhouse" 3 | version = "1.0.0" 4 | authors = ["Felipe Penha "] 5 | description = "A Python containerized framework for a better Data X development workflow." 
6 | repository = "https://github.com/felipepenha/py-greenhouse" 7 | license = "Apache-2.0" -------------------------------------------------------------------------------- /src/eda_monitoring.py: -------------------------------------------------------------------------------- 1 | import pandas_profiling 2 | 3 | 4 | def export_eda_report(df, path, preffix, suffix): 5 | 6 | profile = pandas_profiling.ProfileReport(df, title="Pandas Profiling Report") 7 | 8 | path = "{}/{}_ead_monitoring_{}.html".format(path, preffix, suffix) 9 | 10 | profile.to_file(path) 11 | 12 | pass 13 | -------------------------------------------------------------------------------- /examples/palmer_penguins/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.65.1 2 | feature-engine==1.0.2 3 | jupyterlab==3.0.9 4 | matplotlib==3.3.4 5 | numpy==1.20.1 6 | palmerpenguins==0.1.4 7 | pandas==1.2.2 8 | pandas-profiling==2.11.0 9 | pandera==0.6.2 10 | prefect[viz]==0.14.12 11 | pydantic==1.8.2 12 | pytest==6.2.2 13 | seaborn==0.11.1 14 | uvicorn==0.13.4 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.2-slim-buster AS base 2 | 3 | ARG APP_DIR=/usr/app/ 4 | 5 | USER root 6 | 7 | RUN mkdir ${APP_DIR} 8 | 9 | WORKDIR ${APP_DIR} 10 | 11 | # pip requirements 12 | COPY requirements.txt ${APP_DIR} 13 | 14 | RUN pip install --upgrade pip \ 15 | && pip3 install --no-cache-dir -r requirements.txt 16 | 17 | CMD ["python3", "src/main.py"] -------------------------------------------------------------------------------- /examples/palmer_penguins/src/eda_monitoring.py: -------------------------------------------------------------------------------- 1 | import pandas_profiling 2 | 3 | 4 | def export_eda_report(df, path, preffix, suffix): 5 | 6 | profile = pandas_profiling.ProfileReport(df, title="Pandas Profiling Report") 7 | 8 | path = "{}/{}_ead_monitoring_{}.html".format(path, preffix, suffix) 9 | 10 | profile.to_file(path) 11 | 12 | pass 13 | -------------------------------------------------------------------------------- /examples/vanilla/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.2-slim-buster AS base 2 | 3 | ARG APP_DIR=/usr/app/ 4 | 5 | USER root 6 | 7 | RUN mkdir ${APP_DIR} 8 | 9 | WORKDIR ${APP_DIR} 10 | 11 | # pip requirements 12 | COPY requirements.txt ${APP_DIR} 13 | 14 | RUN pip install --upgrade pip \ 15 | && pip3 install --no-cache-dir -r requirements.txt 16 | 17 | CMD ["python3", "src/main.py"] -------------------------------------------------------------------------------- /dvc.lock: -------------------------------------------------------------------------------- 1 | schema: '2.0' 2 | stages: 3 | build: 4 | cmd: make build && make run 5 | outs: 6 | - path: monitor 7 | md5: 7c3a8f4c317b9ae0d1df7ab62820ff6d.dir 8 | size: 3251 9 | nfiles: 3 10 | build-run: 11 | cmd: make build && make run 12 | deps: 13 | - path: src/modeling.py 14 | md5: 6103728b418f2ccc23840f786157baa4 15 | size: 3177 16 | outs: 17 | - path: monitor/metadata_valid.json 18 | md5: 91c3f20f203751f9034230a46dc45b0e 19 | size: 1093 20 | -------------------------------------------------------------------------------- /examples/palmer_penguins/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.2-slim-buster AS base 2 | 3 | ARG 
APP_DIR=/usr/app/ 4 | 5 | USER root 6 | 7 | RUN mkdir ${APP_DIR} 8 | 9 | WORKDIR ${APP_DIR} 10 | 11 | # graphviz is required by prefect[viz]==0.14.12 12 | RUN apt-get update \ 13 | && apt-get install -y build-essential graphviz \ 14 | && apt-get clean 15 | 16 | # pip requirements 17 | COPY requirements.txt ${APP_DIR} 18 | 19 | RUN pip install --upgrade pip \ 20 | && pip3 install --no-cache-dir -r requirements.txt 21 | 22 | CMD ["python3", "src/main.py"] -------------------------------------------------------------------------------- /tests/test_data_sourcing.py: -------------------------------------------------------------------------------- 1 | import pandera as pa 2 | from src import data_sourcing 3 | 4 | 5 | def test_data_sourcing_get(): 6 | 7 | df = data_sourcing.get() 8 | 9 | print(df) 10 | 11 | schema = pa.DataFrameSchema( 12 | { 13 | "id": pa.Column( 14 | str, 15 | nullable=True, 16 | ), 17 | "x": pa.Column( 18 | float, 19 | nullable=True, 20 | ), 21 | "y": pa.Column( 22 | float, 23 | nullable=True, 24 | ), 25 | } 26 | ) 27 | 28 | schema(df) 29 | -------------------------------------------------------------------------------- /examples/vanilla/tests/test_data_sourcing.py: -------------------------------------------------------------------------------- 1 | import pandera as pa 2 | from src import data_sourcing 3 | 4 | 5 | def test_data_sourcing_get(): 6 | 7 | df = data_sourcing.get() 8 | 9 | print(df) 10 | 11 | schema = pa.DataFrameSchema( 12 | { 13 | "id": pa.Column( 14 | str, 15 | nullable=True, 16 | ), 17 | "x": pa.Column( 18 | float, 19 | nullable=True, 20 | ), 21 | "y": pa.Column( 22 | float, 23 | nullable=True, 24 | ), 25 | } 26 | ) 27 | 28 | schema(df) 29 | -------------------------------------------------------------------------------- /.dvc/plots/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "encoding": { 7 | "color": { 8 | "field": "rev", 9 | "type": "nominal" 10 | }, 11 | "x": { 12 | "field": "", 13 | "title": "", 14 | "type": "quantitative" 15 | }, 16 | "y": { 17 | "field": "", 18 | "scale": { 19 | "zero": false 20 | }, 21 | "title": "", 22 | "type": "quantitative" 23 | } 24 | }, 25 | "height": 300, 26 | "mark": { 27 | "type": "line" 28 | }, 29 | "title": "", 30 | "width": 300 31 | } 32 | -------------------------------------------------------------------------------- /src/data_sourcing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get(): 6 | """Get the data. 7 | 8 | ** Vanilla definition. ** 9 | Include your own code below to import your project's data. 10 | 11 | Parameters 12 | ---------- 13 | None 14 | 15 | Returns 16 | ------- 17 | df: pandas dataframe 18 | 19 | Examples 20 | -------- 21 | 22 | Raises 23 | ------ 24 | 25 | Notes 26 | ----- 27 | 28 | """ 29 | 30 | df = pd.DataFrame( 31 | { 32 | "id": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], 33 | "x": [0.0, np.nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 34 | "y": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 35 | } 36 | ) 37 | 38 | return df 39 | -------------------------------------------------------------------------------- /examples/vanilla/src/data_sourcing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get(): 6 | """Get the data. 
7 | 8 | ** Vanilla definition. ** 9 | Include your own code below to import your project's data. 10 | 11 | Parameters 12 | ---------- 13 | None 14 | 15 | Returns 16 | ------- 17 | df: pandas dataframe 18 | 19 | Examples 20 | -------- 21 | 22 | Raises 23 | ------ 24 | 25 | Notes 26 | ----- 27 | 28 | """ 29 | 30 | df = pd.DataFrame( 31 | { 32 | "id": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], 33 | "x": [0.0, np.nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 34 | "y": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 35 | } 36 | ) 37 | 38 | return df 39 | -------------------------------------------------------------------------------- /src/performance_monitoring.py: -------------------------------------------------------------------------------- 1 | import json 2 | import greenhouse_clock 3 | 4 | meta = {} 5 | 6 | # Timestamp for files 7 | meta["timestr"] = greenhouse_clock.get_time() 8 | 9 | 10 | def report_performance(y_true, y_score, path, suffix=""): 11 | """ 12 | 13 | We suggest using `sklearn.metrics.classification_report` 14 | 15 | References 16 | ---------- 17 | https://scikit-learn.org/stable/modules/generated/ 18 | sklearn.metrics.classification_report.html 19 | """ 20 | 21 | # Plug-in here your performance metrics as dictionary entries 22 | meta["performance_metric_name"] = 0 23 | 24 | filename = "{0}metadata{1}.json".format(path, suffix) 25 | 26 | # Export to JSON 27 | with open(filename, "w") as fp: 28 | json.dump(meta, fp, indent=4) 29 | 30 | pass 31 | -------------------------------------------------------------------------------- /examples/vanilla/src/performance_monitoring.py: -------------------------------------------------------------------------------- 1 | import json 2 | import greenhouse_clock 3 | 4 | meta = {} 5 | 6 | # Timestamp for files 7 | meta["timestr"] = greenhouse_clock.get_time() 8 | 9 | 10 | def report_performance(y_true, y_score, path, suffix=""): 11 | """ 12 | 13 | We suggest using `sklearn.metrics.classification_report` 14 | 15 | References 16 | ---------- 17 | https://scikit-learn.org/stable/modules/generated/ 18 | sklearn.metrics.classification_report.html 19 | """ 20 | 21 | # Plug-in here your performance metrics as dictionary entries 22 | meta["performance_metric_name"] = 0 23 | 24 | filename = "{0}metadata{1}.json".format(path, suffix) 25 | 26 | # Export to JSON 27 | with open(filename, "w") as fp: 28 | json.dump(meta, fp, indent=4) 29 | 30 | pass 31 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/data_sourcing.py: -------------------------------------------------------------------------------- 1 | import palmerpenguins 2 | 3 | 4 | def get(): 5 | """Get the data. 6 | This template function uses the Palmer Peguins dataset as a place holder. 7 | Replace it by your own code to import your project's data. 8 | 9 | Parameters 10 | ---------- 11 | None 12 | 13 | Returns 14 | ------- 15 | pandas dataframe 16 | Dataframe containing data. 
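Restricted to the columns bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, sex and species (see `cols` below).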
17 | 18 | Examples 19 | -------- 20 | 21 | Raises 22 | ------ 23 | 24 | Notes 25 | ----- 26 | 27 | """ 28 | 29 | df = palmerpenguins.load_penguins() 30 | 31 | cols = [ 32 | "bill_length_mm", 33 | "bill_depth_mm", 34 | "flipper_length_mm", 35 | "body_mass_g", 36 | "sex", 37 | "species", 38 | ] 39 | 40 | return df[cols] 41 | -------------------------------------------------------------------------------- /.dvc/plots/smooth.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "encoding": { 7 | "color": { 8 | "field": "rev", 9 | "type": "nominal" 10 | }, 11 | "x": { 12 | "field": "", 13 | "title": "", 14 | "type": "quantitative" 15 | }, 16 | "y": { 17 | "field": "", 18 | "scale": { 19 | "zero": false 20 | }, 21 | "title": "", 22 | "type": "quantitative" 23 | } 24 | }, 25 | "mark": { 26 | "type": "line" 27 | }, 28 | "title": "", 29 | "transform": [ 30 | { 31 | "bandwidth": 0.3, 32 | "groupby": [ 33 | "rev" 34 | ], 35 | "loess": "", 36 | "on": "" 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.4.0 4 | hooks: 5 | - id: check-toml 6 | always_run: true 7 | verbose: true 8 | - id: check-yaml 9 | always_run: true 10 | verbose: true 11 | - id: pretty-format-json 12 | always_run: true 13 | verbose: true 14 | args: ["--autofix"] 15 | exclude: .dvc 16 | - id: requirements-txt-fixer 17 | always_run: true 18 | verbose: true 19 | - repo: https://github.com/ambv/black 20 | rev: 20.8b1 21 | hooks: 22 | - id: black 23 | always_run: true 24 | verbose: true 25 | - repo: https://gitlab.com/pycqa/flake8 26 | rev: 3.9.0 27 | hooks: 28 | - id: flake8 29 | always_run: true 30 | verbose: true 31 | - repo: local 32 | hooks: 33 | - id: test 34 | name: test 35 | entry: make test-no-log 36 | language: system 37 | pass_filenames: false 38 | always_run: true 39 | verbose: true 40 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import greenhouse_clock 2 | import data_sourcing 3 | import data_splitting 4 | import data_preprocessing 5 | import feature_engineering 6 | from modeling import model 7 | import performance_monitoring 8 | 9 | start_time = greenhouse_clock.get_time() 10 | 11 | if __name__ == "__main__": 12 | 13 | # Run prefect flow 14 | df = data_sourcing.get() 15 | df = data_preprocessing.clean(df) 16 | df = data_preprocessing.normalize(df) 17 | 18 | train, valid, test = data_splitting.split(df) 19 | 20 | ( 21 | train["x"], 22 | valid["x"], 23 | test["x"], 24 | ) = feature_engineering.numerical_missing_imputation( 25 | train=train, valid=valid, test=test, cols=["x"], imputation_method="median" 26 | ) 27 | 28 | m = model().fit(train=train, y_col="y", x_col="x") 29 | 30 | train["pred"], valid["pred"], test["pred"] = m.transform_sets(train, valid, test) 31 | 32 | performance_monitoring.report_performance( 33 | y_true=valid["y"], 34 | y_score=valid["pred"], 35 | path="/usr/app/monitor/", 36 | suffix="_valid", 37 | ) 38 | -------------------------------------------------------------------------------- /examples/vanilla/src/main.py: 
-------------------------------------------------------------------------------- 1 | import greenhouse_clock 2 | import data_sourcing 3 | import data_splitting 4 | import data_preprocessing 5 | import feature_engineering 6 | from modeling import model 7 | import performance_monitoring 8 | 9 | start_time = greenhouse_clock.get_time() 10 | 11 | if __name__ == "__main__": 12 | 13 | # Run prefect flow 14 | df = data_sourcing.get() 15 | df = data_preprocessing.clean(df) 16 | df = data_preprocessing.normalize(df) 17 | 18 | train, valid, test = data_splitting.split(df) 19 | 20 | ( 21 | train["x"], 22 | valid["x"], 23 | test["x"], 24 | ) = feature_engineering.numerical_missing_imputation( 25 | train=train, valid=valid, test=test, cols=["x"], imputation_method="median" 26 | ) 27 | 28 | m = model().fit(train=train, y_col="y", x_col="x") 29 | 30 | train["pred"], valid["pred"], test["pred"] = m.transform_sets(train, valid, test) 31 | 32 | performance_monitoring.report_performance( 33 | y_true=valid["y"], 34 | y_score=valid["pred"], 35 | path="/usr/app/monitor/", 36 | suffix="_valid", 37 | ) 38 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | def clean(df): 2 | """Cleansing: a data pre-processing step. Usually, getting rid of garbage 3 | such as undesired characters. 4 | 5 | Cleansing must be a set of operations independent of data splitting. 6 | 7 | Parameters 8 | ---------- 9 | df: pandas dataframe 10 | 11 | Returns 12 | ------- 13 | pandas dataframe 14 | Cleansed dataframe 15 | 16 | Examples 17 | -------- 18 | 19 | Raises 20 | ------ 21 | 22 | Notes 23 | ----- 24 | 25 | """ 26 | 27 | return df 28 | 29 | 30 | def normalize(df): 31 | """Normalization: a data pre-processing step. Usually, making adjusting 32 | loser and upper casing, abbrevations, word order, and so on. 33 | 34 | Normalization must be a set of operations independent of data splitting. 
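For the penguins data this could mean, for example, harmonizing the casing of the `sex` labels or the spelling of `species` names.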
35 | 36 | Parameters 37 | ---------- 38 | df: pandas dataframe 39 | 40 | Returns 41 | ------- 42 | pandas dataframe 43 | Normalized dataframe 44 | 45 | Examples 46 | -------- 47 | 48 | Raises 49 | ------ 50 | 51 | Notes 52 | ----- 53 | 54 | """ 55 | 56 | return df 57 | -------------------------------------------------------------------------------- /src/api.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from pydantic import BaseModel 3 | 4 | from src.modeling import VanillaModel 5 | 6 | app = FastAPI() 7 | 8 | 9 | class ModelIn(BaseModel): 10 | x: str 11 | 12 | 13 | class ModelOut(BaseModel): 14 | pred: float 15 | 16 | 17 | class ModelOutHealth(BaseModel): 18 | id: str 19 | 20 | 21 | app = FastAPI() 22 | 23 | 24 | @app.post("/health") 25 | async def health(): 26 | 27 | return {"id": "Healthy"} 28 | 29 | 30 | @app.post( 31 | "/predict/", 32 | response_model=ModelOut, 33 | ) 34 | async def root(input: ModelIn): 35 | 36 | X = [ 37 | float(input.x), 38 | ] 39 | 40 | # Load your model from /models 41 | 42 | # Note: for saving your model, we suggest using the 43 | # `joblib` python package 44 | 45 | # Ex: path "/usr/app/models/" 46 | # joblib.dump(self.m, path) 47 | # model = joblib.load(path) 48 | 49 | # Vanila model always predict 0, so that 50 | # inputs in the training phase are arbitrary 51 | model = VanillaModel().fit(x=[0], y=[0]) 52 | 53 | out_dict = {} 54 | 55 | out_dict["pred"] = model.predict(X)[0] 56 | 57 | return out_dict 58 | -------------------------------------------------------------------------------- /examples/vanilla/src/api.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from pydantic import BaseModel 3 | 4 | from src.modeling import VanillaModel 5 | 6 | app = FastAPI() 7 | 8 | 9 | class ModelIn(BaseModel): 10 | x: str 11 | 12 | 13 | class ModelOut(BaseModel): 14 | pred: float 15 | 16 | 17 | class ModelOutHealth(BaseModel): 18 | id: str 19 | 20 | 21 | app = FastAPI() 22 | 23 | 24 | @app.post("/health") 25 | async def health(): 26 | 27 | return {"id": "Healthy"} 28 | 29 | 30 | @app.post( 31 | "/predict/", 32 | response_model=ModelOut, 33 | ) 34 | async def root(input: ModelIn): 35 | 36 | X = [ 37 | float(input.x), 38 | ] 39 | 40 | # Load your model from /models 41 | 42 | # Note: for saving your model, we suggest using the 43 | # `joblib` python package 44 | 45 | # Ex: path "/usr/app/models/" 46 | # joblib.dump(self.m, path) 47 | # model = joblib.load(path) 48 | 49 | # Vanila model always predict 0, so that 50 | # inputs in the training phase are arbitrary 51 | model = VanillaModel().fit(x=[0], y=[0]) 52 | 53 | out_dict = {} 54 | 55 | out_dict["pred"] = model.predict(X)[0] 56 | 57 | return out_dict 58 | -------------------------------------------------------------------------------- /examples/palmer_penguins/tests/test_data_sourcing.py: -------------------------------------------------------------------------------- 1 | import pandera as pa 2 | from src import data_sourcing 3 | 4 | 5 | def test_data_sourcing_get(): 6 | 7 | df = data_sourcing.get() 8 | 9 | print(df) 10 | 11 | cats_sex = [ 12 | "male", 13 | "female", 14 | ] 15 | cats_species = [ 16 | "Adelie", 17 | "Gentoo", 18 | "Chinstrap", 19 | ] 20 | 21 | schema = pa.DataFrameSchema( 22 | { 23 | "bill_length_mm": pa.Column( 24 | float, 25 | nullable=True, 26 | ), 27 | "bill_depth_mm": pa.Column( 28 | float, 29 | nullable=True, 30 | ), 31 | "flipper_length_mm": pa.Column( 32 | 
float, 33 | nullable=True, 34 | ), 35 | "body_mass_g": pa.Column( 36 | float, 37 | nullable=True, 38 | ), 39 | "sex": pa.Column( 40 | str, 41 | checks=pa.Check.isin(cats_sex), 42 | nullable=True, 43 | ), 44 | "species": pa.Column( 45 | str, 46 | checks=pa.Check.isin(cats_species), 47 | nullable=True, 48 | ), 49 | } 50 | ) 51 | 52 | schema(df) 53 | -------------------------------------------------------------------------------- /src/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | def clean(df): 2 | """Cleansing: a data pre-processing step. Usually, getting rid of garbage 3 | such as undesired characters. 4 | 5 | Cleansing must be a set of operations independent of data splitting. 6 | 7 | ** Vanilla definition. ** 8 | Include your own code below to import your project's data. 9 | 10 | Parameters 11 | ---------- 12 | df: pandas dataframe 13 | 14 | Returns 15 | ------- 16 | pandas dataframe 17 | Cleansed dataframe 18 | 19 | Examples 20 | -------- 21 | 22 | Raises 23 | ------ 24 | 25 | Notes 26 | ----- 27 | 28 | """ 29 | 30 | return df 31 | 32 | 33 | def normalize(df): 34 | """Normalization: a data pre-processing step. Usually, making adjusting 35 | loser and upper casing, abbrevations, word order, and so on. 36 | 37 | Normalization must be a set of operations independent of data splitting. 38 | 39 | ** Vanilla definition. ** 40 | Include your own code below to import your project's data. 41 | 42 | Parameters 43 | ---------- 44 | df: pandas dataframe 45 | 46 | Returns 47 | ------- 48 | pandas dataframe 49 | Normalized dataframe 50 | 51 | Examples 52 | -------- 53 | 54 | Raises 55 | ------ 56 | 57 | Notes 58 | ----- 59 | 60 | """ 61 | 62 | return df 63 | -------------------------------------------------------------------------------- /examples/vanilla/src/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | def clean(df): 2 | """Cleansing: a data pre-processing step. Usually, getting rid of garbage 3 | such as undesired characters. 4 | 5 | Cleansing must be a set of operations independent of data splitting. 6 | 7 | ** Vanilla definition. ** 8 | Include your own code below to import your project's data. 9 | 10 | Parameters 11 | ---------- 12 | df: pandas dataframe 13 | 14 | Returns 15 | ------- 16 | pandas dataframe 17 | Cleansed dataframe 18 | 19 | Examples 20 | -------- 21 | 22 | Raises 23 | ------ 24 | 25 | Notes 26 | ----- 27 | 28 | """ 29 | 30 | return df 31 | 32 | 33 | def normalize(df): 34 | """Normalization: a data pre-processing step. Usually, making adjusting 35 | loser and upper casing, abbrevations, word order, and so on. 36 | 37 | Normalization must be a set of operations independent of data splitting. 38 | 39 | ** Vanilla definition. ** 40 | Include your own code below to import your project's data. 41 | 42 | Parameters 43 | ---------- 44 | df: pandas dataframe 45 | 46 | Returns 47 | ------- 48 | pandas dataframe 49 | Normalized dataframe 50 | 51 | Examples 52 | -------- 53 | 54 | Raises 55 | ------ 56 | 57 | Notes 58 | ----- 59 | 60 | """ 61 | 62 | return df 63 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | base: 5 | build: 6 | context: . 
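# note: every service below reuses this one build via `image: base_image`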
7 | dockerfile: Dockerfile 8 | image: base_image 9 | 10 | bash: 11 | image: base_image 12 | user: root 13 | volumes: 14 | - .:/usr/app/ 15 | working_dir: /usr/app/ 16 | entrypoint: /bin/sh 17 | 18 | python3: 19 | image: base_image 20 | user: root 21 | volumes: 22 | - .:/usr/app/ 23 | working_dir: /usr/app/ 24 | command: "python3" 25 | 26 | jupyter: 27 | image: base_image 28 | user: root 29 | volumes: 30 | - .:/usr/app/ 31 | working_dir: /usr/app/ 32 | command: > 33 | jupyter lab 34 | --ip=0.0.0.0 35 | --port=8888 36 | --allow-root 37 | --no-browser 38 | --notebook-dir='/usr/app/' 39 | ports: 40 | - 8888:8888 41 | 42 | fastapi: 43 | image: base_image 44 | user: root 45 | volumes: 46 | - .:/usr/app/ 47 | working_dir: /usr/app/ 48 | command: > 49 | uvicorn src.api:app --reload --host 0.0.0.0 50 | ports: 51 | - 8000:8000 52 | 53 | test: 54 | image: base_image 55 | user: root 56 | volumes: 57 | - .:/usr/app/ 58 | working_dir: /usr/app/ 59 | command: "pytest --verbose --capture=no --ignore=examples/" 60 | 61 | run: 62 | image: base_image 63 | user: root 64 | volumes: 65 | - .:/usr/app/ 66 | working_dir: /usr/app/ 67 | command: "python3 src/main.py" 68 | 69 | docs: 70 | image: base_image 71 | user: root 72 | volumes: 73 | - .:/usr/app/ 74 | working_dir: /usr/app/ 75 | environment: 76 | - PYTHONPATH=/usr/app/src/ 77 | command: > 78 | pdoc --docformat "numpy" -h 0.0.0.0 -p 314 src 79 | ports: 80 | - 314:314 -------------------------------------------------------------------------------- /tests/test_feature_engineering.py: -------------------------------------------------------------------------------- 1 | from src import feature_engineering 2 | import pandas as pd 3 | from pandas import _testing 4 | import numpy as np 5 | 6 | 7 | def test_numerical_missing_imputation_twofeatures(): 8 | 9 | df = pd.DataFrame( 10 | { 11 | "a": [1.0, 1.5, 2.0, 0.0, 1.25, np.nan], 12 | "b": [1.0, 1.5, 2.0, 0.0, 0.0, np.nan], 13 | "c": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 14 | "d": ["apple", "apple", "pear", "apple", "pear", "apple"], 15 | } 16 | ) 17 | 18 | expected = pd.DataFrame( 19 | { 20 | "a": [1.0, 1.5, 2.0, 0.0, 1.25, 1.25], 21 | "b": [1.0, 1.5, 2.0, 0.0, 0.0, 1.0], 22 | } 23 | ) 24 | 25 | train, valid, test = feature_engineering.numerical_missing_imputation( 26 | train=df, 27 | valid=df, 28 | test=df, 29 | cols=[ 30 | "a", 31 | "b", 32 | ], 33 | ) 34 | 35 | _testing.assert_frame_equal(train, expected) 36 | 37 | 38 | def test_one_hot_encoding(): 39 | 40 | df = pd.DataFrame( 41 | { 42 | "class": ["a", "b", "c", "a", np.nan], 43 | "col_1": [0.0, 0.0, 0.0, 0.0, 0.0], 44 | "col_2": ["apple", "apple", "pear", "apple", "pear"], 45 | } 46 | ) 47 | 48 | expected = pd.DataFrame( 49 | { 50 | "class_a": [1, 0, 0, 1, 0], 51 | "class_b": [0, 1, 0, 0, 0], 52 | "class_c": [0, 0, 1, 0, 0], 53 | "class_na": [0, 0, 0, 0, 1], 54 | } 55 | ) 56 | 57 | train, valid, test = feature_engineering.one_hot_encoding( 58 | train=df, 59 | valid=df, 60 | test=df, 61 | cols=[ 62 | "class", 63 | ], 64 | ) 65 | 66 | _testing.assert_frame_equal(train, expected) 67 | -------------------------------------------------------------------------------- /src/data_splitting.py: -------------------------------------------------------------------------------- 1 | def split(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=0): 2 | """Data splitting into 3 sets: train, valid, test 3 | 4 | train: training set. Used for training the ML model. 5 | valid: validation set. Used for frequent validation. 6 | test: test set. Used for final test. 
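The three ratios are assumed to sum to 1.0 (defaults: 0.8 / 0.1 / 0.1); `valid` and `test` are carved out of whatever remains after `train` is sampled.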
7 | 8 | Parameters 9 | ---------- 10 | df: pandas dataframe 11 | Input data 12 | 13 | train_ratio: float 14 | Amount of data that goes into training, in percentage 15 | 16 | valid_ratio: float 17 | Amount of data that goes into validation, in percentage 18 | 19 | test_ratio: float 20 | Amount of data that goes into testing, in percentage 21 | 22 | seed: int 23 | Seed for the data shuffling. 24 | It is important to keep it fixed throughout the tuning of the model. 25 | 26 | Returns 27 | ------- 28 | list 29 | (train, valid, test) 30 | (pandas dataframe, pandas dataframe, pandas dataframe) 31 | 32 | Examples 33 | -------- 34 | 35 | >>> len(data) 36 | 100 37 | >>> train, valid, test = split(data) 38 | >>> len(train) 39 | 80 40 | >>> len(valid) 41 | 10 42 | >>> len(test) 43 | 10 44 | 45 | Raises 46 | ------ 47 | 48 | Notes 49 | ----- 50 | 51 | """ 52 | 53 | # Train set extracted from a random sample from `df` 54 | train = df.sample(frac=train_ratio, random_state=seed) 55 | 56 | # Everything from `df` except `train` 57 | rest = df.copy().drop(train.index) 58 | 59 | # Valid set ratio within `rest` 60 | new_ratio = valid_ratio / (valid_ratio + test_ratio) 61 | 62 | # Train set extracted from a random sample from `rest` 63 | valid = rest.sample(frac=new_ratio, random_state=seed) 64 | 65 | # Test set is everything in rest `except` for `valid` 66 | test = rest.drop(valid.index) 67 | 68 | return train, valid, test 69 | -------------------------------------------------------------------------------- /examples/vanilla/src/data_splitting.py: -------------------------------------------------------------------------------- 1 | def split(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=0): 2 | """Data splitting into 3 sets: train, valid, test 3 | 4 | train: training set. Used for training the ML model. 5 | valid: validation set. Used for frequent validation. 6 | test: test set. Used for final test. 7 | 8 | Parameters 9 | ---------- 10 | df: pandas dataframe 11 | Input data 12 | 13 | train_ratio: float 14 | Amount of data that goes into training, in percentage 15 | 16 | valid_ratio: float 17 | Amount of data that goes into validation, in percentage 18 | 19 | test_ratio: float 20 | Amount of data that goes into testing, in percentage 21 | 22 | seed: int 23 | Seed for the data shuffling. 24 | It is important to keep it fixed throughout the tuning of the model. 
25 | 26 | Returns 27 | ------- 28 | list 29 | (train, valid, test) 30 | (pandas dataframe, pandas dataframe, pandas dataframe) 31 | 32 | Examples 33 | -------- 34 | 35 | >>> len(data) 36 | 100 37 | >>> train, valid, test = split(data) 38 | >>> len(train) 39 | 80 40 | >>> len(valid) 41 | 10 42 | >>> len(test) 43 | 10 44 | 45 | Raises 46 | ------ 47 | 48 | Notes 49 | ----- 50 | 51 | """ 52 | 53 | # Train set extracted from a random sample from `df` 54 | train = df.sample(frac=train_ratio, random_state=seed) 55 | 56 | # Everything from `df` except `train` 57 | rest = df.copy().drop(train.index) 58 | 59 | # Valid set ratio within `rest` 60 | new_ratio = valid_ratio / (valid_ratio + test_ratio) 61 | 62 | # Train set extracted from a random sample from `rest` 63 | valid = rest.sample(frac=new_ratio, random_state=seed) 64 | 65 | # Test set is everything in rest `except` for `valid` 66 | test = rest.drop(valid.index) 67 | 68 | return train, valid, test 69 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/data_splitting.py: -------------------------------------------------------------------------------- 1 | def split(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=0): 2 | """Data splitting into 3 sets: train, valid, test 3 | 4 | train: training set. Used for training the ML model. 5 | valid: validation set. Used for frequent validation. 6 | test: test set. Used for final test. 7 | 8 | Parameters 9 | ---------- 10 | df: pandas dataframe 11 | Input data 12 | 13 | train_ratio: float 14 | Amount of data that goes into training, in percentage 15 | 16 | valid_ratio: float 17 | Amount of data that goes into validation, in percentage 18 | 19 | test_ratio: float 20 | Amount of data that goes into testing, in percentage 21 | 22 | seed: int 23 | Seed for the data shuffling. 24 | It is important to keep it fixed throughout the tuning of the model. 
25 | 26 | Returns 27 | ------- 28 | list 29 | (train, valid, test) 30 | (pandas dataframe, pandas dataframe, pandas dataframe) 31 | 32 | Examples 33 | -------- 34 | 35 | >>> len(data) 36 | 100 37 | >>> train, valid, test = split(data) 38 | >>> len(train) 39 | 80 40 | >>> len(valid) 41 | 10 42 | >>> len(test) 43 | 10 44 | 45 | Raises 46 | ------ 47 | 48 | Notes 49 | ----- 50 | 51 | """ 52 | 53 | # Train set extracted from a random sample from `df` 54 | train = df.sample(frac=train_ratio, random_state=seed) 55 | 56 | # Everything from `df` except `train` 57 | rest = df.copy().drop(train.index) 58 | 59 | # Valid set ratio within `rest` 60 | new_ratio = valid_ratio / (valid_ratio + test_ratio) 61 | 62 | # Train set extracted from a random sample from `rest` 63 | valid = rest.sample(frac=new_ratio, random_state=seed) 64 | 65 | # Test set is everything in rest `except` for `valid` 66 | test = rest.drop(valid.index) 67 | 68 | return train, valid, test 69 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/api.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | import joblib 3 | import numpy as np 4 | 5 | from pydantic import BaseModel 6 | 7 | app = FastAPI() 8 | 9 | 10 | class ModelIn(BaseModel): 11 | sex: str 12 | bill_length_mm: str 13 | bill_depth_mm: str 14 | flipper_length_mm: str 15 | body_mass_g: str 16 | 17 | 18 | class ModelOut(BaseModel): 19 | prob_0: float 20 | prob_1: float 21 | prob_2: float 22 | species_code: int 23 | species_name: str 24 | 25 | 26 | app = FastAPI() 27 | 28 | 29 | @app.post( 30 | "/predict/", 31 | response_model=ModelOut, 32 | ) 33 | async def root(input: ModelIn): 34 | 35 | sex_male = {"male": 1, "female": 0, "na": 0}[input.sex] 36 | 37 | sex_female = {"male": 0, "female": 1, "na": 0}[input.sex] 38 | 39 | sex_na = {"male": 0, "female": 0, "na": 1}[input.sex] 40 | 41 | X = [ 42 | [ 43 | int(sex_male), 44 | int(sex_female), 45 | int(sex_na), 46 | float(input.bill_length_mm), 47 | float(input.bill_depth_mm), 48 | float(input.flipper_length_mm), 49 | float(input.body_mass_g), 50 | ], 51 | ] 52 | 53 | model = joblib.load("/usr/app/models/clf_random.joblib") 54 | 55 | out_dict = {} 56 | 57 | out_dict["prob_0"], out_dict["prob_1"], out_dict["prob_2"] = np.transpose( 58 | model.predict_proba(X) 59 | ) 60 | 61 | out_dict["prob_0"] = out_dict["prob_0"][0] 62 | 63 | out_dict["prob_1"] = out_dict["prob_1"][0] 64 | out_dict["prob_2"] = out_dict["prob_2"][0] 65 | 66 | encoder = joblib.load("/usr/app/models/label_encoder.joblib") 67 | 68 | # Recover classes 69 | classes = encoder.classes_ 70 | 71 | # Enumerate classes to recover codes (integers) 72 | # Convert enumerate to dictionary 73 | map_classes = dict(enumerate(classes)) 74 | 75 | code = model.predict(X)[0] 76 | 77 | out_dict["species_code"] = int(code) 78 | out_dict["species_name"] = map_classes[code] 79 | 80 | return out_dict 81 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/performance_monitoring.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import metrics 3 | import json 4 | 5 | import greenhouse_clock 6 | 7 | meta = {} 8 | 9 | # Timestamp for files 10 | meta["timestr"] = greenhouse_clock.get_time() 11 | 12 | 13 | def optimal_threshold(y_true, y_score): 14 | 15 | # Performance extracted from the "ROC curve" 16 | fpr, tpr, thr = metrics.roc_curve( 17 
| y_true=y_true, y_score=y_score, pos_label=1, drop_intermediate=False 18 | ) 19 | 20 | diff = np.abs(tpr - fpr) 21 | 22 | # Numpy index of the maximum separation between TPR and FPR 23 | diff_idx = np.argmax(diff) 24 | 25 | # Optimum threshold based on max diff criterium 26 | return thr[diff_idx] 27 | 28 | 29 | def report_performance( 30 | y_true, y_score, best_hyperparams, path, opt_thr=0.5, suffix="_" 31 | ): 32 | """ 33 | References 34 | ---------- 35 | https://scikit-learn.org/stable/modules/generated/ 36 | sklearn.metrics.classification_report.html 37 | """ 38 | 39 | meta["optimal_hyperparameters"] = best_hyperparams 40 | 41 | meta["optimal_threshold"] = opt_thr 42 | 43 | # Performance extracted from the "ROC curve" 44 | fpr, tpr, thr = metrics.roc_curve( 45 | y_true=y_true, y_score=y_score, pos_label=1, drop_intermediate=False 46 | ) 47 | 48 | meta["AUC"] = metrics.auc(fpr, tpr) 49 | 50 | diff = np.abs(tpr - fpr) 51 | 52 | # Maximum difference between TPR and FPR 53 | meta["max_diff_FPR_TPR"] = np.max(diff) 54 | 55 | # Numpy index of the maximum separation between TPR and FPR 56 | diff_idx = np.argmax(diff) 57 | 58 | # Update optimum threshold based on max diff criterium 59 | meta["threshold_from_max_diff"] = thr[diff_idx] 60 | 61 | # Predicted classes based on "optimal_threshold" 62 | y_pred = [int(k >= opt_thr) for k in y_score] 63 | 64 | meta["classification_report"] = metrics.classification_report( 65 | y_true=y_true, y_pred=y_pred, output_dict=True 66 | ) 67 | 68 | filename = "{0}metadata{1}.json".format(path, suffix) 69 | 70 | # Export to JSON 71 | with open(filename, "w") as fp: 72 | json.dump(meta, fp, indent=4) 73 | 74 | pass 75 | -------------------------------------------------------------------------------- /src/modeling.py: -------------------------------------------------------------------------------- 1 | class VanillaModel: 2 | """Vanilla model where the predictions are always 0""" 3 | 4 | def __init__(self): 5 | 6 | pass 7 | 8 | def fit(self, x, y): 9 | 10 | self.fitted = [0] 11 | 12 | return self 13 | 14 | def predict(self, x): 15 | 16 | return self.fitted * len(x) 17 | 18 | 19 | class model: 20 | """ 21 | Replace below `VanillaModel` by an actual ML 22 | model such as the ones provided by sklearn. 23 | 24 | We are assuming supervised models (a and y are available), 25 | but you may also adapt it for unsupervised models 26 | (only x available). In that case, erase any reference to 27 | `y` below. 
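As a minimal sketch (assuming scikit-learn were added to requirements.txt), the `fit` below could become:

    from sklearn.linear_model import LogisticRegression
    # sklearn expects a 2D feature matrix, hence the double brackets
    self.m = LogisticRegression().fit(train[[x_col]], train[y_col])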
28 | 29 | References 30 | ---------- 31 | https://scikit-learn.org/stable/ 32 | """ 33 | 34 | def __init__(self): 35 | 36 | pass 37 | 38 | def fit(self, train, y_col, x_col): 39 | 40 | self.x_col = x_col 41 | self.y_col = y_col 42 | 43 | self.m = VanillaModel().fit(x=train[x_col], y=train[y_col]) 44 | 45 | # Save your model in /models 46 | 47 | # Note: for saving your model, we suggest using the 48 | # `joblib` python package 49 | 50 | # Ex: path "/usr/app/models/" 51 | # joblib.dump(self.m, path) 52 | 53 | return self 54 | 55 | def transform_sets(self, train, valid, test): 56 | 57 | x_train = train[self.x_col].values 58 | x_valid = valid[self.x_col].values 59 | x_test = test[self.x_col].values 60 | 61 | y_train = train[self.y_col].values 62 | y_valid = valid[self.y_col].values 63 | y_test = test[self.y_col].values 64 | 65 | train_out = train.copy(deep=True)[self.y_col] 66 | valid_out = valid.copy(deep=True)[self.y_col] 67 | test_out = test.copy(deep=True)[self.y_col] 68 | 69 | train_out["actual"] = y_train 70 | valid_out["actual"] = y_valid 71 | test_out["actual"] = y_test 72 | 73 | # Predict 74 | train_out["pred"] = (self.m).predict(x_train) 75 | valid_out["pred"] = (self.m).predict(x_valid) 76 | test_out["pred"] = (self.m).predict(x_test) 77 | 78 | return train_out, valid_out, test_out 79 | -------------------------------------------------------------------------------- /examples/vanilla/src/modeling.py: -------------------------------------------------------------------------------- 1 | class VanillaModel: 2 | """Vanilla model where the predictions are always 0""" 3 | 4 | def __init__(self): 5 | 6 | pass 7 | 8 | def fit(self, x, y): 9 | 10 | self.fitted = [0] 11 | 12 | return self 13 | 14 | def predict(self, x): 15 | 16 | return self.fitted * len(x) 17 | 18 | 19 | class model: 20 | """ 21 | Replace below `VanillaModel` by an actual ML 22 | model such as the ones provided by sklearn. 23 | 24 | We are assuming supervised models (a and y are available), 25 | but you may also adapt it for unsupervised models 26 | (only x available). In that case, erase any reference to 27 | `y` below. 
28 | 29 | References 30 | ---------- 31 | https://scikit-learn.org/stable/ 32 | """ 33 | 34 | def __init__(self): 35 | 36 | pass 37 | 38 | def fit(self, train, y_col, x_col): 39 | 40 | self.x_col = x_col 41 | self.y_col = y_col 42 | 43 | self.m = VanillaModel().fit(x=train[x_col], y=train[y_col]) 44 | 45 | # Save your model in /models 46 | 47 | # Note: for saving your model, we suggest using the 48 | # `joblib` python package 49 | 50 | # Ex: path "/usr/app/models/" 51 | # joblib.dump(self.m, path) 52 | 53 | return self 54 | 55 | def transform_sets(self, train, valid, test): 56 | 57 | x_train = train[self.x_col].values 58 | x_valid = valid[self.x_col].values 59 | x_test = test[self.x_col].values 60 | 61 | y_train = train[self.y_col].values 62 | y_valid = valid[self.y_col].values 63 | y_test = test[self.y_col].values 64 | 65 | train_out = train.copy(deep=True)[self.y_col] 66 | valid_out = valid.copy(deep=True)[self.y_col] 67 | test_out = test.copy(deep=True)[self.y_col] 68 | 69 | train_out["actual"] = y_train 70 | valid_out["actual"] = y_valid 71 | test_out["actual"] = y_test 72 | 73 | # Predict 74 | train_out["pred"] = (self.m).predict(x_train) 75 | valid_out["pred"] = (self.m).predict(x_valid) 76 | test_out["pred"] = (self.m).predict(x_test) 77 | 78 | return train_out, valid_out, test_out 79 | -------------------------------------------------------------------------------- /tests/test_data_splitting.py: -------------------------------------------------------------------------------- 1 | from src import data_splitting 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def df_10_rows(): 8 | 9 | return pd.DataFrame( 10 | { 11 | "col_1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 12 | "col_2": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], 13 | } 14 | ) 15 | 16 | 17 | def test_data_splitting_train_ratio(df_10_rows): 18 | 19 | train, valid, test = data_splitting.split( 20 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 21 | ) 22 | 23 | assert len(train) == 7 24 | 25 | 26 | def test_data_splitting_valid_ratio(df_10_rows): 27 | 28 | train, valid, test = data_splitting.split( 29 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 30 | ) 31 | 32 | assert len(valid) == 2 33 | 34 | 35 | def test_data_splitting_test_ratio(df_10_rows): 36 | 37 | train, valid, test = data_splitting.split( 38 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 39 | ) 40 | 41 | assert len(test) == 1 42 | 43 | 44 | def test_data_splitting_train_vs_valid(df_10_rows): 45 | 46 | train, valid, test = data_splitting.split( 47 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 48 | ) 49 | 50 | df_check = train.merge( 51 | valid, 52 | how="inner", 53 | right_on=["col_1", "col_2"], 54 | left_on=["col_1", "col_2"], 55 | sort=False, 56 | ) 57 | 58 | assert df_check.empty 59 | 60 | 61 | def test_data_splitting_train_vs_test(df_10_rows): 62 | 63 | train, valid, test = data_splitting.split( 64 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 65 | ) 66 | 67 | df_check = train.merge( 68 | test, 69 | how="inner", 70 | right_on=["col_1", "col_2"], 71 | left_on=["col_1", "col_2"], 72 | sort=False, 73 | ) 74 | 75 | assert df_check.empty 76 | 77 | 78 | def test_data_splitting_valid_vs_test(df_10_rows): 79 | 80 | train, valid, test = data_splitting.split( 81 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 82 | ) 83 | 84 | df_check = valid.merge( 85 | test, 86 | how="inner", 87 | right_on=["col_1", 
"col_2"], 88 | left_on=["col_1", "col_2"], 89 | sort=False, 90 | ) 91 | 92 | assert df_check.empty 93 | -------------------------------------------------------------------------------- /.dvc/plots/confusion.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "facet": { 7 | "field": "rev", 8 | "type": "nominal" 9 | }, 10 | "spec": { 11 | "encoding": { 12 | "x": { 13 | "field": "", 14 | "sort": "ascending", 15 | "title": "", 16 | "type": "nominal" 17 | }, 18 | "y": { 19 | "field": "", 20 | "sort": "ascending", 21 | "title": "", 22 | "type": "nominal" 23 | } 24 | }, 25 | "layer": [ 26 | { 27 | "encoding": { 28 | "color": { 29 | "field": "xy_count", 30 | "scale": { 31 | "domainMin": 0, 32 | "nice": true 33 | }, 34 | "title": "", 35 | "type": "quantitative" 36 | } 37 | }, 38 | "height": 300, 39 | "mark": "rect", 40 | "width": 300 41 | }, 42 | { 43 | "encoding": { 44 | "color": { 45 | "condition": { 46 | "test": "datum.percent_of_max > 0.5", 47 | "value": "white" 48 | }, 49 | "value": "black" 50 | }, 51 | "text": { 52 | "field": "xy_count", 53 | "type": "quantitative" 54 | } 55 | }, 56 | "mark": "text" 57 | } 58 | ], 59 | "transform": [ 60 | { 61 | "aggregate": [ 62 | { 63 | "as": "xy_count", 64 | "op": "count" 65 | } 66 | ], 67 | "groupby": [ 68 | "", 69 | "" 70 | ] 71 | }, 72 | { 73 | "groupby": [ 74 | "rev", 75 | "" 76 | ], 77 | "impute": "xy_count", 78 | "key": "", 79 | "value": 0 80 | }, 81 | { 82 | "groupby": [ 83 | "rev", 84 | "" 85 | ], 86 | "impute": "xy_count", 87 | "key": "", 88 | "value": 0 89 | }, 90 | { 91 | "groupby": [], 92 | "joinaggregate": [ 93 | { 94 | "as": "max_count", 95 | "field": "xy_count", 96 | "op": "max" 97 | } 98 | ] 99 | }, 100 | { 101 | "as": "percent_of_max", 102 | "calculate": "datum.xy_count / datum.max_count" 103 | } 104 | ] 105 | }, 106 | "title": "" 107 | } 108 | -------------------------------------------------------------------------------- /.dvc/plots/scatter.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "height": 300, 7 | "layer": [ 8 | { 9 | "encoding": { 10 | "color": { 11 | "field": "rev", 12 | "type": "nominal" 13 | }, 14 | "x": { 15 | "field": "", 16 | "title": "", 17 | "type": "quantitative" 18 | }, 19 | "y": { 20 | "field": "", 21 | "scale": { 22 | "zero": false 23 | }, 24 | "title": "", 25 | "type": "quantitative" 26 | } 27 | }, 28 | "layer": [ 29 | { 30 | "mark": "point" 31 | }, 32 | { 33 | "encoding": { 34 | "opacity": { 35 | "condition": { 36 | "selection": "label", 37 | "value": 1 38 | }, 39 | "value": 0 40 | } 41 | }, 42 | "mark": "point", 43 | "selection": { 44 | "label": { 45 | "clear": "mouseout", 46 | "empty": "none", 47 | "encodings": [ 48 | "x" 49 | ], 50 | "nearest": true, 51 | "on": "mouseover", 52 | "type": "single" 53 | } 54 | } 55 | } 56 | ] 57 | }, 58 | { 59 | "layer": [ 60 | { 61 | "encoding": { 62 | "text": { 63 | "field": "", 64 | "type": "quantitative" 65 | }, 66 | "x": { 67 | "field": "", 68 | "type": "quantitative" 69 | }, 70 | "y": { 71 | "field": "", 72 | "type": "quantitative" 73 | } 74 | }, 75 | "layer": [ 76 | { 77 | "encoding": { 78 | "color": { 79 | "field": "rev", 80 | "type": "nominal" 81 | } 82 | }, 83 | "mark": { 84 | "align": "left", 85 | "dx": 5, 86 | "dy": -5, 87 | "type": "text" 88 | } 89 | } 90 | ] 91 | } 92 | 
], 93 | "transform": [ 94 | { 95 | "filter": { 96 | "selection": "label" 97 | } 98 | } 99 | ] 100 | } 101 | ], 102 | "title": "", 103 | "width": 300 104 | } 105 | -------------------------------------------------------------------------------- /.dvc/plots/confusion_normalized.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "facet": { 7 | "field": "rev", 8 | "type": "nominal" 9 | }, 10 | "spec": { 11 | "encoding": { 12 | "x": { 13 | "field": "", 14 | "sort": "ascending", 15 | "title": "", 16 | "type": "nominal" 17 | }, 18 | "y": { 19 | "field": "", 20 | "sort": "ascending", 21 | "title": "", 22 | "type": "nominal" 23 | } 24 | }, 25 | "layer": [ 26 | { 27 | "encoding": { 28 | "color": { 29 | "field": "percent_of_y", 30 | "scale": { 31 | "domain": [ 32 | 0, 33 | 1 34 | ] 35 | }, 36 | "title": "", 37 | "type": "quantitative" 38 | } 39 | }, 40 | "height": 300, 41 | "mark": "rect", 42 | "width": 300 43 | }, 44 | { 45 | "encoding": { 46 | "color": { 47 | "condition": { 48 | "test": "datum.percent_of_y > 0.5", 49 | "value": "white" 50 | }, 51 | "value": "black" 52 | }, 53 | "text": { 54 | "field": "percent_of_y", 55 | "format": ".2f", 56 | "type": "quantitative" 57 | } 58 | }, 59 | "mark": "text" 60 | } 61 | ], 62 | "transform": [ 63 | { 64 | "aggregate": [ 65 | { 66 | "as": "xy_count", 67 | "op": "count" 68 | } 69 | ], 70 | "groupby": [ 71 | "", 72 | "" 73 | ] 74 | }, 75 | { 76 | "groupby": [ 77 | "rev", 78 | "" 79 | ], 80 | "impute": "xy_count", 81 | "key": "", 82 | "value": 0 83 | }, 84 | { 85 | "groupby": [ 86 | "rev", 87 | "" 88 | ], 89 | "impute": "xy_count", 90 | "key": "", 91 | "value": 0 92 | }, 93 | { 94 | "groupby": [ 95 | "" 96 | ], 97 | "joinaggregate": [ 98 | { 99 | "as": "sum_y", 100 | "field": "xy_count", 101 | "op": "sum" 102 | } 103 | ] 104 | }, 105 | { 106 | "as": "percent_of_y", 107 | "calculate": "datum.xy_count / datum.sum_y" 108 | } 109 | ] 110 | }, 111 | "title": "" 112 | } 113 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/feature_engineering.py: -------------------------------------------------------------------------------- 1 | from feature_engine import encoding, imputation 2 | 3 | 4 | def numerical_missing_imputation(train, valid, test, cols, imputation_method="median"): 5 | """Missing imputation for numerical variables. 6 | 7 | The algorithm learns from the train set and applies transformations 8 | to all three input datasets: train, valid, test. 9 | 10 | Parameters 11 | ---------- 12 | train: pandas dataframe 13 | Training set 14 | 15 | valid: pandas dataframe 16 | Validation set 17 | 18 | test: pandas dataframe 19 | Test set 20 | 21 | cols: list 22 | List of numerical columns 23 | 24 | imputation_method: string 25 | Desired method of imputation. Options are 'mean' and 'median'. 26 | Default value: 'median'. 
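Fitting on `train` only (and merely transforming `valid` and `test`) keeps the imputation statistics free of information leaked from the evaluation sets.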
27 | 28 | Returns 29 | ------- 30 | list 31 | (train, valid, test) 32 | (pandas dataframe, pandas dataframe, pandas dataframe) 33 | 34 | Examples 35 | -------- 36 | 37 | Raises 38 | ------ 39 | 40 | Notes 41 | ----- 42 | 43 | """ 44 | 45 | fe = imputation.MeanMedianImputer( 46 | imputation_method=imputation_method, variables=cols 47 | ) 48 | 49 | # Fit over training set 50 | fe.fit(train[cols]) 51 | 52 | # Apply to train, valid, test 53 | return ( 54 | fe.transform(train[cols]), 55 | fe.transform(valid[cols]), 56 | fe.transform(test[cols]), 57 | ) 58 | 59 | 60 | def one_hot_encoding(train, valid, test, cols): 61 | """One-hot-encoding of all categories found in `cols`. 62 | 63 | The algorithm learns from the train set and applies transformations 64 | to all three input datasets: train, valid, test. 65 | 66 | Missing values in col lead to col_na=1 67 | 68 | Parameters 69 | ---------- 70 | train: pandas dataframe 71 | Training set 72 | 73 | valid: pandas dataframe 74 | Validation set 75 | 76 | test: pandas dataframe 77 | Test set 78 | 79 | cols: list 80 | List of numerical columns 81 | 82 | Returns 83 | ------- 84 | list 85 | (train, valid, test) 86 | (pandas dataframe, pandas dataframe, pandas dataframe) 87 | 88 | Examples 89 | -------- 90 | 91 | Raises 92 | ------ 93 | 94 | Notes 95 | ----- 96 | 97 | """ 98 | 99 | fe = encoding.OneHotEncoder(variables=cols) 100 | 101 | for k in cols: 102 | train[k] = train[k].fillna("na") 103 | valid[k] = valid[k].fillna("na") 104 | test[k] = test[k].fillna("na") 105 | 106 | # Fit over training set 107 | fe.fit(train[cols]) 108 | 109 | # Apply to train, valid, test 110 | return ( 111 | fe.transform(train[cols]), 112 | fe.transform(valid[cols]), 113 | fe.transform(test[cols]), 114 | ) 115 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | # Local logs of the project 141 | logs/ 142 | 143 | # Local monitoring reports of the project 144 | monitor/ 145 | 146 | # Data directory 147 | data/*.csv 148 | 149 | # Models directory 150 | models/ 151 | /monitor 152 | -------------------------------------------------------------------------------- /.dvc/plots/linear.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "height": 300, 7 | "layer": [ 8 | { 9 | "encoding": { 10 | "color": { 11 | "field": "rev", 12 | "type": "nominal" 13 | }, 14 | "x": { 15 | "field": "", 16 | "title": "", 17 | "type": "quantitative" 18 | }, 19 | "y": { 20 | "field": "", 21 | "scale": { 22 | "zero": false 23 | }, 24 | "title": "", 25 | "type": "quantitative" 26 | } 27 | }, 28 | "layer": [ 29 | { 30 | "mark": "line" 31 | }, 32 | { 33 | "encoding": { 34 | "opacity": { 35 | "condition": { 36 | "selection": "label", 37 | "value": 1 38 | }, 39 | "value": 0 40 | } 41 | }, 42 | "mark": "point", 43 | "selection": { 44 | "label": { 45 | "clear": "mouseout", 46 | "empty": "none", 47 | "encodings": [ 48 | "x" 49 | ], 50 | "nearest": true, 51 | "on": "mouseover", 52 | "type": "single" 53 | } 54 | } 55 | } 56 | ] 57 | }, 58 | { 59 | "layer": [ 60 | { 61 | "encoding": { 62 | "x": { 63 | "field": "", 64 | "type": "quantitative" 65 | 
} 66 | }, 67 | "mark": { 68 | "color": "gray", 69 | "type": "rule" 70 | } 71 | }, 72 | { 73 | "encoding": { 74 | "text": { 75 | "field": "", 76 | "type": "quantitative" 77 | }, 78 | "x": { 79 | "field": "", 80 | "type": "quantitative" 81 | }, 82 | "y": { 83 | "field": "", 84 | "type": "quantitative" 85 | } 86 | }, 87 | "layer": [ 88 | { 89 | "encoding": { 90 | "color": { 91 | "field": "rev", 92 | "type": "nominal" 93 | } 94 | }, 95 | "mark": { 96 | "align": "left", 97 | "dx": 5, 98 | "dy": -5, 99 | "type": "text" 100 | } 101 | } 102 | ] 103 | } 104 | ], 105 | "transform": [ 106 | { 107 | "filter": { 108 | "selection": "label" 109 | } 110 | } 111 | ] 112 | } 113 | ], 114 | "title": "", 115 | "width": 300 116 | } 117 | -------------------------------------------------------------------------------- /src/feature_engineering.py: -------------------------------------------------------------------------------- 1 | from feature_engine import encoding, imputation 2 | 3 | # Note: we suggest using the below helper functions 4 | # for missing imputation (for numerical) and 5 | # one-hot-encoding (for categorical). 6 | # You will find most of other popular Feature 7 | # Engineering methods in the `feature_engine` 8 | # python package. 9 | 10 | 11 | def numerical_missing_imputation(train, valid, test, cols, imputation_method="median"): 12 | """Missing imputation for numerical variables. 13 | 14 | The algorithm learns from the train set and applies transformations 15 | to all three input datasets: train, valid, test. 16 | 17 | Parameters 18 | ---------- 19 | train: pandas dataframe 20 | Training set 21 | 22 | valid: pandas dataframe 23 | Validation set 24 | 25 | test: pandas dataframe 26 | Test set 27 | 28 | cols: list 29 | List of numerical columns 30 | 31 | imputation_method: string 32 | Desired method of imputation. Options are 'mean' and 'median'. 33 | Default value: 'median'. 34 | 35 | Returns 36 | ------- 37 | list 38 | (train, valid, test) 39 | (pandas dataframe, pandas dataframe, pandas dataframe) 40 | 41 | Examples 42 | -------- 43 | 44 | Raises 45 | ------ 46 | 47 | Notes 48 | ----- 49 | 50 | """ 51 | 52 | fe = imputation.MeanMedianImputer( 53 | imputation_method=imputation_method, variables=cols 54 | ) 55 | 56 | # Fit over training set 57 | fe.fit(train[cols]) 58 | 59 | # Apply to train, valid, test 60 | return ( 61 | fe.transform(train[cols]), 62 | fe.transform(valid[cols]), 63 | fe.transform(test[cols]), 64 | ) 65 | 66 | 67 | def one_hot_encoding(train, valid, test, cols): 68 | """One-hot-encoding of all categories found in `cols`. 69 | 70 | The algorithm learns from the train set and applies transformations 71 | to all three input datasets: train, valid, test. 
72 | 73 | Missing values in col lead to col_na=1 74 | 75 | Parameters 76 | ---------- 77 | train: pandas dataframe 78 | Training set 79 | 80 | valid: pandas dataframe 81 | Validation set 82 | 83 | test: pandas dataframe 84 | Test set 85 | 86 | cols: list 87 | List of numerical columns 88 | 89 | Returns 90 | ------- 91 | list 92 | (train, valid, test) 93 | (pandas dataframe, pandas dataframe, pandas dataframe) 94 | 95 | Examples 96 | -------- 97 | 98 | Raises 99 | ------ 100 | 101 | Notes 102 | ----- 103 | 104 | """ 105 | 106 | fe = encoding.OneHotEncoder(variables=cols) 107 | 108 | for k in cols: 109 | train[k] = train[k].fillna("na") 110 | valid[k] = valid[k].fillna("na") 111 | test[k] = test[k].fillna("na") 112 | 113 | # Fit over training set 114 | fe.fit(train[cols]) 115 | 116 | # Apply to train, valid, test 117 | return ( 118 | fe.transform(train[cols]), 119 | fe.transform(valid[cols]), 120 | fe.transform(test[cols]), 121 | ) 122 | -------------------------------------------------------------------------------- /examples/vanilla/src/feature_engineering.py: -------------------------------------------------------------------------------- 1 | from feature_engine import encoding, imputation 2 | 3 | # Note: we suggest using the below helper functions 4 | # for missing imputation (for numerical) and 5 | # one-hot-encoding (for categorical). 6 | # You will find most of other popular Feature 7 | # Engineering methods in the `feature_engine` 8 | # python package. 9 | 10 | 11 | def numerical_missing_imputation(train, valid, test, cols, imputation_method="median"): 12 | """Missing imputation for numerical variables. 13 | 14 | The algorithm learns from the train set and applies transformations 15 | to all three input datasets: train, valid, test. 16 | 17 | Parameters 18 | ---------- 19 | train: pandas dataframe 20 | Training set 21 | 22 | valid: pandas dataframe 23 | Validation set 24 | 25 | test: pandas dataframe 26 | Test set 27 | 28 | cols: list 29 | List of numerical columns 30 | 31 | imputation_method: string 32 | Desired method of imputation. Options are 'mean' and 'median'. 33 | Default value: 'median'. 34 | 35 | Returns 36 | ------- 37 | list 38 | (train, valid, test) 39 | (pandas dataframe, pandas dataframe, pandas dataframe) 40 | 41 | Examples 42 | -------- 43 | 44 | Raises 45 | ------ 46 | 47 | Notes 48 | ----- 49 | 50 | """ 51 | 52 | fe = imputation.MeanMedianImputer( 53 | imputation_method=imputation_method, variables=cols 54 | ) 55 | 56 | # Fit over training set 57 | fe.fit(train[cols]) 58 | 59 | # Apply to train, valid, test 60 | return ( 61 | fe.transform(train[cols]), 62 | fe.transform(valid[cols]), 63 | fe.transform(test[cols]), 64 | ) 65 | 66 | 67 | def one_hot_encoding(train, valid, test, cols): 68 | """One-hot-encoding of all categories found in `cols`. 69 | 70 | The algorithm learns from the train set and applies transformations 71 | to all three input datasets: train, valid, test. 
72 | 73 | Missing values in col lead to col_na=1 74 | 75 | Parameters 76 | ---------- 77 | train: pandas dataframe 78 | Training set 79 | 80 | valid: pandas dataframe 81 | Validation set 82 | 83 | test: pandas dataframe 84 | Test set 85 | 86 | cols: list 87 | List of numerical columns 88 | 89 | Returns 90 | ------- 91 | list 92 | (train, valid, test) 93 | (pandas dataframe, pandas dataframe, pandas dataframe) 94 | 95 | Examples 96 | -------- 97 | 98 | Raises 99 | ------ 100 | 101 | Notes 102 | ----- 103 | 104 | """ 105 | 106 | fe = encoding.OneHotEncoder(variables=cols) 107 | 108 | for k in cols: 109 | train[k] = train[k].fillna("na") 110 | valid[k] = valid[k].fillna("na") 111 | test[k] = test[k].fillna("na") 112 | 113 | # Fit over training set 114 | fe.fit(train[cols]) 115 | 116 | # Apply to train, valid, test 117 | return ( 118 | fe.transform(train[cols]), 119 | fe.transform(valid[cols]), 120 | fe.transform(test[cols]), 121 | ) 122 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | These contributing guidelines were designed both for the original [Greenhouse repo template](https://github.com/felipepenha/py-greenhouse) and any Python projects derived from that template. 4 | 5 | If you just want to use the Greenhouse template for your new cool Data X or Machine Learning project, please choose the option ["Use this Template"](https://github.com/felipepenha/py-greenhouse/generate). 6 | 7 | # Bugs, questions, or suggestions 8 | 9 | In case you found bugs, would like to ask a question, or have suggestions to offer, feel free to use the [Issues Section](https://github.com/felipepenha/py-greenhouse/issues) in the GitHub repository. 10 | 11 | 12 | # New Features and/or improvements 13 | 14 | 15 | ## Cloning 16 | 17 | Clone the repository locally: 18 | 19 | ```git 20 | $ git clone https://github.com/felipepenha/py-greenhouse.git 21 | $ git pull dev 22 | $ git branch dev 23 | ``` 24 | 25 | Start working on a new branch copied from the `dev` branch: 26 | 27 | ```git 28 | $ git checkout -b [new_branch_name] 29 | $ git branch --set-upstream-to=origin/dev [new_branch_name] 30 | ``` 31 | 32 | 33 | ## Adding and Commiting 34 | 35 | Use `git add [target]` and `git commit -m [message]` normally, at this point. Every time you commit, pre-commit hooks are triggered and your code will be linted and tested. If it fails on the first pass, you will need to git add again and commit. 36 | 37 | Alternatively, run `make add-commit` (see also [issue #17](https://github.com/felipepenha/py-greenhouse/issues/17)). 38 | 39 | 40 | ## Dealing With Inconsistencies 41 | 42 | In case you new branch gets behind `dev`, you may correct it by performing 43 | 44 | ```git 45 | $ git stash save 46 | $ git pull [new_branch_name] 47 | $ git stash pop 48 | ``` 49 | 50 | You may have to deal with the inconsistencies that may arise from that process before proceeding. 51 | 52 | If you want to make your branch available online: 53 | 54 | ```git 55 | $ git push origin [new_branch_name] 56 | ``` 57 | 58 | 59 | ## New Releases 60 | 61 | Instructions for a new release: 62 | 63 | 1. Check which is the latest version (Ex: `0.0.1`); 64 | 2. Change the field `version` in `version.toml` (Ex: `version="0.0.2"`); 65 | 3. Add your name and email address to the field `authors` in `version.toml`; and 66 | 4. 
Run: 67 | ```bash 68 | $ make release 69 | ``` 70 | 71 | The above command will take care of checking the version in `version.toml` and releasing your code on `dev` with a tag consistent with `version.toml`. 72 | 73 | ## Pull Requests to the Main Branch 74 | 75 | New releases will usually be pulled/merged to `main` and need approval. 76 | 77 | # Conventions 78 | 79 | ## Commit Messages 80 | 81 | [conventionalcommits.org v1.0.0](https://www.conventionalcommits.org/en/v1.0.0/) 82 | 83 | Some common commit messages you will find in the project: 84 | 85 | ```git 86 | "docs:" 87 | "fix:" 88 | "feat:" 89 | "refactor:" 90 | "test:" 91 | ``` 92 | 93 | 94 | ## Versioning 95 | 96 | [Semantic Versioning 2.0.0](https://semver.org/) 97 | 98 | ## Docstrings 99 | 100 | [Numpy Docstrings](https://numpydoc.readthedocs.io/en/latest/format.html) 101 | 102 | A useful template: 103 | 104 | ```python 105 | def func(x): 106 | """[Summary] 107 | 108 | Parameters 109 | ---------- 110 | x: type 111 | [description] 112 | 113 | Returns 114 | ------- 115 | type 116 | [description] 117 | 118 | Examples 119 | -------- 120 | 121 | Raises 122 | ------ 123 | 124 | Notes 125 | ----- 126 | 127 | """ 128 | ``` -------------------------------------------------------------------------------- /examples/palmer_penguins/src/modeling.py: -------------------------------------------------------------------------------- 1 | from sklearn import preprocessing 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.model_selection import RandomizedSearchCV 6 | import joblib 7 | 8 | 9 | class model: 10 | def __init__(self): 11 | 12 | pass 13 | 14 | def fit(self, train, y_col, x_col, n_jobs=1, seed=1): 15 | 16 | self.x_col = x_col 17 | self.y_col = y_col 18 | 19 | x_train = train[self.x_col].values 20 | 21 | self.le = preprocessing.LabelEncoder() 22 | 23 | # Trin encoder over training set 24 | (self.le).fit(train[self.y_col].values.ravel()) 25 | 26 | path = "/usr/app/models/label_encoder.joblib" 27 | 28 | joblib.dump(self.le, path) 29 | 30 | y_train = (self.le).transform(train[self.y_col].values.ravel()) 31 | 32 | # Store the grid in a dictionary 33 | grid = {} 34 | 35 | grid["max_features"] = [4, 5] 36 | grid["max_depth"] = [4, 5] 37 | grid["n_estimators"] = [50, 75, 200] 38 | 39 | clf = RandomForestClassifier(random_state=seed) 40 | 41 | self.clf_random = RandomizedSearchCV( 42 | estimator=clf, 43 | param_distributions=grid, 44 | n_iter=10, 45 | cv=None, 46 | verbose=2, 47 | random_state=seed, 48 | n_jobs=n_jobs, 49 | ) 50 | 51 | # Train model over training set 52 | (self.clf_random).fit(x_train, y_train.ravel()) 53 | 54 | path = "/usr/app/models/clf_random.joblib" 55 | 56 | joblib.dump(self.clf_random, path) 57 | 58 | def transform_sets(self, train, valid, test): 59 | 60 | x_train = train[self.x_col].values 61 | x_valid = valid[self.x_col].values 62 | x_test = test[self.x_col].values 63 | 64 | y_train = (self.le).transform(train[self.y_col].values.ravel()) 65 | y_valid = (self.le).transform(valid[self.y_col].values.ravel()) 66 | y_test = (self.le).transform(test[self.y_col].values.ravel()) 67 | 68 | train_out = train.copy(deep=True)[self.y_col] 69 | valid_out = valid.copy(deep=True)[self.y_col] 70 | test_out = test.copy(deep=True)[self.y_col] 71 | 72 | train_out["actual"] = y_train 73 | valid_out["actual"] = y_valid 74 | test_out["actual"] = y_test 75 | 76 | # Predict 77 | train_out["pred"] = (self.clf_random).predict(x_train) 78 | valid_out["pred"] = 
(self.clf_random).predict(x_valid) 79 | test_out["pred"] = (self.clf_random).predict(x_test) 80 | 81 | train_out["prob_0"], train_out["prob_1"], train_out["prob_2"] = np.transpose( 82 | (self.clf_random).predict_proba(x_train) 83 | ) 84 | valid_out["prob_0"], valid_out["prob_1"], valid_out["prob_2"] = np.transpose( 85 | (self.clf_random).predict_proba(x_valid) 86 | ) 87 | test_out["prob_0"], test_out["prob_1"], test_out["prob_2"] = np.transpose( 88 | (self.clf_random).predict_proba(x_test) 89 | ) 90 | 91 | return train_out, valid_out, test_out, (self.clf_random).best_params_ 92 | 93 | def transform_new(self, obs): 94 | """ 95 | obs: pandas dataframe 96 | """ 97 | 98 | x_obs = obs[self.x_col].values 99 | 100 | # Predict 101 | obs_out = pd.DataFrame({"pred": (self.clf_random).predict(x_obs)}) 102 | 103 | obs_out["prob_0"], obs_out["prob_1"], obs_out["prob_2"] = np.transpose( 104 | (self.clf_random).predict_proba(x_obs) 105 | ) 106 | 107 | return obs_out 108 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # The present file, 'Makefile' has been modified from the original at 2 | # https://github.com/NeowayLabs/data-science-template 3 | # under the folllowing license: 4 | # 5 | # MIT License 6 | # 7 | # Copyright (c) 2019 Neoway Business Solution 8 | # 9 | # Permission is hereby granted, free of charge, to any person obtaining a copy 10 | # of this software and associated documentation files (the "Software"), to deal 11 | # in the Software without restriction, including without limitation the rights 12 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | # copies of the Software, and to permit persons to whom the Software is 14 | # furnished to do so, subject to the following conditions: 15 | # 16 | # The above copyright notice and this permission notice shall be included in all 17 | # copies or substantial portions of the Software. 18 | # 19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | # SOFTWARE. 26 | 27 | BUILD = docker-compose build 28 | RUN = docker-compose run 29 | VERSION = $(shell awk -F ' = ' '$$1 ~ /version/ { gsub(/[\"]/, "", $$2); printf("%s",$$2) }' version.toml) 30 | MAKEFILE_ABS_PATH = $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 31 | 32 | help: 33 | @echo "USAGE" 34 | @echo 35 | @echo " make " 36 | @echo " Include 'sudo' when necessary." 
37 | @echo " To avoid using sudo, follow the steps in" 38 | @echo " https://docs.docker.com/engine/install/linux-postinstall/" 39 | @echo 40 | @echo 41 | @echo "COMMANDS" 42 | @echo 43 | @echo " add-commit git add, pre-commit, and commit" 44 | @echo " bash bash REPL (Read-Eval-Print loop), suitable for debugging" 45 | @echo " build build image using cache" 46 | @echo " build-no-cache build image from scratch, and not from cache" 47 | @echo " docs show the src modules documentation on the browser" 48 | @echo " dvc runs dvc commands for model versioning and comparison" 49 | @echo " fastapi starts up fastapi" 50 | @echo " jupyter access Python through the Jupyter Notebook" 51 | @echo " palmer-penguins moves files in examples/palmer_penguins to the main dir" 52 | @echo " pre-commit early run of pre-commit git hooks" 53 | @echo " python3 access Python through the REPL (Read-Eval-Print loop)" 54 | @echo " release release on dev branch. \ 55 | Be sure to update version.toml before running this operation" 56 | @echo " run run src/main.py" 57 | @echo " vanilla moves files in examples/vanilla to the main dir" 58 | @echo " test run all tests using pytest (from within the container)" 59 | @echo " test-no-log same as test but without log generation" 60 | 61 | ################# 62 | # User Commands # 63 | ################# 64 | 65 | build: 66 | mkdir --parents monitor 67 | mkdir --parents logs 68 | mkdir --parent models 69 | $(BUILD) 70 | 71 | build-no-cache: 72 | mkdir --parents monitor 73 | mkdir --parents logs 74 | $(BUILD) --no-cache 75 | 76 | bash: 77 | $(RUN) bash 78 | 79 | python3: 80 | $(RUN) python3 81 | 82 | jupyter: 83 | $(RUN) --service-ports jupyter 84 | 85 | fastapi: 86 | $(RUN) --service-ports fastapi 87 | 88 | test-no-log: 89 | $(RUN) test 90 | 91 | test: 92 | # test and append log to file including datetime in UTC 93 | (date --utc && $(RUN) test) 2>&1 | tee -ai logs/log_test.txt 94 | 95 | run: 96 | # run and append log to file including datetime in UTC 97 | (date --utc && $(RUN) run) 2>&1 | tee -ai logs/log_run.txt 98 | 99 | pre-commit: 100 | pre-commit run --all-files 101 | 102 | dvc: 103 | - dvc checkout 104 | # DVC pipeline 105 | - dvc repro 106 | # Trigger dvc metrics diff file logging 107 | # $(compare-to) is the git rev you are comparing to 108 | dvc metrics diff --all $(compare-to) > logs/log_metrics_diff.txt 109 | 110 | add-commit: 111 | # `-` signalizes that errors will be ignored by make 112 | # Add all files in the current directory 113 | - git add . 114 | # Run hooks in `pre-commit` that cause file changes 115 | - pre-commit run check-toml 116 | - pre-commit run check-yaml 117 | - pre-commit run pretty-format-json 118 | - pre-commit run requirements-txt-fixer 119 | - pre-commit run black 120 | - pre-commit run flake8 121 | # Add currently tracked files (which have been modified) 122 | - git add --update 123 | # Commit with `--message "$(message)"`. 124 | # `pre-commit` will run once again, 125 | # but now for all hooks 126 | git commit --message="$(message)" 127 | 128 | release: 129 | # Create tag based on `version.toml` 130 | # `-` signalizes that errors will be ignored by make 131 | git tag --annotate $(VERSION) \ 132 | --message "VERSION=$(VERSION) read from `version.toml`" 133 | # Push from `HEAD` (on current branch) to `dev`, 134 | # using the tag created above. 135 | # Append log to file including datetime in UTC 136 | (date --utc && git push origin HEAD:dev tag $(VERSION)) \ 137 | 2>&1 | tee -ai logs/log_release.txt 138 | 139 | docs: 140 | # Auto documentation. 
141 | # references: https://pdoc.dev/ | https://calmcode.io/makefiles/phony-folders.html 142 | $(RUN) --service-ports docs 143 | 144 | vanilla: 145 | echo "" 146 | echo "COPYING FROM $(MAKEFILE_ABS_PATH)/examples/vanilla/** TO $(MAKEFILE_ABS_PATH)" 147 | echo "" 148 | cp -r $(MAKEFILE_ABS_PATH)/examples/vanilla/** $(MAKEFILE_ABS_PATH) 149 | 150 | palmer-penguins: 151 | echo "" 152 | echo "COPYING FROM $(MAKEFILE_ABS_PATH)/examples/palmer_penguins/** TO $(MAKEFILE_ABS_PATH)" 153 | echo "" 154 | cp -r $(MAKEFILE_ABS_PATH)/examples/palmer_penguins/** $(MAKEFILE_ABS_PATH) 155 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/main.py: -------------------------------------------------------------------------------- 1 | import greenhouse_clock 2 | import data_sourcing 3 | import data_splitting 4 | import data_preprocessing 5 | import feature_engineering 6 | import eda_monitoring 7 | import modeling 8 | import performance_monitoring 9 | 10 | from prefect import Flow, task, context 11 | 12 | import pandas as pd 13 | 14 | # Pandas options for better shell display 15 | pd.set_option("display.max_rows", 100) 16 | pd.set_option("display.max_columns", None) 17 | pd.set_option("display.width", None) 18 | 19 | start_time = greenhouse_clock.get_time() 20 | 21 | 22 | @task 23 | def sourcing(): 24 | 25 | return data_sourcing.get() 26 | 27 | 28 | @task 29 | def cleansing(df): 30 | 31 | return data_preprocessing.clean(df) 32 | 33 | 34 | @task 35 | def normalizing(df): 36 | 37 | return data_preprocessing.normalize(df) 38 | 39 | 40 | @task(nout=3) 41 | def splitting(df): 42 | 43 | return data_splitting.split(df) 44 | 45 | 46 | @task(nout=3) 47 | def one_hot(train, valid, test, cols): 48 | 49 | logger = context.get("logger") 50 | 51 | logger.info(train) 52 | 53 | train_hot, valid_hot, test_hot = feature_engineering.one_hot_encoding( 54 | train=train, 55 | valid=valid, 56 | test=test, 57 | cols=cols, 58 | ) 59 | 60 | train = train.join(train_hot) 61 | valid = valid.join(valid_hot) 62 | test = test.join(test_hot) 63 | 64 | logger.info(train) 65 | 66 | return train, valid, test 67 | 68 | 69 | @task(nout=3) 70 | def imputation(train, valid, test, cols, imputation_method): 71 | 72 | logger = context.get("logger") 73 | 74 | # Find rows where the numerical variables are nan 75 | mask = train[cols].isna() 76 | 77 | logger.info(train[mask]) 78 | 79 | train_imp, valid_imp, test_imp = feature_engineering.numerical_missing_imputation( 80 | train=train, 81 | valid=valid, 82 | test=test, 83 | cols=cols, 84 | imputation_method=imputation_method, 85 | ) 86 | 87 | train = train.join(train_imp, rsuffix="_imputed") 88 | valid = valid.join(valid_imp, rsuffix="_imputed") 89 | test = test.join(test_imp, rsuffix="_imputed") 90 | 91 | logger.info(train[mask]) 92 | 93 | return train, valid, test 94 | 95 | 96 | @task 97 | def eda(df, path, preffix, suffix): 98 | 99 | eda_monitoring.export_eda_report(df=df, path=path, preffix=preffix, suffix=suffix) 100 | 101 | pass 102 | 103 | 104 | @task(nout=5) 105 | def model(train, valid, test, obs, y_col, x_col): 106 | 107 | mo = modeling.model() 108 | 109 | mo.fit(train=train, y_col=y_col, x_col=x_col) 110 | 111 | lst = list(mo.transform_sets(train=train, valid=valid, test=test)) 112 | 113 | lst.append(mo.transform_new(obs=obs)) 114 | 115 | return lst 116 | 117 | 118 | @task 119 | def threshold(y_true, y_score): 120 | 121 | return performance_monitoring.optimal_threshold(y_true=y_true, y_score=y_score) 122 | 123 | 124 | @task 125 | def 
performance(y_true, y_score, best_hyperparams, path, opt_thr, suffix): 126 | 127 | return performance_monitoring.report_performance( 128 | y_true=y_true, 129 | y_score=y_score, 130 | best_hyperparams=best_hyperparams, 131 | path=path, 132 | opt_thr=opt_thr, 133 | suffix=suffix, 134 | ) 135 | 136 | 137 | @task 138 | def binarize(binary_map, series): 139 | 140 | return series.map(binary_map) 141 | 142 | 143 | @task 144 | def print_out(s): 145 | 146 | print(s) 147 | 148 | pass 149 | 150 | 151 | @task 152 | def df_to_csv(df, filename): 153 | 154 | df.to_csv(filename) 155 | 156 | pass 157 | 158 | 159 | # Define prefect flow 160 | with Flow("greenhouse") as flow: 161 | 162 | df = sourcing() 163 | df = cleansing(df) 164 | df = normalizing(df) 165 | train, valid, test = splitting(df) 166 | 167 | # eda( 168 | # df=train, 169 | # path="monitor/", 170 | # preffix=start_time, 171 | # suffix="before_feat_eng" 172 | # ) 173 | 174 | # Categorical 175 | cat_cols = [ 176 | "sex", 177 | ] 178 | 179 | train, valid, test = one_hot( 180 | train=train, 181 | valid=valid, 182 | test=test, 183 | cols=cat_cols, 184 | ) 185 | 186 | # Numerical 187 | num_cols = [ 188 | "bill_length_mm", 189 | "bill_depth_mm", 190 | "flipper_length_mm", 191 | "body_mass_g", 192 | ] 193 | 194 | train, valid, test = imputation( 195 | train=train, 196 | valid=valid, 197 | test=test, 198 | cols=num_cols, 199 | imputation_method="median", 200 | ) 201 | 202 | # eda( 203 | # df=train, 204 | # path="monitor/", 205 | # preffix=start_time, 206 | # suffix="after_feat_eng" 207 | # ) 208 | 209 | y_col = ["species"] 210 | 211 | x_col = [ 212 | "sex_male", 213 | "sex_female", 214 | "sex_na", 215 | "bill_length_mm_imputed", 216 | "bill_depth_mm_imputed", 217 | "flipper_length_mm_imputed", 218 | "body_mass_g_imputed", 219 | ] 220 | 221 | # `obs=test` just as an example here. 222 | # It should be actually new data, unseen by the model. 
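    # In particular, new observations must go through the same cleansing,
    # normalizing, and feature engineering steps as the training data,
    # because `modeling.model.transform_new` selects the engineered columns
    # listed in `x_col` (e.g. `sex_male`, `bill_length_mm_imputed`).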
223 | train, valid, test, best_hyperparams, new = model( 224 | train=train, 225 | valid=valid, 226 | test=test, 227 | obs=test, 228 | y_col=y_col, 229 | x_col=x_col, 230 | ) 231 | 232 | path = "data/" 233 | filename = path + "{}_predict_new.csv".format(start_time) 234 | 235 | df_to_csv(df=new, filename=filename) 236 | 237 | # Obtain the optimal threshold of 238 | # class 0 vs 1+2 239 | # from the training set 240 | opt_thr = threshold(y_true=train["actual"], y_score=train["prob_0"]) 241 | 242 | # class 0 --> 1 243 | # class 1 or class 2 --> 0 244 | 245 | binary_map = { 246 | 0: 1, 247 | 1: 0, 248 | 2: 0, 249 | } 250 | 251 | # Performance report over training set 252 | performance( 253 | y_true=binarize(binary_map=binary_map, series=train["actual"]), 254 | y_score=train["prob_0"], 255 | best_hyperparams=best_hyperparams, 256 | path="monitor/", 257 | opt_thr=opt_thr, 258 | suffix="_train", 259 | ) 260 | 261 | # Performance report over validation set 262 | performance( 263 | y_true=binarize(binary_map=binary_map, series=valid["actual"]), 264 | y_score=valid["prob_0"], 265 | best_hyperparams=best_hyperparams, 266 | path="monitor/", 267 | opt_thr=opt_thr, 268 | suffix="_valid", 269 | ) 270 | 271 | # Performance report over test set 272 | performance( 273 | y_true=binarize(binary_map=binary_map, series=test["actual"]), 274 | y_score=test["prob_0"], 275 | best_hyperparams=best_hyperparams, 276 | path="monitor/", 277 | opt_thr=opt_thr, 278 | suffix="_test", 279 | ) 280 | 281 | 282 | if __name__ == "__main__": 283 | 284 | # Run prefect flow 285 | flow.run() 286 | 287 | # Export flow as a PDF 288 | flow.visualize(filename="flow/prefect_flow") 289 | -------------------------------------------------------------------------------- /notebooks/relative_path_imports.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "57843254", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import sys\n", 12 | "module_path = os.path.abspath(os.path.join('..'))\n", 13 | "if module_path not in sys.path:\n", 14 | " sys.path.append(module_path)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "id": "07b4ecce", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from src import data_sourcing\n", 25 | "from src import data_splitting\n", 26 | "from src import data_preprocessing\n", 27 | "from src import feature_engineering\n", 28 | "from src import monitoring\n", 29 | "from src import modeling" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "id": "f0c1d56e", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | "
bill_length_mmbill_depth_mmflipper_length_mmbody_mass_gsexspecies
039.118.7181.03750.0maleAdelie
139.517.4186.03800.0femaleAdelie
240.318.0195.03250.0femaleAdelie
3NaNNaNNaNNaNNaNAdelie
436.719.3193.03450.0femaleAdelie
.....................
33955.819.8207.04000.0maleChinstrap
34043.518.1202.03400.0femaleChinstrap
34149.618.2193.03775.0maleChinstrap
34250.819.0210.04100.0maleChinstrap
34350.218.7198.03775.0femaleChinstrap
\n", 169 | "

344 rows × 6 columns

\n", 170 | "
" 171 | ], 172 | "text/plain": [ 173 | " bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex \\\n", 174 | "0 39.1 18.7 181.0 3750.0 male \n", 175 | "1 39.5 17.4 186.0 3800.0 female \n", 176 | "2 40.3 18.0 195.0 3250.0 female \n", 177 | "3 NaN NaN NaN NaN NaN \n", 178 | "4 36.7 19.3 193.0 3450.0 female \n", 179 | ".. ... ... ... ... ... \n", 180 | "339 55.8 19.8 207.0 4000.0 male \n", 181 | "340 43.5 18.1 202.0 3400.0 female \n", 182 | "341 49.6 18.2 193.0 3775.0 male \n", 183 | "342 50.8 19.0 210.0 4100.0 male \n", 184 | "343 50.2 18.7 198.0 3775.0 female \n", 185 | "\n", 186 | " species \n", 187 | "0 Adelie \n", 188 | "1 Adelie \n", 189 | "2 Adelie \n", 190 | "3 Adelie \n", 191 | "4 Adelie \n", 192 | ".. ... \n", 193 | "339 Chinstrap \n", 194 | "340 Chinstrap \n", 195 | "341 Chinstrap \n", 196 | "342 Chinstrap \n", 197 | "343 Chinstrap \n", 198 | "\n", 199 | "[344 rows x 6 columns]" 200 | ] 201 | }, 202 | "execution_count": 3, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "data_sourcing.get()" 209 | ] 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.9.2" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 5 233 | } 234 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](/images/greenhouse_github_card_v02.png) 2 | 3 | # py-greenhouse 4 | 5 | A containerized Python framework for a better Data X development workflow. Where X = Science, Engineering, Analytics, etc. 6 | 7 | The name "Greenhouse" is a metaphor. A greenhouse is a structure made of glass to grow plants despite of external conditions such as a cold winter. Likewise, the Greenhouse framework builds a standalone container for Rust developmet which is fully transparent to the user. 8 | 9 | [Watch an overview clip on Twitch! 🖥️💜🖥️💜](https://www.twitch.tv/videos/1013368507) 10 | 11 | ![](/images/greenhouse_architecture_v03.png) 12 | 13 | 14 | # But what is a template? 15 | 16 | `py-greenhouse` is a GitHub template, not a package. This means that you will work on a copy of this project and you will replace placeholders by code that fits your own purposes. 17 | 18 | If you just want to use the Greenhouse template for your new cool Data X or Machine Learning project, please choose the option ["Use this Template"](https://github.com/felipepenha/py-greenhouse/generate). 19 | 20 | The current version of `py-greenhouse` uses the [Palmer Penguins dataset](https://github.com/mcnakhaee/palmerpenguins) called via an API (see [`src/data_sourcing.py`](https://github.com/felipepenha/py-greenhouse/blob/main/src/data_sourcing.py)). You may use other datasets, coming from different sources, and you may need to setup keys for cloud environment access, all of which are not covered here. 
21 | 22 | 23 | # Local OS Requirements 24 | 25 | These are requirements for your local machine, ideally a Debian Linux OS: 26 | 27 | ## - [docker](https://docs.docker.com/engine/install/) 28 | 29 | Follow the [instructions in the docker docs](https://docs.docker.com/engine/install/linux-postinstall/) to ensure that $USER has root access to docker. 30 | 31 | ## - [docker-compose](https://docs.docker.com/compose/install/) 32 | 33 | ## - VS Code 34 | 35 | In your local machine: 36 | 37 | 1. [install VS Code](https://code.visualstudio.com/docs/setup/linux), 38 | 39 | 2. install the [`ms-vscode-remote.remote-containers`](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension locally, 40 | 41 | A pop-up will open up asking if you would like to reload the workspace in the container: 42 | 43 | ![](/images/Screenshot_from_2021-03-07_18-31-36_VS-Code.png) 44 | 45 | After choosing "Reopen in Container", VS Code will open the "bash" docker-compose service in the greenhouse container, as specified in the manifest `.devcontainer.json`. 46 | 47 | Notice that VS Code will run intilization commands that may take some time to process. 48 | 49 | VS Code will already include the [`ms-python.python`](https://marketplace.visualstudio.com/items?itemName=ms-python.python) extension, without the need to install it in your own local machine. You may add any other extensions that you may need in your Python project in the configuration file `.devcontainer.json` . 50 | 51 | ## - [git](https://git-scm.com/download/linux) 52 | 53 | ``` 54 | sudo apt-get git 55 | ``` 56 | 57 | ## - make 58 | 59 | ``` 60 | sudo apt-get update 61 | sudo apt-get install build-essential 62 | ``` 63 | 64 | ## - awk 65 | ## - tee 66 | ## - touch 67 | 68 | ## - python3 69 | 70 | ``` 71 | sudo apt-get update 72 | sudo apt-get install python3 73 | ``` 74 | 75 | ## - pip3 76 | 77 | ``` 78 | sudo apt-get update 79 | sudo apt-get install python3-pip 80 | ``` 81 | 82 | ## - pre-commit 83 | 84 | ``` 85 | pip3 install pre-commit 86 | ``` 87 | 88 | In the main directory of the project where there is already a `.git/` subdirectory: 89 | 90 | ``` 91 | pre-commit install 92 | pre-commit migrate-config 93 | pre-commit autoupdate 94 | ``` 95 | 96 | The main directory may be either the locally cloned py-greenhouse or a project based on the github template. 97 | 98 | Alternatively, simply run in the terminal `make install-requirements`, to install the `pre-commit` Python package. 99 | 100 | ## - [dvc](https://dvc.org/doc/install/linux) 101 | 102 | ``` 103 | pip3 install dvc 104 | ``` 105 | 106 | ## Do I need to install any other requirements? 107 | 108 | No. After installing the basic local requirements described above, you are all set to run everything else inside a Docker container. 109 | 110 | # Quick Start 111 | 112 | This is a template repository. [Follow this link for instructions to create a repository from a template](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-from-a-template#creating-a-repository-from-a-template). 113 | 114 | 115 | First, make sure `make`, `docker` and `docker-compose` are installed in your system. 116 | 117 | 118 | The greenhouse dev work is performed via `make` commands. 119 | 120 | 121 | To see the most up to date list of available commands run 122 | 123 | ```bash 124 | $ make help 125 | 126 | USAGE 127 | 128 | make 129 | Include 'sudo' when necessary. 
130 | To avoid using sudo, follow the steps in 131 | https://docs.docker.com/engine/install/linux-postinstall/ 132 | 133 | 134 | COMMANDS 135 | 136 | add-commit git add, pre-commit, and commit 137 | bash bash REPL (Read-Eval-Print loop), suitable for debugging 138 | build build image using cache 139 | build-no-cache build image from scratch, and not from cache 140 | docs show the src modules documentation on the browser 141 | fastapi starts up fastapi 142 | jupyter access Python through the Jupyter Notebook 143 | pre-commit early run of pre-commit git hooks 144 | python3 access Python through the REPL (Read-Eval-Print loop) 145 | release release on dev branch. Be sure to update version.toml before running this operation 146 | run run src/main.py 147 | test run all tests using pytest (from within the container) 148 | 149 | ``` 150 | 151 | 152 | To build your greenhouse (as it is), you first need to run: 153 | 154 | ```bash 155 | $ make build-no-cache 156 | ``` 157 | 158 | 159 | To access Jupyter in your local browser: 160 | 161 | ```bash 162 | $ make jupyter 163 | 164 | Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). 165 | 166 | To access the notebook, open this file in a browser: 167 | file:///root/.local/share/jupyter/runtime/nbserver-1-open.html 168 | Or copy and paste one of these URLs: 169 | http://...:8888/lab?token=... 170 | ``` 171 | 172 | 173 | Next, you simply need to follow the instructions printed out on your own terminal. 174 | 175 | 176 | In the generic example above, I would paste the following on my browser: 177 | 178 | ```bash 179 | http://...:8888/lab?token=... 180 | ``` 181 | 182 | 183 | Any changes made in the files within the Jupyter interface, for example saved changes in `.rs`, `.ipynb`, and `.py` files, will be reflected in the original files you store locally, and vice-versa. This is ensured by the fact that the whole greenhouse directory is set as a `volume` in the `docker-compose.yml` configuration file. 184 | 185 | 186 | You may also choose to run code using the REPL (Read-Eval-Print loop) in the terminal by running: 187 | 188 | ```bash 189 | $ make python3 190 | ``` 191 | 192 | 193 | Now, you are ready to start developing Python code by creating new `.py` files in the `/src` directory. 194 | 195 | 196 | During development phase, you can normally test out new code in a Jupyter Notebook. 197 | 198 | Check out additional notebooks in the `/notebooks` directory (`.ipynb` files with preffix `example_`). 199 | 200 | 201 | # Greenhouse Structure 202 | 203 | ```bash 204 | . 205 | ├── conftest.py 206 | ├── CONTRIBUTING.md 207 | ├── docker-compose.yml 208 | ├── Dockerfile 209 | ├── notebooks 210 | ├── flow 211 | ├── images 212 | ├── LICENSE 213 | ├── logs 214 | ├── Makefile 215 | ├── monitor 216 | ├── README.md 217 | ├── requirements.txt 218 | ├── src 219 | │ ├── data_preprocessing.py 220 | │ ├── data_sourcing.py 221 | │ ├── data_splitting.py 222 | │ ├── eda_monitoring.py 223 | │ ├── feature_engineering.py 224 | │ ├── greenhouse_clock.py 225 | │ ├── main.py 226 | │ ├── modeling.py 227 | │ ├── performance_monitoring.py 228 | ├── tests 229 | │ ├── test_data_sourcing.py 230 | │ ├── test_data_splitting.py 231 | │ └── test_feature_engineering.py 232 | └── version.toml 233 | ``` 234 | 235 | Highlights: 236 | 237 | * `notebooks/`: notebooks, usually Jupyter Notebooks not in production 238 | * `logs/`: dated logs, usually `.txt` files 239 | * `monitor/`: files exported for monitoring purposes (data, model performance, etc). 
usually `.html` or `.json`. 240 | * `flow/`: flow diagram as provided by `prefect` 241 | * `requirements.txt`: pip3 requirements for your project 242 | * `src/`: source directory for your Python project 243 | * `src/main.py`: main file where flow is defined 244 | * `test/`: tests of Python code. All tests will run automatically as pre-commit git hooks, in the container. 245 | * `version.toml`: information about your project, such as the version number to be used in the git tag pushed to the repo with `make release`. 246 | 247 | 248 | 249 | # Adding External Dependencies 250 | 251 | You need to include any external dependencies to the `requirements.txt` file in addition to the default list provided here. 252 | 253 | 254 | ## Continuous Integration / Continuous Delivery (CI/CD) 255 | 256 | Follow the instructins in [CONTRIBUTING.md](https://github.com/felipepenha/rust-greenhouse/blob/main/CONTRIBUTING.md). Be sure to update `version.toml` before each new release on the `dev` branch. 257 | 258 | ![](/images/greenhouse_architecture_gitops.png) 259 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------