├── .dvc ├── config ├── .gitignore └── plots │ ├── default.json │ ├── smooth.json │ ├── confusion.json │ ├── scatter.json │ ├── confusion_normalized.json │ └── linear.json ├── conftest.py ├── notebooks ├── .gitkeep └── relative_path_imports.ipynb ├── src ├── __init__.py ├── __main__.py ├── greenhouse_clock.py ├── eda_monitoring.py ├── data_sourcing.py ├── performance_monitoring.py ├── main.py ├── api.py ├── data_preprocessing.py ├── data_splitting.py ├── modeling.py └── feature_engineering.py ├── tests ├── .gitkeep ├── test_data_sourcing.py ├── test_feature_engineering.py └── test_data_splitting.py ├── data └── .gitignore ├── .flake8 ├── images ├── Greenhouse_logo.png ├── greenhouse_architecture_v01.png ├── greenhouse_architecture_v02.png ├── greenhouse_architecture_v03.png ├── greenhouse_github_card_v02.png ├── greenhouse_architecture_gitops.png └── Screenshot_from_2021-03-07_18-31-36_VS-Code.png ├── examples ├── palmer_penguins │ ├── flow │ │ └── prefect_flow.pdf │ ├── src │ │ ├── greenhouse_clock.py │ │ ├── eda_monitoring.py │ │ ├── data_sourcing.py │ │ ├── data_preprocessing.py │ │ ├── data_splitting.py │ │ ├── api.py │ │ ├── performance_monitoring.py │ │ ├── feature_engineering.py │ │ ├── modeling.py │ │ └── main.py │ ├── requirements.txt │ ├── Dockerfile │ └── tests │ │ └── test_data_sourcing.py └── vanilla │ ├── src │ ├── greenhouse_clock.py │ ├── data_sourcing.py │ ├── performance_monitoring.py │ ├── main.py │ ├── api.py │ ├── data_preprocessing.py │ ├── data_splitting.py │ ├── modeling.py │ └── feature_engineering.py │ ├── requirements.txt │ ├── Dockerfile │ └── tests │ └── test_data_sourcing.py ├── .dvcignore ├── .dockerignore ├── dvc.yaml ├── .vscode └── settings.json ├── requirements.txt ├── .devcontainer.json ├── version.toml ├── Dockerfile ├── dvc.lock ├── .pre-commit-config.yaml ├── docker-compose.yml ├── .gitignore ├── CONTRIBUTING.md ├── Makefile ├── README.md └── LICENSE /.dvc/config: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /conftest.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/__main__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | /test_dvc.json 2 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /tmp 3 | /cache 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | 
max-line-length = 87 3 | extend-ignore = "E203" -------------------------------------------------------------------------------- /images/Greenhouse_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/Greenhouse_logo.png -------------------------------------------------------------------------------- /src/greenhouse_clock.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def get_time(format="%Y%m%d%H%M%S"): 5 | 6 | return time.strftime(format) 7 | -------------------------------------------------------------------------------- /images/greenhouse_architecture_v01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/greenhouse_architecture_v01.png -------------------------------------------------------------------------------- /images/greenhouse_architecture_v02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/greenhouse_architecture_v02.png -------------------------------------------------------------------------------- /images/greenhouse_architecture_v03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/greenhouse_architecture_v03.png -------------------------------------------------------------------------------- /images/greenhouse_github_card_v02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/greenhouse_github_card_v02.png -------------------------------------------------------------------------------- /images/greenhouse_architecture_gitops.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/greenhouse_architecture_gitops.png -------------------------------------------------------------------------------- /examples/palmer_penguins/flow/prefect_flow.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/examples/palmer_penguins/flow/prefect_flow.pdf -------------------------------------------------------------------------------- /examples/vanilla/src/greenhouse_clock.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def get_time(format="%Y%m%d%H%M%S"): 5 | 6 | return time.strftime(format) 7 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/greenhouse_clock.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 4 | def get_time(format="%Y%m%d%H%M%S"): 5 | 6 | return time.strftime(format) 7 | -------------------------------------------------------------------------------- /.dvcignore: -------------------------------------------------------------------------------- 1 | # Add patterns of files dvc should ignore, which could improve 2 | # the performance. 
Learn more at 3 | # https://dvc.org/doc/user-guide/dvcignore 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Docker 2 | Dockerfile 3 | .dockerignore 4 | 5 | # Git 6 | .git 7 | .gitignore 8 | .gitattributes 9 | 10 | # Images 11 | images 12 | .png 13 | -------------------------------------------------------------------------------- /images/Screenshot_from_2021-03-07_18-31-36_VS-Code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/felipepenha/py-greenhouse/HEAD/images/Screenshot_from_2021-03-07_18-31-36_VS-Code.png -------------------------------------------------------------------------------- /dvc.yaml: -------------------------------------------------------------------------------- 1 | stages: 2 | build-run: 3 | cmd: make build && make run 4 | deps: 5 | - src/modeling.py 6 | metrics: 7 | - monitor/metadata_valid.json: 8 | cache: true 9 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.linting.enabled": true, 3 | "python.linting.flake8Enabled": true, 4 | "python.linting.pylintEnabled": false, 5 | "python.pythonPath": "/usr/local/bin/python" 6 | } 7 | -------------------------------------------------------------------------------- /examples/vanilla/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.65.1 2 | feature-engine==1.0.2 3 | jupyterlab==3.0.9 4 | numpy==1.20.1 5 | pandas==1.2.2 6 | pandera==0.6.2 7 | pydantic==1.8.2 8 | pytest==6.2.2 9 | uvicorn==0.13.4 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.65.1 2 | feature-engine==1.0.2 3 | jupyterlab==3.0.9 4 | numpy==1.20.1 5 | pandas==1.2.2 6 | pandas-profiling==2.11.0 7 | pandera==0.6.2 8 | pdoc==7.0.3 9 | prefect[viz]==0.14.12 10 | pydantic==1.8.2 11 | pytest==6.2.2 12 | uvicorn==0.13.4 13 | -------------------------------------------------------------------------------- /.devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "dockerFile": "Dockerfile", 3 | "extensions": [ 4 | "ms-python.python", 5 | "dracula-theme.theme-dracula", 6 | "bungcip.better-toml", 7 | "tomoki1207.pdf" 8 | ], 9 | "name": "py-greenhouse", 10 | "shutdownAction": "stopContainer" 11 | } 12 | -------------------------------------------------------------------------------- /version.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "greenhouse" 3 | version = "1.0.0" 4 | authors = ["Felipe Penha "] 5 | description = "A Python containerized framework for a better Data X development workflow." 
6 | repository = "https://github.com/felipepenha/py-greenhouse" 7 | license = "Apache-2.0" -------------------------------------------------------------------------------- /src/eda_monitoring.py: -------------------------------------------------------------------------------- 1 | import pandas_profiling 2 | 3 | 4 | def export_eda_report(df, path, preffix, suffix): 5 | 6 | profile = pandas_profiling.ProfileReport(df, title="Pandas Profiling Report") 7 | 8 | path = "{}/{}_ead_monitoring_{}.html".format(path, preffix, suffix) 9 | 10 | profile.to_file(path) 11 | 12 | pass 13 | -------------------------------------------------------------------------------- /examples/palmer_penguins/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.65.1 2 | feature-engine==1.0.2 3 | jupyterlab==3.0.9 4 | matplotlib==3.3.4 5 | numpy==1.20.1 6 | palmerpenguins==0.1.4 7 | pandas==1.2.2 8 | pandas-profiling==2.11.0 9 | pandera==0.6.2 10 | prefect[viz]==0.14.12 11 | pydantic==1.8.2 12 | pytest==6.2.2 13 | seaborn==0.11.1 14 | uvicorn==0.13.4 15 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.2-slim-buster AS base 2 | 3 | ARG APP_DIR=/usr/app/ 4 | 5 | USER root 6 | 7 | RUN mkdir ${APP_DIR} 8 | 9 | WORKDIR ${APP_DIR} 10 | 11 | # pip requirements 12 | COPY requirements.txt ${APP_DIR} 13 | 14 | RUN pip install --upgrade pip \ 15 | && pip3 install --no-cache-dir -r requirements.txt 16 | 17 | CMD ["python3", "src/main.py"] -------------------------------------------------------------------------------- /examples/palmer_penguins/src/eda_monitoring.py: -------------------------------------------------------------------------------- 1 | import pandas_profiling 2 | 3 | 4 | def export_eda_report(df, path, preffix, suffix): 5 | 6 | profile = pandas_profiling.ProfileReport(df, title="Pandas Profiling Report") 7 | 8 | path = "{}/{}_ead_monitoring_{}.html".format(path, preffix, suffix) 9 | 10 | profile.to_file(path) 11 | 12 | pass 13 | -------------------------------------------------------------------------------- /examples/vanilla/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.2-slim-buster AS base 2 | 3 | ARG APP_DIR=/usr/app/ 4 | 5 | USER root 6 | 7 | RUN mkdir ${APP_DIR} 8 | 9 | WORKDIR ${APP_DIR} 10 | 11 | # pip requirements 12 | COPY requirements.txt ${APP_DIR} 13 | 14 | RUN pip install --upgrade pip \ 15 | && pip3 install --no-cache-dir -r requirements.txt 16 | 17 | CMD ["python3", "src/main.py"] -------------------------------------------------------------------------------- /dvc.lock: -------------------------------------------------------------------------------- 1 | schema: '2.0' 2 | stages: 3 | build: 4 | cmd: make build && make run 5 | outs: 6 | - path: monitor 7 | md5: 7c3a8f4c317b9ae0d1df7ab62820ff6d.dir 8 | size: 3251 9 | nfiles: 3 10 | build-run: 11 | cmd: make build && make run 12 | deps: 13 | - path: src/modeling.py 14 | md5: 6103728b418f2ccc23840f786157baa4 15 | size: 3177 16 | outs: 17 | - path: monitor/metadata_valid.json 18 | md5: 91c3f20f203751f9034230a46dc45b0e 19 | size: 1093 20 | -------------------------------------------------------------------------------- /examples/palmer_penguins/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9.2-slim-buster AS base 2 | 3 | ARG 
APP_DIR=/usr/app/ 4 | 5 | USER root 6 | 7 | RUN mkdir ${APP_DIR} 8 | 9 | WORKDIR ${APP_DIR} 10 | 11 | # graphviz is required by prefect[viz]==0.14.12 12 | RUN apt-get update \ 13 | && apt-get install -y build-essential graphviz \ 14 | && apt-get clean 15 | 16 | # pip requirements 17 | COPY requirements.txt ${APP_DIR} 18 | 19 | RUN pip install --upgrade pip \ 20 | && pip3 install --no-cache-dir -r requirements.txt 21 | 22 | CMD ["python3", "src/main.py"] -------------------------------------------------------------------------------- /tests/test_data_sourcing.py: -------------------------------------------------------------------------------- 1 | import pandera as pa 2 | from src import data_sourcing 3 | 4 | 5 | def test_data_sourcing_get(): 6 | 7 | df = data_sourcing.get() 8 | 9 | print(df) 10 | 11 | schema = pa.DataFrameSchema( 12 | { 13 | "id": pa.Column( 14 | str, 15 | nullable=True, 16 | ), 17 | "x": pa.Column( 18 | float, 19 | nullable=True, 20 | ), 21 | "y": pa.Column( 22 | float, 23 | nullable=True, 24 | ), 25 | } 26 | ) 27 | 28 | schema(df) 29 | -------------------------------------------------------------------------------- /examples/vanilla/tests/test_data_sourcing.py: -------------------------------------------------------------------------------- 1 | import pandera as pa 2 | from src import data_sourcing 3 | 4 | 5 | def test_data_sourcing_get(): 6 | 7 | df = data_sourcing.get() 8 | 9 | print(df) 10 | 11 | schema = pa.DataFrameSchema( 12 | { 13 | "id": pa.Column( 14 | str, 15 | nullable=True, 16 | ), 17 | "x": pa.Column( 18 | float, 19 | nullable=True, 20 | ), 21 | "y": pa.Column( 22 | float, 23 | nullable=True, 24 | ), 25 | } 26 | ) 27 | 28 | schema(df) 29 | -------------------------------------------------------------------------------- /.dvc/plots/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "encoding": { 7 | "color": { 8 | "field": "rev", 9 | "type": "nominal" 10 | }, 11 | "x": { 12 | "field": "", 13 | "title": "", 14 | "type": "quantitative" 15 | }, 16 | "y": { 17 | "field": "", 18 | "scale": { 19 | "zero": false 20 | }, 21 | "title": "", 22 | "type": "quantitative" 23 | } 24 | }, 25 | "height": 300, 26 | "mark": { 27 | "type": "line" 28 | }, 29 | "title": "", 30 | "width": 300 31 | } 32 | -------------------------------------------------------------------------------- /src/data_sourcing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get(): 6 | """Get the data. 7 | 8 | ** Vanilla definition. ** 9 | Include your own code below to import your project's data. 10 | 11 | Parameters 12 | ---------- 13 | None 14 | 15 | Returns 16 | ------- 17 | df: pandas dataframe 18 | 19 | Examples 20 | -------- 21 | 22 | Raises 23 | ------ 24 | 25 | Notes 26 | ----- 27 | 28 | """ 29 | 30 | df = pd.DataFrame( 31 | { 32 | "id": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], 33 | "x": [0.0, np.nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 34 | "y": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 35 | } 36 | ) 37 | 38 | return df 39 | -------------------------------------------------------------------------------- /examples/vanilla/src/data_sourcing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def get(): 6 | """Get the data. 
7 | 8 | ** Vanilla definition. ** 9 | Include your own code below to import your project's data. 10 | 11 | Parameters 12 | ---------- 13 | None 14 | 15 | Returns 16 | ------- 17 | df: pandas dataframe 18 | 19 | Examples 20 | -------- 21 | 22 | Raises 23 | ------ 24 | 25 | Notes 26 | ----- 27 | 28 | """ 29 | 30 | df = pd.DataFrame( 31 | { 32 | "id": ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"], 33 | "x": [0.0, np.nan, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 34 | "y": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 35 | } 36 | ) 37 | 38 | return df 39 | -------------------------------------------------------------------------------- /src/performance_monitoring.py: -------------------------------------------------------------------------------- 1 | import json 2 | import greenhouse_clock 3 | 4 | meta = {} 5 | 6 | # Timestamp for files 7 | meta["timestr"] = greenhouse_clock.get_time() 8 | 9 | 10 | def report_performance(y_true, y_score, path, suffix=""): 11 | """ 12 | 13 | We suggest using `sklearn.metrics.classification_report` 14 | 15 | References 16 | ---------- 17 | https://scikit-learn.org/stable/modules/generated/ 18 | sklearn.metrics.classification_report.html 19 | """ 20 | 21 | # Plug-in here your performance metrics as dictionary entries 22 | meta["performance_metric_name"] = 0 23 | 24 | filename = "{0}metadata{1}.json".format(path, suffix) 25 | 26 | # Export to JSON 27 | with open(filename, "w") as fp: 28 | json.dump(meta, fp, indent=4) 29 | 30 | pass 31 | -------------------------------------------------------------------------------- /examples/vanilla/src/performance_monitoring.py: -------------------------------------------------------------------------------- 1 | import json 2 | import greenhouse_clock 3 | 4 | meta = {} 5 | 6 | # Timestamp for files 7 | meta["timestr"] = greenhouse_clock.get_time() 8 | 9 | 10 | def report_performance(y_true, y_score, path, suffix=""): 11 | """ 12 | 13 | We suggest using `sklearn.metrics.classification_report` 14 | 15 | References 16 | ---------- 17 | https://scikit-learn.org/stable/modules/generated/ 18 | sklearn.metrics.classification_report.html 19 | """ 20 | 21 | # Plug-in here your performance metrics as dictionary entries 22 | meta["performance_metric_name"] = 0 23 | 24 | filename = "{0}metadata{1}.json".format(path, suffix) 25 | 26 | # Export to JSON 27 | with open(filename, "w") as fp: 28 | json.dump(meta, fp, indent=4) 29 | 30 | pass 31 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/data_sourcing.py: -------------------------------------------------------------------------------- 1 | import palmerpenguins 2 | 3 | 4 | def get(): 5 | """Get the data. 6 | This template function uses the Palmer Peguins dataset as a place holder. 7 | Replace it by your own code to import your project's data. 8 | 9 | Parameters 10 | ---------- 11 | None 12 | 13 | Returns 14 | ------- 15 | pandas dataframe 16 | Dataframe containing data. 
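Restricted to the columns bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, sex and species (see `cols` below).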
17 | 18 | Examples 19 | -------- 20 | 21 | Raises 22 | ------ 23 | 24 | Notes 25 | ----- 26 | 27 | """ 28 | 29 | df = palmerpenguins.load_penguins() 30 | 31 | cols = [ 32 | "bill_length_mm", 33 | "bill_depth_mm", 34 | "flipper_length_mm", 35 | "body_mass_g", 36 | "sex", 37 | "species", 38 | ] 39 | 40 | return df[cols] 41 | -------------------------------------------------------------------------------- /.dvc/plots/smooth.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "encoding": { 7 | "color": { 8 | "field": "rev", 9 | "type": "nominal" 10 | }, 11 | "x": { 12 | "field": "", 13 | "title": "", 14 | "type": "quantitative" 15 | }, 16 | "y": { 17 | "field": "", 18 | "scale": { 19 | "zero": false 20 | }, 21 | "title": "", 22 | "type": "quantitative" 23 | } 24 | }, 25 | "mark": { 26 | "type": "line" 27 | }, 28 | "title": "", 29 | "transform": [ 30 | { 31 | "bandwidth": 0.3, 32 | "groupby": [ 33 | "rev" 34 | ], 35 | "loess": "", 36 | "on": "" 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v3.4.0 4 | hooks: 5 | - id: check-toml 6 | always_run: true 7 | verbose: true 8 | - id: check-yaml 9 | always_run: true 10 | verbose: true 11 | - id: pretty-format-json 12 | always_run: true 13 | verbose: true 14 | args: ["--autofix"] 15 | exclude: .dvc 16 | - id: requirements-txt-fixer 17 | always_run: true 18 | verbose: true 19 | - repo: https://github.com/ambv/black 20 | rev: 20.8b1 21 | hooks: 22 | - id: black 23 | always_run: true 24 | verbose: true 25 | - repo: https://gitlab.com/pycqa/flake8 26 | rev: 3.9.0 27 | hooks: 28 | - id: flake8 29 | always_run: true 30 | verbose: true 31 | - repo: local 32 | hooks: 33 | - id: test 34 | name: test 35 | entry: make test-no-log 36 | language: system 37 | pass_filenames: false 38 | always_run: true 39 | verbose: true 40 | -------------------------------------------------------------------------------- /src/main.py: -------------------------------------------------------------------------------- 1 | import greenhouse_clock 2 | import data_sourcing 3 | import data_splitting 4 | import data_preprocessing 5 | import feature_engineering 6 | from modeling import model 7 | import performance_monitoring 8 | 9 | start_time = greenhouse_clock.get_time() 10 | 11 | if __name__ == "__main__": 12 | 13 | # Run prefect flow 14 | df = data_sourcing.get() 15 | df = data_preprocessing.clean(df) 16 | df = data_preprocessing.normalize(df) 17 | 18 | train, valid, test = data_splitting.split(df) 19 | 20 | ( 21 | train["x"], 22 | valid["x"], 23 | test["x"], 24 | ) = feature_engineering.numerical_missing_imputation( 25 | train=train, valid=valid, test=test, cols=["x"], imputation_method="median" 26 | ) 27 | 28 | m = model().fit(train=train, y_col="y", x_col="x") 29 | 30 | train["pred"], valid["pred"], test["pred"] = m.transform_sets(train, valid, test) 31 | 32 | performance_monitoring.report_performance( 33 | y_true=valid["y"], 34 | y_score=valid["pred"], 35 | path="/usr/app/monitor/", 36 | suffix="_valid", 37 | ) 38 | -------------------------------------------------------------------------------- /examples/vanilla/src/main.py: 
-------------------------------------------------------------------------------- 1 | import greenhouse_clock 2 | import data_sourcing 3 | import data_splitting 4 | import data_preprocessing 5 | import feature_engineering 6 | from modeling import model 7 | import performance_monitoring 8 | 9 | start_time = greenhouse_clock.get_time() 10 | 11 | if __name__ == "__main__": 12 | 13 | # Run prefect flow 14 | df = data_sourcing.get() 15 | df = data_preprocessing.clean(df) 16 | df = data_preprocessing.normalize(df) 17 | 18 | train, valid, test = data_splitting.split(df) 19 | 20 | ( 21 | train["x"], 22 | valid["x"], 23 | test["x"], 24 | ) = feature_engineering.numerical_missing_imputation( 25 | train=train, valid=valid, test=test, cols=["x"], imputation_method="median" 26 | ) 27 | 28 | m = model().fit(train=train, y_col="y", x_col="x") 29 | 30 | train["pred"], valid["pred"], test["pred"] = m.transform_sets(train, valid, test) 31 | 32 | performance_monitoring.report_performance( 33 | y_true=valid["y"], 34 | y_score=valid["pred"], 35 | path="/usr/app/monitor/", 36 | suffix="_valid", 37 | ) 38 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | def clean(df): 2 | """Cleansing: a data pre-processing step. Usually, getting rid of garbage 3 | such as undesired characters. 4 | 5 | Cleansing must be a set of operations independent of data splitting. 6 | 7 | Parameters 8 | ---------- 9 | df: pandas dataframe 10 | 11 | Returns 12 | ------- 13 | pandas dataframe 14 | Cleansed dataframe 15 | 16 | Examples 17 | -------- 18 | 19 | Raises 20 | ------ 21 | 22 | Notes 23 | ----- 24 | 25 | """ 26 | 27 | return df 28 | 29 | 30 | def normalize(df): 31 | """Normalization: a data pre-processing step. Usually, making adjusting 32 | loser and upper casing, abbrevations, word order, and so on. 33 | 34 | Normalization must be a set of operations independent of data splitting. 
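For the penguins data this could mean, for example, harmonizing the casing of the `sex` labels or the spelling of `species` names.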
35 | 36 | Parameters 37 | ---------- 38 | df: pandas dataframe 39 | 40 | Returns 41 | ------- 42 | pandas dataframe 43 | Normalized dataframe 44 | 45 | Examples 46 | -------- 47 | 48 | Raises 49 | ------ 50 | 51 | Notes 52 | ----- 53 | 54 | """ 55 | 56 | return df 57 | -------------------------------------------------------------------------------- /src/api.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from pydantic import BaseModel 3 | 4 | from src.modeling import VanillaModel 5 | 6 | app = FastAPI() 7 | 8 | 9 | class ModelIn(BaseModel): 10 | x: str 11 | 12 | 13 | class ModelOut(BaseModel): 14 | pred: float 15 | 16 | 17 | class ModelOutHealth(BaseModel): 18 | id: str 19 | 20 | 21 | app = FastAPI() 22 | 23 | 24 | @app.post("/health") 25 | async def health(): 26 | 27 | return {"id": "Healthy"} 28 | 29 | 30 | @app.post( 31 | "/predict/", 32 | response_model=ModelOut, 33 | ) 34 | async def root(input: ModelIn): 35 | 36 | X = [ 37 | float(input.x), 38 | ] 39 | 40 | # Load your model from /models 41 | 42 | # Note: for saving your model, we suggest using the 43 | # `joblib` python package 44 | 45 | # Ex: path "/usr/app/models/" 46 | # joblib.dump(self.m, path) 47 | # model = joblib.load(path) 48 | 49 | # Vanila model always predict 0, so that 50 | # inputs in the training phase are arbitrary 51 | model = VanillaModel().fit(x=[0], y=[0]) 52 | 53 | out_dict = {} 54 | 55 | out_dict["pred"] = model.predict(X)[0] 56 | 57 | return out_dict 58 | -------------------------------------------------------------------------------- /examples/vanilla/src/api.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from pydantic import BaseModel 3 | 4 | from src.modeling import VanillaModel 5 | 6 | app = FastAPI() 7 | 8 | 9 | class ModelIn(BaseModel): 10 | x: str 11 | 12 | 13 | class ModelOut(BaseModel): 14 | pred: float 15 | 16 | 17 | class ModelOutHealth(BaseModel): 18 | id: str 19 | 20 | 21 | app = FastAPI() 22 | 23 | 24 | @app.post("/health") 25 | async def health(): 26 | 27 | return {"id": "Healthy"} 28 | 29 | 30 | @app.post( 31 | "/predict/", 32 | response_model=ModelOut, 33 | ) 34 | async def root(input: ModelIn): 35 | 36 | X = [ 37 | float(input.x), 38 | ] 39 | 40 | # Load your model from /models 41 | 42 | # Note: for saving your model, we suggest using the 43 | # `joblib` python package 44 | 45 | # Ex: path "/usr/app/models/" 46 | # joblib.dump(self.m, path) 47 | # model = joblib.load(path) 48 | 49 | # Vanila model always predict 0, so that 50 | # inputs in the training phase are arbitrary 51 | model = VanillaModel().fit(x=[0], y=[0]) 52 | 53 | out_dict = {} 54 | 55 | out_dict["pred"] = model.predict(X)[0] 56 | 57 | return out_dict 58 | -------------------------------------------------------------------------------- /examples/palmer_penguins/tests/test_data_sourcing.py: -------------------------------------------------------------------------------- 1 | import pandera as pa 2 | from src import data_sourcing 3 | 4 | 5 | def test_data_sourcing_get(): 6 | 7 | df = data_sourcing.get() 8 | 9 | print(df) 10 | 11 | cats_sex = [ 12 | "male", 13 | "female", 14 | ] 15 | cats_species = [ 16 | "Adelie", 17 | "Gentoo", 18 | "Chinstrap", 19 | ] 20 | 21 | schema = pa.DataFrameSchema( 22 | { 23 | "bill_length_mm": pa.Column( 24 | float, 25 | nullable=True, 26 | ), 27 | "bill_depth_mm": pa.Column( 28 | float, 29 | nullable=True, 30 | ), 31 | "flipper_length_mm": pa.Column( 32 | 
float, 33 | nullable=True, 34 | ), 35 | "body_mass_g": pa.Column( 36 | float, 37 | nullable=True, 38 | ), 39 | "sex": pa.Column( 40 | str, 41 | checks=pa.Check.isin(cats_sex), 42 | nullable=True, 43 | ), 44 | "species": pa.Column( 45 | str, 46 | checks=pa.Check.isin(cats_species), 47 | nullable=True, 48 | ), 49 | } 50 | ) 51 | 52 | schema(df) 53 | -------------------------------------------------------------------------------- /src/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | def clean(df): 2 | """Cleansing: a data pre-processing step. Usually, getting rid of garbage 3 | such as undesired characters. 4 | 5 | Cleansing must be a set of operations independent of data splitting. 6 | 7 | ** Vanilla definition. ** 8 | Include your own code below to import your project's data. 9 | 10 | Parameters 11 | ---------- 12 | df: pandas dataframe 13 | 14 | Returns 15 | ------- 16 | pandas dataframe 17 | Cleansed dataframe 18 | 19 | Examples 20 | -------- 21 | 22 | Raises 23 | ------ 24 | 25 | Notes 26 | ----- 27 | 28 | """ 29 | 30 | return df 31 | 32 | 33 | def normalize(df): 34 | """Normalization: a data pre-processing step. Usually, making adjusting 35 | loser and upper casing, abbrevations, word order, and so on. 36 | 37 | Normalization must be a set of operations independent of data splitting. 38 | 39 | ** Vanilla definition. ** 40 | Include your own code below to import your project's data. 41 | 42 | Parameters 43 | ---------- 44 | df: pandas dataframe 45 | 46 | Returns 47 | ------- 48 | pandas dataframe 49 | Normalized dataframe 50 | 51 | Examples 52 | -------- 53 | 54 | Raises 55 | ------ 56 | 57 | Notes 58 | ----- 59 | 60 | """ 61 | 62 | return df 63 | -------------------------------------------------------------------------------- /examples/vanilla/src/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | def clean(df): 2 | """Cleansing: a data pre-processing step. Usually, getting rid of garbage 3 | such as undesired characters. 4 | 5 | Cleansing must be a set of operations independent of data splitting. 6 | 7 | ** Vanilla definition. ** 8 | Include your own code below to import your project's data. 9 | 10 | Parameters 11 | ---------- 12 | df: pandas dataframe 13 | 14 | Returns 15 | ------- 16 | pandas dataframe 17 | Cleansed dataframe 18 | 19 | Examples 20 | -------- 21 | 22 | Raises 23 | ------ 24 | 25 | Notes 26 | ----- 27 | 28 | """ 29 | 30 | return df 31 | 32 | 33 | def normalize(df): 34 | """Normalization: a data pre-processing step. Usually, making adjusting 35 | loser and upper casing, abbrevations, word order, and so on. 36 | 37 | Normalization must be a set of operations independent of data splitting. 38 | 39 | ** Vanilla definition. ** 40 | Include your own code below to import your project's data. 41 | 42 | Parameters 43 | ---------- 44 | df: pandas dataframe 45 | 46 | Returns 47 | ------- 48 | pandas dataframe 49 | Normalized dataframe 50 | 51 | Examples 52 | -------- 53 | 54 | Raises 55 | ------ 56 | 57 | Notes 58 | ----- 59 | 60 | """ 61 | 62 | return df 63 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | base: 5 | build: 6 | context: . 
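# note: every service below reuses this one build via `image: base_image`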
7 | dockerfile: Dockerfile 8 | image: base_image 9 | 10 | bash: 11 | image: base_image 12 | user: root 13 | volumes: 14 | - .:/usr/app/ 15 | working_dir: /usr/app/ 16 | entrypoint: /bin/sh 17 | 18 | python3: 19 | image: base_image 20 | user: root 21 | volumes: 22 | - .:/usr/app/ 23 | working_dir: /usr/app/ 24 | command: "python3" 25 | 26 | jupyter: 27 | image: base_image 28 | user: root 29 | volumes: 30 | - .:/usr/app/ 31 | working_dir: /usr/app/ 32 | command: > 33 | jupyter lab 34 | --ip=0.0.0.0 35 | --port=8888 36 | --allow-root 37 | --no-browser 38 | --notebook-dir='/usr/app/' 39 | ports: 40 | - 8888:8888 41 | 42 | fastapi: 43 | image: base_image 44 | user: root 45 | volumes: 46 | - .:/usr/app/ 47 | working_dir: /usr/app/ 48 | command: > 49 | uvicorn src.api:app --reload --host 0.0.0.0 50 | ports: 51 | - 8000:8000 52 | 53 | test: 54 | image: base_image 55 | user: root 56 | volumes: 57 | - .:/usr/app/ 58 | working_dir: /usr/app/ 59 | command: "pytest --verbose --capture=no --ignore=examples/" 60 | 61 | run: 62 | image: base_image 63 | user: root 64 | volumes: 65 | - .:/usr/app/ 66 | working_dir: /usr/app/ 67 | command: "python3 src/main.py" 68 | 69 | docs: 70 | image: base_image 71 | user: root 72 | volumes: 73 | - .:/usr/app/ 74 | working_dir: /usr/app/ 75 | environment: 76 | - PYTHONPATH=/usr/app/src/ 77 | command: > 78 | pdoc --docformat "numpy" -h 0.0.0.0 -p 314 src 79 | ports: 80 | - 314:314 -------------------------------------------------------------------------------- /tests/test_feature_engineering.py: -------------------------------------------------------------------------------- 1 | from src import feature_engineering 2 | import pandas as pd 3 | from pandas import _testing 4 | import numpy as np 5 | 6 | 7 | def test_numerical_missing_imputation_twofeatures(): 8 | 9 | df = pd.DataFrame( 10 | { 11 | "a": [1.0, 1.5, 2.0, 0.0, 1.25, np.nan], 12 | "b": [1.0, 1.5, 2.0, 0.0, 0.0, np.nan], 13 | "c": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 14 | "d": ["apple", "apple", "pear", "apple", "pear", "apple"], 15 | } 16 | ) 17 | 18 | expected = pd.DataFrame( 19 | { 20 | "a": [1.0, 1.5, 2.0, 0.0, 1.25, 1.25], 21 | "b": [1.0, 1.5, 2.0, 0.0, 0.0, 1.0], 22 | } 23 | ) 24 | 25 | train, valid, test = feature_engineering.numerical_missing_imputation( 26 | train=df, 27 | valid=df, 28 | test=df, 29 | cols=[ 30 | "a", 31 | "b", 32 | ], 33 | ) 34 | 35 | _testing.assert_frame_equal(train, expected) 36 | 37 | 38 | def test_one_hot_encoding(): 39 | 40 | df = pd.DataFrame( 41 | { 42 | "class": ["a", "b", "c", "a", np.nan], 43 | "col_1": [0.0, 0.0, 0.0, 0.0, 0.0], 44 | "col_2": ["apple", "apple", "pear", "apple", "pear"], 45 | } 46 | ) 47 | 48 | expected = pd.DataFrame( 49 | { 50 | "class_a": [1, 0, 0, 1, 0], 51 | "class_b": [0, 1, 0, 0, 0], 52 | "class_c": [0, 0, 1, 0, 0], 53 | "class_na": [0, 0, 0, 0, 1], 54 | } 55 | ) 56 | 57 | train, valid, test = feature_engineering.one_hot_encoding( 58 | train=df, 59 | valid=df, 60 | test=df, 61 | cols=[ 62 | "class", 63 | ], 64 | ) 65 | 66 | _testing.assert_frame_equal(train, expected) 67 | -------------------------------------------------------------------------------- /src/data_splitting.py: -------------------------------------------------------------------------------- 1 | def split(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=0): 2 | """Data splitting into 3 sets: train, valid, test 3 | 4 | train: training set. Used for training the ML model. 5 | valid: validation set. Used for frequent validation. 6 | test: test set. Used for final test. 
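The three ratios are assumed to sum to 1.0 (defaults: 0.8 / 0.1 / 0.1); `valid` and `test` are carved out of whatever remains after `train` is sampled.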
7 | 8 | Parameters 9 | ---------- 10 | df: pandas dataframe 11 | Input data 12 | 13 | train_ratio: float 14 | Amount of data that goes into training, in percentage 15 | 16 | valid_ratio: float 17 | Amount of data that goes into validation, in percentage 18 | 19 | test_ratio: float 20 | Amount of data that goes into testing, in percentage 21 | 22 | seed: int 23 | Seed for the data shuffling. 24 | It is important to keep it fixed throughout the tuning of the model. 25 | 26 | Returns 27 | ------- 28 | list 29 | (train, valid, test) 30 | (pandas dataframe, pandas dataframe, pandas dataframe) 31 | 32 | Examples 33 | -------- 34 | 35 | >>> len(data) 36 | 100 37 | >>> train, valid, test = split(data) 38 | >>> len(train) 39 | 80 40 | >>> len(valid) 41 | 10 42 | >>> len(test) 43 | 10 44 | 45 | Raises 46 | ------ 47 | 48 | Notes 49 | ----- 50 | 51 | """ 52 | 53 | # Train set extracted from a random sample from `df` 54 | train = df.sample(frac=train_ratio, random_state=seed) 55 | 56 | # Everything from `df` except `train` 57 | rest = df.copy().drop(train.index) 58 | 59 | # Valid set ratio within `rest` 60 | new_ratio = valid_ratio / (valid_ratio + test_ratio) 61 | 62 | # Train set extracted from a random sample from `rest` 63 | valid = rest.sample(frac=new_ratio, random_state=seed) 64 | 65 | # Test set is everything in rest `except` for `valid` 66 | test = rest.drop(valid.index) 67 | 68 | return train, valid, test 69 | -------------------------------------------------------------------------------- /examples/vanilla/src/data_splitting.py: -------------------------------------------------------------------------------- 1 | def split(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=0): 2 | """Data splitting into 3 sets: train, valid, test 3 | 4 | train: training set. Used for training the ML model. 5 | valid: validation set. Used for frequent validation. 6 | test: test set. Used for final test. 7 | 8 | Parameters 9 | ---------- 10 | df: pandas dataframe 11 | Input data 12 | 13 | train_ratio: float 14 | Amount of data that goes into training, in percentage 15 | 16 | valid_ratio: float 17 | Amount of data that goes into validation, in percentage 18 | 19 | test_ratio: float 20 | Amount of data that goes into testing, in percentage 21 | 22 | seed: int 23 | Seed for the data shuffling. 24 | It is important to keep it fixed throughout the tuning of the model. 
25 | 26 | Returns 27 | ------- 28 | list 29 | (train, valid, test) 30 | (pandas dataframe, pandas dataframe, pandas dataframe) 31 | 32 | Examples 33 | -------- 34 | 35 | >>> len(data) 36 | 100 37 | >>> train, valid, test = split(data) 38 | >>> len(train) 39 | 80 40 | >>> len(valid) 41 | 10 42 | >>> len(test) 43 | 10 44 | 45 | Raises 46 | ------ 47 | 48 | Notes 49 | ----- 50 | 51 | """ 52 | 53 | # Train set extracted from a random sample from `df` 54 | train = df.sample(frac=train_ratio, random_state=seed) 55 | 56 | # Everything from `df` except `train` 57 | rest = df.copy().drop(train.index) 58 | 59 | # Valid set ratio within `rest` 60 | new_ratio = valid_ratio / (valid_ratio + test_ratio) 61 | 62 | # Train set extracted from a random sample from `rest` 63 | valid = rest.sample(frac=new_ratio, random_state=seed) 64 | 65 | # Test set is everything in rest `except` for `valid` 66 | test = rest.drop(valid.index) 67 | 68 | return train, valid, test 69 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/data_splitting.py: -------------------------------------------------------------------------------- 1 | def split(df, train_ratio=0.8, valid_ratio=0.1, test_ratio=0.1, seed=0): 2 | """Data splitting into 3 sets: train, valid, test 3 | 4 | train: training set. Used for training the ML model. 5 | valid: validation set. Used for frequent validation. 6 | test: test set. Used for final test. 7 | 8 | Parameters 9 | ---------- 10 | df: pandas dataframe 11 | Input data 12 | 13 | train_ratio: float 14 | Amount of data that goes into training, in percentage 15 | 16 | valid_ratio: float 17 | Amount of data that goes into validation, in percentage 18 | 19 | test_ratio: float 20 | Amount of data that goes into testing, in percentage 21 | 22 | seed: int 23 | Seed for the data shuffling. 24 | It is important to keep it fixed throughout the tuning of the model. 
25 | 26 | Returns 27 | ------- 28 | list 29 | (train, valid, test) 30 | (pandas dataframe, pandas dataframe, pandas dataframe) 31 | 32 | Examples 33 | -------- 34 | 35 | >>> len(data) 36 | 100 37 | >>> train, valid, test = split(data) 38 | >>> len(train) 39 | 80 40 | >>> len(valid) 41 | 10 42 | >>> len(test) 43 | 10 44 | 45 | Raises 46 | ------ 47 | 48 | Notes 49 | ----- 50 | 51 | """ 52 | 53 | # Train set extracted from a random sample from `df` 54 | train = df.sample(frac=train_ratio, random_state=seed) 55 | 56 | # Everything from `df` except `train` 57 | rest = df.copy().drop(train.index) 58 | 59 | # Valid set ratio within `rest` 60 | new_ratio = valid_ratio / (valid_ratio + test_ratio) 61 | 62 | # Train set extracted from a random sample from `rest` 63 | valid = rest.sample(frac=new_ratio, random_state=seed) 64 | 65 | # Test set is everything in rest `except` for `valid` 66 | test = rest.drop(valid.index) 67 | 68 | return train, valid, test 69 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/api.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | import joblib 3 | import numpy as np 4 | 5 | from pydantic import BaseModel 6 | 7 | app = FastAPI() 8 | 9 | 10 | class ModelIn(BaseModel): 11 | sex: str 12 | bill_length_mm: str 13 | bill_depth_mm: str 14 | flipper_length_mm: str 15 | body_mass_g: str 16 | 17 | 18 | class ModelOut(BaseModel): 19 | prob_0: float 20 | prob_1: float 21 | prob_2: float 22 | species_code: int 23 | species_name: str 24 | 25 | 26 | app = FastAPI() 27 | 28 | 29 | @app.post( 30 | "/predict/", 31 | response_model=ModelOut, 32 | ) 33 | async def root(input: ModelIn): 34 | 35 | sex_male = {"male": 1, "female": 0, "na": 0}[input.sex] 36 | 37 | sex_female = {"male": 0, "female": 1, "na": 0}[input.sex] 38 | 39 | sex_na = {"male": 0, "female": 0, "na": 1}[input.sex] 40 | 41 | X = [ 42 | [ 43 | int(sex_male), 44 | int(sex_female), 45 | int(sex_na), 46 | float(input.bill_length_mm), 47 | float(input.bill_depth_mm), 48 | float(input.flipper_length_mm), 49 | float(input.body_mass_g), 50 | ], 51 | ] 52 | 53 | model = joblib.load("/usr/app/models/clf_random.joblib") 54 | 55 | out_dict = {} 56 | 57 | out_dict["prob_0"], out_dict["prob_1"], out_dict["prob_2"] = np.transpose( 58 | model.predict_proba(X) 59 | ) 60 | 61 | out_dict["prob_0"] = out_dict["prob_0"][0] 62 | 63 | out_dict["prob_1"] = out_dict["prob_1"][0] 64 | out_dict["prob_2"] = out_dict["prob_2"][0] 65 | 66 | encoder = joblib.load("/usr/app/models/label_encoder.joblib") 67 | 68 | # Recover classes 69 | classes = encoder.classes_ 70 | 71 | # Enumerate classes to recover codes (integers) 72 | # Convert enumerate to dictionary 73 | map_classes = dict(enumerate(classes)) 74 | 75 | code = model.predict(X)[0] 76 | 77 | out_dict["species_code"] = int(code) 78 | out_dict["species_name"] = map_classes[code] 79 | 80 | return out_dict 81 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/performance_monitoring.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn import metrics 3 | import json 4 | 5 | import greenhouse_clock 6 | 7 | meta = {} 8 | 9 | # Timestamp for files 10 | meta["timestr"] = greenhouse_clock.get_time() 11 | 12 | 13 | def optimal_threshold(y_true, y_score): 14 | 15 | # Performance extracted from the "ROC curve" 16 | fpr, tpr, thr = metrics.roc_curve( 17 
| y_true=y_true, y_score=y_score, pos_label=1, drop_intermediate=False 18 | ) 19 | 20 | diff = np.abs(tpr - fpr) 21 | 22 | # Numpy index of the maximum separation between TPR and FPR 23 | diff_idx = np.argmax(diff) 24 | 25 | # Optimum threshold based on max diff criterium 26 | return thr[diff_idx] 27 | 28 | 29 | def report_performance( 30 | y_true, y_score, best_hyperparams, path, opt_thr=0.5, suffix="_" 31 | ): 32 | """ 33 | References 34 | ---------- 35 | https://scikit-learn.org/stable/modules/generated/ 36 | sklearn.metrics.classification_report.html 37 | """ 38 | 39 | meta["optimal_hyperparameters"] = best_hyperparams 40 | 41 | meta["optimal_threshold"] = opt_thr 42 | 43 | # Performance extracted from the "ROC curve" 44 | fpr, tpr, thr = metrics.roc_curve( 45 | y_true=y_true, y_score=y_score, pos_label=1, drop_intermediate=False 46 | ) 47 | 48 | meta["AUC"] = metrics.auc(fpr, tpr) 49 | 50 | diff = np.abs(tpr - fpr) 51 | 52 | # Maximum difference between TPR and FPR 53 | meta["max_diff_FPR_TPR"] = np.max(diff) 54 | 55 | # Numpy index of the maximum separation between TPR and FPR 56 | diff_idx = np.argmax(diff) 57 | 58 | # Update optimum threshold based on max diff criterium 59 | meta["threshold_from_max_diff"] = thr[diff_idx] 60 | 61 | # Predicted classes based on "optimal_threshold" 62 | y_pred = [int(k >= opt_thr) for k in y_score] 63 | 64 | meta["classification_report"] = metrics.classification_report( 65 | y_true=y_true, y_pred=y_pred, output_dict=True 66 | ) 67 | 68 | filename = "{0}metadata{1}.json".format(path, suffix) 69 | 70 | # Export to JSON 71 | with open(filename, "w") as fp: 72 | json.dump(meta, fp, indent=4) 73 | 74 | pass 75 | -------------------------------------------------------------------------------- /src/modeling.py: -------------------------------------------------------------------------------- 1 | class VanillaModel: 2 | """Vanilla model where the predictions are always 0""" 3 | 4 | def __init__(self): 5 | 6 | pass 7 | 8 | def fit(self, x, y): 9 | 10 | self.fitted = [0] 11 | 12 | return self 13 | 14 | def predict(self, x): 15 | 16 | return self.fitted * len(x) 17 | 18 | 19 | class model: 20 | """ 21 | Replace below `VanillaModel` by an actual ML 22 | model such as the ones provided by sklearn. 23 | 24 | We are assuming supervised models (a and y are available), 25 | but you may also adapt it for unsupervised models 26 | (only x available). In that case, erase any reference to 27 | `y` below. 
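As a minimal sketch (assuming scikit-learn were added to requirements.txt), the `fit` below could become:

    from sklearn.linear_model import LogisticRegression
    # sklearn expects a 2D feature matrix, hence the double brackets
    self.m = LogisticRegression().fit(train[[x_col]], train[y_col])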
28 | 29 | References 30 | ---------- 31 | https://scikit-learn.org/stable/ 32 | """ 33 | 34 | def __init__(self): 35 | 36 | pass 37 | 38 | def fit(self, train, y_col, x_col): 39 | 40 | self.x_col = x_col 41 | self.y_col = y_col 42 | 43 | self.m = VanillaModel().fit(x=train[x_col], y=train[y_col]) 44 | 45 | # Save your model in /models 46 | 47 | # Note: for saving your model, we suggest using the 48 | # `joblib` python package 49 | 50 | # Ex: path "/usr/app/models/" 51 | # joblib.dump(self.m, path) 52 | 53 | return self 54 | 55 | def transform_sets(self, train, valid, test): 56 | 57 | x_train = train[self.x_col].values 58 | x_valid = valid[self.x_col].values 59 | x_test = test[self.x_col].values 60 | 61 | y_train = train[self.y_col].values 62 | y_valid = valid[self.y_col].values 63 | y_test = test[self.y_col].values 64 | 65 | train_out = train.copy(deep=True)[self.y_col] 66 | valid_out = valid.copy(deep=True)[self.y_col] 67 | test_out = test.copy(deep=True)[self.y_col] 68 | 69 | train_out["actual"] = y_train 70 | valid_out["actual"] = y_valid 71 | test_out["actual"] = y_test 72 | 73 | # Predict 74 | train_out["pred"] = (self.m).predict(x_train) 75 | valid_out["pred"] = (self.m).predict(x_valid) 76 | test_out["pred"] = (self.m).predict(x_test) 77 | 78 | return train_out, valid_out, test_out 79 | -------------------------------------------------------------------------------- /examples/vanilla/src/modeling.py: -------------------------------------------------------------------------------- 1 | class VanillaModel: 2 | """Vanilla model where the predictions are always 0""" 3 | 4 | def __init__(self): 5 | 6 | pass 7 | 8 | def fit(self, x, y): 9 | 10 | self.fitted = [0] 11 | 12 | return self 13 | 14 | def predict(self, x): 15 | 16 | return self.fitted * len(x) 17 | 18 | 19 | class model: 20 | """ 21 | Replace below `VanillaModel` by an actual ML 22 | model such as the ones provided by sklearn. 23 | 24 | We are assuming supervised models (a and y are available), 25 | but you may also adapt it for unsupervised models 26 | (only x available). In that case, erase any reference to 27 | `y` below. 
28 | 29 | References 30 | ---------- 31 | https://scikit-learn.org/stable/ 32 | """ 33 | 34 | def __init__(self): 35 | 36 | pass 37 | 38 | def fit(self, train, y_col, x_col): 39 | 40 | self.x_col = x_col 41 | self.y_col = y_col 42 | 43 | self.m = VanillaModel().fit(x=train[x_col], y=train[y_col]) 44 | 45 | # Save your model in /models 46 | 47 | # Note: for saving your model, we suggest using the 48 | # `joblib` python package 49 | 50 | # Ex: path "/usr/app/models/" 51 | # joblib.dump(self.m, path) 52 | 53 | return self 54 | 55 | def transform_sets(self, train, valid, test): 56 | 57 | x_train = train[self.x_col].values 58 | x_valid = valid[self.x_col].values 59 | x_test = test[self.x_col].values 60 | 61 | y_train = train[self.y_col].values 62 | y_valid = valid[self.y_col].values 63 | y_test = test[self.y_col].values 64 | 65 | train_out = train.copy(deep=True)[self.y_col] 66 | valid_out = valid.copy(deep=True)[self.y_col] 67 | test_out = test.copy(deep=True)[self.y_col] 68 | 69 | train_out["actual"] = y_train 70 | valid_out["actual"] = y_valid 71 | test_out["actual"] = y_test 72 | 73 | # Predict 74 | train_out["pred"] = (self.m).predict(x_train) 75 | valid_out["pred"] = (self.m).predict(x_valid) 76 | test_out["pred"] = (self.m).predict(x_test) 77 | 78 | return train_out, valid_out, test_out 79 | -------------------------------------------------------------------------------- /tests/test_data_splitting.py: -------------------------------------------------------------------------------- 1 | from src import data_splitting 2 | import pandas as pd 3 | import pytest 4 | 5 | 6 | @pytest.fixture 7 | def df_10_rows(): 8 | 9 | return pd.DataFrame( 10 | { 11 | "col_1": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 12 | "col_2": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"], 13 | } 14 | ) 15 | 16 | 17 | def test_data_splitting_train_ratio(df_10_rows): 18 | 19 | train, valid, test = data_splitting.split( 20 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 21 | ) 22 | 23 | assert len(train) == 7 24 | 25 | 26 | def test_data_splitting_valid_ratio(df_10_rows): 27 | 28 | train, valid, test = data_splitting.split( 29 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 30 | ) 31 | 32 | assert len(valid) == 2 33 | 34 | 35 | def test_data_splitting_test_ratio(df_10_rows): 36 | 37 | train, valid, test = data_splitting.split( 38 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 39 | ) 40 | 41 | assert len(test) == 1 42 | 43 | 44 | def test_data_splitting_train_vs_valid(df_10_rows): 45 | 46 | train, valid, test = data_splitting.split( 47 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 48 | ) 49 | 50 | df_check = train.merge( 51 | valid, 52 | how="inner", 53 | right_on=["col_1", "col_2"], 54 | left_on=["col_1", "col_2"], 55 | sort=False, 56 | ) 57 | 58 | assert df_check.empty 59 | 60 | 61 | def test_data_splitting_train_vs_test(df_10_rows): 62 | 63 | train, valid, test = data_splitting.split( 64 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 65 | ) 66 | 67 | df_check = train.merge( 68 | test, 69 | how="inner", 70 | right_on=["col_1", "col_2"], 71 | left_on=["col_1", "col_2"], 72 | sort=False, 73 | ) 74 | 75 | assert df_check.empty 76 | 77 | 78 | def test_data_splitting_valid_vs_test(df_10_rows): 79 | 80 | train, valid, test = data_splitting.split( 81 | df_10_rows, train_ratio=0.7, valid_ratio=0.2, test_ratio=0.1, seed=0 82 | ) 83 | 84 | df_check = valid.merge( 85 | test, 86 | how="inner", 87 | right_on=["col_1", 
"col_2"], 88 | left_on=["col_1", "col_2"], 89 | sort=False, 90 | ) 91 | 92 | assert df_check.empty 93 | -------------------------------------------------------------------------------- /.dvc/plots/confusion.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "facet": { 7 | "field": "rev", 8 | "type": "nominal" 9 | }, 10 | "spec": { 11 | "encoding": { 12 | "x": { 13 | "field": "", 14 | "sort": "ascending", 15 | "title": "", 16 | "type": "nominal" 17 | }, 18 | "y": { 19 | "field": "", 20 | "sort": "ascending", 21 | "title": "", 22 | "type": "nominal" 23 | } 24 | }, 25 | "layer": [ 26 | { 27 | "encoding": { 28 | "color": { 29 | "field": "xy_count", 30 | "scale": { 31 | "domainMin": 0, 32 | "nice": true 33 | }, 34 | "title": "", 35 | "type": "quantitative" 36 | } 37 | }, 38 | "height": 300, 39 | "mark": "rect", 40 | "width": 300 41 | }, 42 | { 43 | "encoding": { 44 | "color": { 45 | "condition": { 46 | "test": "datum.percent_of_max > 0.5", 47 | "value": "white" 48 | }, 49 | "value": "black" 50 | }, 51 | "text": { 52 | "field": "xy_count", 53 | "type": "quantitative" 54 | } 55 | }, 56 | "mark": "text" 57 | } 58 | ], 59 | "transform": [ 60 | { 61 | "aggregate": [ 62 | { 63 | "as": "xy_count", 64 | "op": "count" 65 | } 66 | ], 67 | "groupby": [ 68 | "", 69 | "" 70 | ] 71 | }, 72 | { 73 | "groupby": [ 74 | "rev", 75 | "" 76 | ], 77 | "impute": "xy_count", 78 | "key": "", 79 | "value": 0 80 | }, 81 | { 82 | "groupby": [ 83 | "rev", 84 | "" 85 | ], 86 | "impute": "xy_count", 87 | "key": "", 88 | "value": 0 89 | }, 90 | { 91 | "groupby": [], 92 | "joinaggregate": [ 93 | { 94 | "as": "max_count", 95 | "field": "xy_count", 96 | "op": "max" 97 | } 98 | ] 99 | }, 100 | { 101 | "as": "percent_of_max", 102 | "calculate": "datum.xy_count / datum.max_count" 103 | } 104 | ] 105 | }, 106 | "title": "" 107 | } 108 | -------------------------------------------------------------------------------- /.dvc/plots/scatter.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "height": 300, 7 | "layer": [ 8 | { 9 | "encoding": { 10 | "color": { 11 | "field": "rev", 12 | "type": "nominal" 13 | }, 14 | "x": { 15 | "field": "", 16 | "title": "", 17 | "type": "quantitative" 18 | }, 19 | "y": { 20 | "field": "", 21 | "scale": { 22 | "zero": false 23 | }, 24 | "title": "", 25 | "type": "quantitative" 26 | } 27 | }, 28 | "layer": [ 29 | { 30 | "mark": "point" 31 | }, 32 | { 33 | "encoding": { 34 | "opacity": { 35 | "condition": { 36 | "selection": "label", 37 | "value": 1 38 | }, 39 | "value": 0 40 | } 41 | }, 42 | "mark": "point", 43 | "selection": { 44 | "label": { 45 | "clear": "mouseout", 46 | "empty": "none", 47 | "encodings": [ 48 | "x" 49 | ], 50 | "nearest": true, 51 | "on": "mouseover", 52 | "type": "single" 53 | } 54 | } 55 | } 56 | ] 57 | }, 58 | { 59 | "layer": [ 60 | { 61 | "encoding": { 62 | "text": { 63 | "field": "", 64 | "type": "quantitative" 65 | }, 66 | "x": { 67 | "field": "", 68 | "type": "quantitative" 69 | }, 70 | "y": { 71 | "field": "", 72 | "type": "quantitative" 73 | } 74 | }, 75 | "layer": [ 76 | { 77 | "encoding": { 78 | "color": { 79 | "field": "rev", 80 | "type": "nominal" 81 | } 82 | }, 83 | "mark": { 84 | "align": "left", 85 | "dx": 5, 86 | "dy": -5, 87 | "type": "text" 88 | } 89 | } 90 | ] 91 | } 92 | 
], 93 | "transform": [ 94 | { 95 | "filter": { 96 | "selection": "label" 97 | } 98 | } 99 | ] 100 | } 101 | ], 102 | "title": "", 103 | "width": 300 104 | } 105 | -------------------------------------------------------------------------------- /.dvc/plots/confusion_normalized.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "facet": { 7 | "field": "rev", 8 | "type": "nominal" 9 | }, 10 | "spec": { 11 | "encoding": { 12 | "x": { 13 | "field": "", 14 | "sort": "ascending", 15 | "title": "", 16 | "type": "nominal" 17 | }, 18 | "y": { 19 | "field": "", 20 | "sort": "ascending", 21 | "title": "", 22 | "type": "nominal" 23 | } 24 | }, 25 | "layer": [ 26 | { 27 | "encoding": { 28 | "color": { 29 | "field": "percent_of_y", 30 | "scale": { 31 | "domain": [ 32 | 0, 33 | 1 34 | ] 35 | }, 36 | "title": "", 37 | "type": "quantitative" 38 | } 39 | }, 40 | "height": 300, 41 | "mark": "rect", 42 | "width": 300 43 | }, 44 | { 45 | "encoding": { 46 | "color": { 47 | "condition": { 48 | "test": "datum.percent_of_y > 0.5", 49 | "value": "white" 50 | }, 51 | "value": "black" 52 | }, 53 | "text": { 54 | "field": "percent_of_y", 55 | "format": ".2f", 56 | "type": "quantitative" 57 | } 58 | }, 59 | "mark": "text" 60 | } 61 | ], 62 | "transform": [ 63 | { 64 | "aggregate": [ 65 | { 66 | "as": "xy_count", 67 | "op": "count" 68 | } 69 | ], 70 | "groupby": [ 71 | "", 72 | "" 73 | ] 74 | }, 75 | { 76 | "groupby": [ 77 | "rev", 78 | "" 79 | ], 80 | "impute": "xy_count", 81 | "key": "", 82 | "value": 0 83 | }, 84 | { 85 | "groupby": [ 86 | "rev", 87 | "" 88 | ], 89 | "impute": "xy_count", 90 | "key": "", 91 | "value": 0 92 | }, 93 | { 94 | "groupby": [ 95 | "" 96 | ], 97 | "joinaggregate": [ 98 | { 99 | "as": "sum_y", 100 | "field": "xy_count", 101 | "op": "sum" 102 | } 103 | ] 104 | }, 105 | { 106 | "as": "percent_of_y", 107 | "calculate": "datum.xy_count / datum.sum_y" 108 | } 109 | ] 110 | }, 111 | "title": "" 112 | } 113 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/feature_engineering.py: -------------------------------------------------------------------------------- 1 | from feature_engine import encoding, imputation 2 | 3 | 4 | def numerical_missing_imputation(train, valid, test, cols, imputation_method="median"): 5 | """Missing imputation for numerical variables. 6 | 7 | The algorithm learns from the train set and applies transformations 8 | to all three input datasets: train, valid, test. 9 | 10 | Parameters 11 | ---------- 12 | train: pandas dataframe 13 | Training set 14 | 15 | valid: pandas dataframe 16 | Validation set 17 | 18 | test: pandas dataframe 19 | Test set 20 | 21 | cols: list 22 | List of numerical columns 23 | 24 | imputation_method: string 25 | Desired method of imputation. Options are 'mean' and 'median'. 26 | Default value: 'median'. 
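Fitting on `train` only (and merely transforming `valid` and `test`) keeps the imputation statistics free of information leaked from the evaluation sets.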
27 | 28 | Returns 29 | ------- 30 | list 31 | (train, valid, test) 32 | (pandas dataframe, pandas dataframe, pandas dataframe) 33 | 34 | Examples 35 | -------- 36 | 37 | Raises 38 | ------ 39 | 40 | Notes 41 | ----- 42 | 43 | """ 44 | 45 | fe = imputation.MeanMedianImputer( 46 | imputation_method=imputation_method, variables=cols 47 | ) 48 | 49 | # Fit over training set 50 | fe.fit(train[cols]) 51 | 52 | # Apply to train, valid, test 53 | return ( 54 | fe.transform(train[cols]), 55 | fe.transform(valid[cols]), 56 | fe.transform(test[cols]), 57 | ) 58 | 59 | 60 | def one_hot_encoding(train, valid, test, cols): 61 | """One-hot-encoding of all categories found in `cols`. 62 | 63 | The algorithm learns from the train set and applies transformations 64 | to all three input datasets: train, valid, test. 65 | 66 | Missing values in col lead to col_na=1 67 | 68 | Parameters 69 | ---------- 70 | train: pandas dataframe 71 | Training set 72 | 73 | valid: pandas dataframe 74 | Validation set 75 | 76 | test: pandas dataframe 77 | Test set 78 | 79 | cols: list 80 | List of numerical columns 81 | 82 | Returns 83 | ------- 84 | list 85 | (train, valid, test) 86 | (pandas dataframe, pandas dataframe, pandas dataframe) 87 | 88 | Examples 89 | -------- 90 | 91 | Raises 92 | ------ 93 | 94 | Notes 95 | ----- 96 | 97 | """ 98 | 99 | fe = encoding.OneHotEncoder(variables=cols) 100 | 101 | for k in cols: 102 | train[k] = train[k].fillna("na") 103 | valid[k] = valid[k].fillna("na") 104 | test[k] = test[k].fillna("na") 105 | 106 | # Fit over training set 107 | fe.fit(train[cols]) 108 | 109 | # Apply to train, valid, test 110 | return ( 111 | fe.transform(train[cols]), 112 | fe.transform(valid[cols]), 113 | fe.transform(test[cols]), 114 | ) 115 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | # Local logs of the project 141 | logs/ 142 | 143 | # Local monitoring reports of the project 144 | monitor/ 145 | 146 | # Data directory 147 | data/*.csv 148 | 149 | # Models directory 150 | models/ 151 | /monitor 152 | -------------------------------------------------------------------------------- /.dvc/plots/linear.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://vega.github.io/schema/vega-lite/v4.json", 3 | "data": { 4 | "values": "" 5 | }, 6 | "height": 300, 7 | "layer": [ 8 | { 9 | "encoding": { 10 | "color": { 11 | "field": "rev", 12 | "type": "nominal" 13 | }, 14 | "x": { 15 | "field": "", 16 | "title": "", 17 | "type": "quantitative" 18 | }, 19 | "y": { 20 | "field": "", 21 | "scale": { 22 | "zero": false 23 | }, 24 | "title": "", 25 | "type": "quantitative" 26 | } 27 | }, 28 | "layer": [ 29 | { 30 | "mark": "line" 31 | }, 32 | { 33 | "encoding": { 34 | "opacity": { 35 | "condition": { 36 | "selection": "label", 37 | "value": 1 38 | }, 39 | "value": 0 40 | } 41 | }, 42 | "mark": "point", 43 | "selection": { 44 | "label": { 45 | "clear": "mouseout", 46 | "empty": "none", 47 | "encodings": [ 48 | "x" 49 | ], 50 | "nearest": true, 51 | "on": "mouseover", 52 | "type": "single" 53 | } 54 | } 55 | } 56 | ] 57 | }, 58 | { 59 | "layer": [ 60 | { 61 | "encoding": { 62 | "x": { 63 | "field": "", 64 | "type": "quantitative" 65 | 
} 66 | }, 67 | "mark": { 68 | "color": "gray", 69 | "type": "rule" 70 | } 71 | }, 72 | { 73 | "encoding": { 74 | "text": { 75 | "field": "", 76 | "type": "quantitative" 77 | }, 78 | "x": { 79 | "field": "", 80 | "type": "quantitative" 81 | }, 82 | "y": { 83 | "field": "", 84 | "type": "quantitative" 85 | } 86 | }, 87 | "layer": [ 88 | { 89 | "encoding": { 90 | "color": { 91 | "field": "rev", 92 | "type": "nominal" 93 | } 94 | }, 95 | "mark": { 96 | "align": "left", 97 | "dx": 5, 98 | "dy": -5, 99 | "type": "text" 100 | } 101 | } 102 | ] 103 | } 104 | ], 105 | "transform": [ 106 | { 107 | "filter": { 108 | "selection": "label" 109 | } 110 | } 111 | ] 112 | } 113 | ], 114 | "title": "", 115 | "width": 300 116 | } 117 | -------------------------------------------------------------------------------- /src/feature_engineering.py: -------------------------------------------------------------------------------- 1 | from feature_engine import encoding, imputation 2 | 3 | # Note: we suggest using the below helper functions 4 | # for missing imputation (for numerical) and 5 | # one-hot-encoding (for categorical). 6 | # You will find most of other popular Feature 7 | # Engineering methods in the `feature_engine` 8 | # python package. 9 | 10 | 11 | def numerical_missing_imputation(train, valid, test, cols, imputation_method="median"): 12 | """Missing imputation for numerical variables. 13 | 14 | The algorithm learns from the train set and applies transformations 15 | to all three input datasets: train, valid, test. 16 | 17 | Parameters 18 | ---------- 19 | train: pandas dataframe 20 | Training set 21 | 22 | valid: pandas dataframe 23 | Validation set 24 | 25 | test: pandas dataframe 26 | Test set 27 | 28 | cols: list 29 | List of numerical columns 30 | 31 | imputation_method: string 32 | Desired method of imputation. Options are 'mean' and 'median'. 33 | Default value: 'median'. 34 | 35 | Returns 36 | ------- 37 | list 38 | (train, valid, test) 39 | (pandas dataframe, pandas dataframe, pandas dataframe) 40 | 41 | Examples 42 | -------- 43 | 44 | Raises 45 | ------ 46 | 47 | Notes 48 | ----- 49 | 50 | """ 51 | 52 | fe = imputation.MeanMedianImputer( 53 | imputation_method=imputation_method, variables=cols 54 | ) 55 | 56 | # Fit over training set 57 | fe.fit(train[cols]) 58 | 59 | # Apply to train, valid, test 60 | return ( 61 | fe.transform(train[cols]), 62 | fe.transform(valid[cols]), 63 | fe.transform(test[cols]), 64 | ) 65 | 66 | 67 | def one_hot_encoding(train, valid, test, cols): 68 | """One-hot-encoding of all categories found in `cols`. 69 | 70 | The algorithm learns from the train set and applies transformations 71 | to all three input datasets: train, valid, test. 
72 | 73 | Missing values in col lead to col_na=1 74 | 75 | Parameters 76 | ---------- 77 | train: pandas dataframe 78 | Training set 79 | 80 | valid: pandas dataframe 81 | Validation set 82 | 83 | test: pandas dataframe 84 | Test set 85 | 86 | cols: list 87 | List of numerical columns 88 | 89 | Returns 90 | ------- 91 | list 92 | (train, valid, test) 93 | (pandas dataframe, pandas dataframe, pandas dataframe) 94 | 95 | Examples 96 | -------- 97 | 98 | Raises 99 | ------ 100 | 101 | Notes 102 | ----- 103 | 104 | """ 105 | 106 | fe = encoding.OneHotEncoder(variables=cols) 107 | 108 | for k in cols: 109 | train[k] = train[k].fillna("na") 110 | valid[k] = valid[k].fillna("na") 111 | test[k] = test[k].fillna("na") 112 | 113 | # Fit over training set 114 | fe.fit(train[cols]) 115 | 116 | # Apply to train, valid, test 117 | return ( 118 | fe.transform(train[cols]), 119 | fe.transform(valid[cols]), 120 | fe.transform(test[cols]), 121 | ) 122 | -------------------------------------------------------------------------------- /examples/vanilla/src/feature_engineering.py: -------------------------------------------------------------------------------- 1 | from feature_engine import encoding, imputation 2 | 3 | # Note: we suggest using the below helper functions 4 | # for missing imputation (for numerical) and 5 | # one-hot-encoding (for categorical). 6 | # You will find most of other popular Feature 7 | # Engineering methods in the `feature_engine` 8 | # python package. 9 | 10 | 11 | def numerical_missing_imputation(train, valid, test, cols, imputation_method="median"): 12 | """Missing imputation for numerical variables. 13 | 14 | The algorithm learns from the train set and applies transformations 15 | to all three input datasets: train, valid, test. 16 | 17 | Parameters 18 | ---------- 19 | train: pandas dataframe 20 | Training set 21 | 22 | valid: pandas dataframe 23 | Validation set 24 | 25 | test: pandas dataframe 26 | Test set 27 | 28 | cols: list 29 | List of numerical columns 30 | 31 | imputation_method: string 32 | Desired method of imputation. Options are 'mean' and 'median'. 33 | Default value: 'median'. 34 | 35 | Returns 36 | ------- 37 | list 38 | (train, valid, test) 39 | (pandas dataframe, pandas dataframe, pandas dataframe) 40 | 41 | Examples 42 | -------- 43 | 44 | Raises 45 | ------ 46 | 47 | Notes 48 | ----- 49 | 50 | """ 51 | 52 | fe = imputation.MeanMedianImputer( 53 | imputation_method=imputation_method, variables=cols 54 | ) 55 | 56 | # Fit over training set 57 | fe.fit(train[cols]) 58 | 59 | # Apply to train, valid, test 60 | return ( 61 | fe.transform(train[cols]), 62 | fe.transform(valid[cols]), 63 | fe.transform(test[cols]), 64 | ) 65 | 66 | 67 | def one_hot_encoding(train, valid, test, cols): 68 | """One-hot-encoding of all categories found in `cols`. 69 | 70 | The algorithm learns from the train set and applies transformations 71 | to all three input datasets: train, valid, test. 
72 | 73 | Missing values in col lead to col_na=1 74 | 75 | Parameters 76 | ---------- 77 | train: pandas dataframe 78 | Training set 79 | 80 | valid: pandas dataframe 81 | Validation set 82 | 83 | test: pandas dataframe 84 | Test set 85 | 86 | cols: list 87 | List of numerical columns 88 | 89 | Returns 90 | ------- 91 | list 92 | (train, valid, test) 93 | (pandas dataframe, pandas dataframe, pandas dataframe) 94 | 95 | Examples 96 | -------- 97 | 98 | Raises 99 | ------ 100 | 101 | Notes 102 | ----- 103 | 104 | """ 105 | 106 | fe = encoding.OneHotEncoder(variables=cols) 107 | 108 | for k in cols: 109 | train[k] = train[k].fillna("na") 110 | valid[k] = valid[k].fillna("na") 111 | test[k] = test[k].fillna("na") 112 | 113 | # Fit over training set 114 | fe.fit(train[cols]) 115 | 116 | # Apply to train, valid, test 117 | return ( 118 | fe.transform(train[cols]), 119 | fe.transform(valid[cols]), 120 | fe.transform(test[cols]), 121 | ) 122 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | These contributing guidelines were designed both for the original [Greenhouse repo template](https://github.com/felipepenha/py-greenhouse) and any Python projects derived from that template. 4 | 5 | If you just want to use the Greenhouse template for your new cool Data X or Machine Learning project, please choose the option ["Use this Template"](https://github.com/felipepenha/py-greenhouse/generate). 6 | 7 | # Bugs, questions, or suggestions 8 | 9 | In case you found bugs, would like to ask a question, or have suggestions to offer, feel free to use the [Issues Section](https://github.com/felipepenha/py-greenhouse/issues) in the GitHub repository. 10 | 11 | 12 | # New Features and/or improvements 13 | 14 | 15 | ## Cloning 16 | 17 | Clone the repository locally: 18 | 19 | ```git 20 | $ git clone https://github.com/felipepenha/py-greenhouse.git 21 | $ git pull dev 22 | $ git branch dev 23 | ``` 24 | 25 | Start working on a new branch copied from the `dev` branch: 26 | 27 | ```git 28 | $ git checkout -b [new_branch_name] 29 | $ git branch --set-upstream-to=origin/dev [new_branch_name] 30 | ``` 31 | 32 | 33 | ## Adding and Commiting 34 | 35 | Use `git add [target]` and `git commit -m [message]` normally, at this point. Every time you commit, pre-commit hooks are triggered and your code will be linted and tested. If it fails on the first pass, you will need to git add again and commit. 36 | 37 | Alternatively, run `make add-commit` (see also [issue #17](https://github.com/felipepenha/py-greenhouse/issues/17)). 38 | 39 | 40 | ## Dealing With Inconsistencies 41 | 42 | In case you new branch gets behind `dev`, you may correct it by performing 43 | 44 | ```git 45 | $ git stash save 46 | $ git pull [new_branch_name] 47 | $ git stash pop 48 | ``` 49 | 50 | You may have to deal with the inconsistencies that may arise from that process before proceeding. 51 | 52 | If you want to make your branch available online: 53 | 54 | ```git 55 | $ git push origin [new_branch_name] 56 | ``` 57 | 58 | 59 | ## New Releases 60 | 61 | Instructions for a new release: 62 | 63 | 1. Check which is the latest version (Ex: `0.0.1`); 64 | 2. Change the field `version` in `version.toml` (Ex: `version="0.0.2"`); 65 | 3. Add your name and email address to the field `authors` in `version.toml`; and 66 | 4. 
Run: 67 | ```bash 68 | $ make release 69 | ``` 70 | 71 | The above command will take care of checking the version in `version.toml` and releasing your code on `dev` with a tag consistent with `version.toml`. 72 | 73 | ## Pull Requests to the Main Branch 74 | 75 | New releases will usually be pulled/merged to `main` and need approval. 76 | 77 | # Conventions 78 | 79 | ## Commit Messages 80 | 81 | [conventionalcommits.org v1.0.0](https://www.conventionalcommits.org/en/v1.0.0/) 82 | 83 | Some common commit messages you will find in the project: 84 | 85 | ```git 86 | "docs:" 87 | "fix:" 88 | "feat:" 89 | "refactor:" 90 | "test:" 91 | ``` 92 | 93 | 94 | ## Versioning 95 | 96 | [Semantic Versioning 2.0.0](https://semver.org/) 97 | 98 | ## Docstrings 99 | 100 | [Numpy Docstrings](https://numpydoc.readthedocs.io/en/latest/format.html) 101 | 102 | A useful template: 103 | 104 | ```python 105 | def func(x): 106 | """[Summary] 107 | 108 | Parameters 109 | ---------- 110 | x: type 111 | [description] 112 | 113 | Returns 114 | ------- 115 | type 116 | [description] 117 | 118 | Examples 119 | -------- 120 | 121 | Raises 122 | ------ 123 | 124 | Notes 125 | ----- 126 | 127 | """ 128 | ``` -------------------------------------------------------------------------------- /examples/palmer_penguins/src/modeling.py: -------------------------------------------------------------------------------- 1 | from sklearn import preprocessing 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.model_selection import RandomizedSearchCV 6 | import joblib 7 | 8 | 9 | class model: 10 | def __init__(self): 11 | 12 | pass 13 | 14 | def fit(self, train, y_col, x_col, n_jobs=1, seed=1): 15 | 16 | self.x_col = x_col 17 | self.y_col = y_col 18 | 19 | x_train = train[self.x_col].values 20 | 21 | self.le = preprocessing.LabelEncoder() 22 | 23 | # Trin encoder over training set 24 | (self.le).fit(train[self.y_col].values.ravel()) 25 | 26 | path = "/usr/app/models/label_encoder.joblib" 27 | 28 | joblib.dump(self.le, path) 29 | 30 | y_train = (self.le).transform(train[self.y_col].values.ravel()) 31 | 32 | # Store the grid in a dictionary 33 | grid = {} 34 | 35 | grid["max_features"] = [4, 5] 36 | grid["max_depth"] = [4, 5] 37 | grid["n_estimators"] = [50, 75, 200] 38 | 39 | clf = RandomForestClassifier(random_state=seed) 40 | 41 | self.clf_random = RandomizedSearchCV( 42 | estimator=clf, 43 | param_distributions=grid, 44 | n_iter=10, 45 | cv=None, 46 | verbose=2, 47 | random_state=seed, 48 | n_jobs=n_jobs, 49 | ) 50 | 51 | # Train model over training set 52 | (self.clf_random).fit(x_train, y_train.ravel()) 53 | 54 | path = "/usr/app/models/clf_random.joblib" 55 | 56 | joblib.dump(self.clf_random, path) 57 | 58 | def transform_sets(self, train, valid, test): 59 | 60 | x_train = train[self.x_col].values 61 | x_valid = valid[self.x_col].values 62 | x_test = test[self.x_col].values 63 | 64 | y_train = (self.le).transform(train[self.y_col].values.ravel()) 65 | y_valid = (self.le).transform(valid[self.y_col].values.ravel()) 66 | y_test = (self.le).transform(test[self.y_col].values.ravel()) 67 | 68 | train_out = train.copy(deep=True)[self.y_col] 69 | valid_out = valid.copy(deep=True)[self.y_col] 70 | test_out = test.copy(deep=True)[self.y_col] 71 | 72 | train_out["actual"] = y_train 73 | valid_out["actual"] = y_valid 74 | test_out["actual"] = y_test 75 | 76 | # Predict 77 | train_out["pred"] = (self.clf_random).predict(x_train) 78 | valid_out["pred"] = 
(self.clf_random).predict(x_valid) 79 | test_out["pred"] = (self.clf_random).predict(x_test) 80 | 81 | train_out["prob_0"], train_out["prob_1"], train_out["prob_2"] = np.transpose( 82 | (self.clf_random).predict_proba(x_train) 83 | ) 84 | valid_out["prob_0"], valid_out["prob_1"], valid_out["prob_2"] = np.transpose( 85 | (self.clf_random).predict_proba(x_valid) 86 | ) 87 | test_out["prob_0"], test_out["prob_1"], test_out["prob_2"] = np.transpose( 88 | (self.clf_random).predict_proba(x_test) 89 | ) 90 | 91 | return train_out, valid_out, test_out, (self.clf_random).best_params_ 92 | 93 | def transform_new(self, obs): 94 | """ 95 | obs: pandas dataframe 96 | """ 97 | 98 | x_obs = obs[self.x_col].values 99 | 100 | # Predict 101 | obs_out = pd.DataFrame({"pred": (self.clf_random).predict(x_obs)}) 102 | 103 | obs_out["prob_0"], obs_out["prob_1"], obs_out["prob_2"] = np.transpose( 104 | (self.clf_random).predict_proba(x_obs) 105 | ) 106 | 107 | return obs_out 108 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # The present file, 'Makefile' has been modified from the original at 2 | # https://github.com/NeowayLabs/data-science-template 3 | # under the folllowing license: 4 | # 5 | # MIT License 6 | # 7 | # Copyright (c) 2019 Neoway Business Solution 8 | # 9 | # Permission is hereby granted, free of charge, to any person obtaining a copy 10 | # of this software and associated documentation files (the "Software"), to deal 11 | # in the Software without restriction, including without limitation the rights 12 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | # copies of the Software, and to permit persons to whom the Software is 14 | # furnished to do so, subject to the following conditions: 15 | # 16 | # The above copyright notice and this permission notice shall be included in all 17 | # copies or substantial portions of the Software. 18 | # 19 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | # SOFTWARE. 26 | 27 | BUILD = docker-compose build 28 | RUN = docker-compose run 29 | VERSION = $(shell awk -F ' = ' '$$1 ~ /version/ { gsub(/[\"]/, "", $$2); printf("%s",$$2) }' version.toml) 30 | MAKEFILE_ABS_PATH = $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 31 | 32 | help: 33 | @echo "USAGE" 34 | @echo 35 | @echo " make " 36 | @echo " Include 'sudo' when necessary." 
37 | @echo " To avoid using sudo, follow the steps in" 38 | @echo " https://docs.docker.com/engine/install/linux-postinstall/" 39 | @echo 40 | @echo 41 | @echo "COMMANDS" 42 | @echo 43 | @echo " add-commit git add, pre-commit, and commit" 44 | @echo " bash bash REPL (Read-Eval-Print loop), suitable for debugging" 45 | @echo " build build image using cache" 46 | @echo " build-no-cache build image from scratch, and not from cache" 47 | @echo " docs show the src modules documentation on the browser" 48 | @echo " dvc runs dvc commands for model versioning and comparison" 49 | @echo " fastapi starts up fastapi" 50 | @echo " jupyter access Python through the Jupyter Notebook" 51 | @echo " palmer-penguins moves files in examples/palmer_penguins to the main dir" 52 | @echo " pre-commit early run of pre-commit git hooks" 53 | @echo " python3 access Python through the REPL (Read-Eval-Print loop)" 54 | @echo " release release on dev branch. \ 55 | Be sure to update version.toml before running this operation" 56 | @echo " run run src/main.py" 57 | @echo " vanilla moves files in examples/vanilla to the main dir" 58 | @echo " test run all tests using pytest (from within the container)" 59 | @echo " test-no-log same as test but without log generation" 60 | 61 | ################# 62 | # User Commands # 63 | ################# 64 | 65 | build: 66 | mkdir --parents monitor 67 | mkdir --parents logs 68 | mkdir --parent models 69 | $(BUILD) 70 | 71 | build-no-cache: 72 | mkdir --parents monitor 73 | mkdir --parents logs 74 | $(BUILD) --no-cache 75 | 76 | bash: 77 | $(RUN) bash 78 | 79 | python3: 80 | $(RUN) python3 81 | 82 | jupyter: 83 | $(RUN) --service-ports jupyter 84 | 85 | fastapi: 86 | $(RUN) --service-ports fastapi 87 | 88 | test-no-log: 89 | $(RUN) test 90 | 91 | test: 92 | # test and append log to file including datetime in UTC 93 | (date --utc && $(RUN) test) 2>&1 | tee -ai logs/log_test.txt 94 | 95 | run: 96 | # run and append log to file including datetime in UTC 97 | (date --utc && $(RUN) run) 2>&1 | tee -ai logs/log_run.txt 98 | 99 | pre-commit: 100 | pre-commit run --all-files 101 | 102 | dvc: 103 | - dvc checkout 104 | # DVC pipeline 105 | - dvc repro 106 | # Trigger dvc metrics diff file logging 107 | # $(compare-to) is the git rev you are comparing to 108 | dvc metrics diff --all $(compare-to) > logs/log_metrics_diff.txt 109 | 110 | add-commit: 111 | # `-` signalizes that errors will be ignored by make 112 | # Add all files in the current directory 113 | - git add . 114 | # Run hooks in `pre-commit` that cause file changes 115 | - pre-commit run check-toml 116 | - pre-commit run check-yaml 117 | - pre-commit run pretty-format-json 118 | - pre-commit run requirements-txt-fixer 119 | - pre-commit run black 120 | - pre-commit run flake8 121 | # Add currently tracked files (which have been modified) 122 | - git add --update 123 | # Commit with `--message "$(message)"`. 124 | # `pre-commit` will run once again, 125 | # but now for all hooks 126 | git commit --message="$(message)" 127 | 128 | release: 129 | # Create tag based on `version.toml` 130 | # `-` signalizes that errors will be ignored by make 131 | git tag --annotate $(VERSION) \ 132 | --message "VERSION=$(VERSION) read from `version.toml`" 133 | # Push from `HEAD` (on current branch) to `dev`, 134 | # using the tag created above. 135 | # Append log to file including datetime in UTC 136 | (date --utc && git push origin HEAD:dev tag $(VERSION)) \ 137 | 2>&1 | tee -ai logs/log_release.txt 138 | 139 | docs: 140 | # Auto documentation. 
141 | # references: https://pdoc.dev/ | https://calmcode.io/makefiles/phony-folders.html 142 | $(RUN) --service-ports docs 143 | 144 | vanilla: 145 | echo "" 146 | echo "COPYING FROM $(MAKEFILE_ABS_PATH)/examples/vanilla/** TO $(MAKEFILE_ABS_PATH)" 147 | echo "" 148 | cp -r $(MAKEFILE_ABS_PATH)/examples/vanilla/** $(MAKEFILE_ABS_PATH) 149 | 150 | palmer-penguins: 151 | echo "" 152 | echo "COPYING FROM $(MAKEFILE_ABS_PATH)/examples/palmer_penguins/** TO $(MAKEFILE_ABS_PATH)" 153 | echo "" 154 | cp -r $(MAKEFILE_ABS_PATH)/examples/palmer_penguins/** $(MAKEFILE_ABS_PATH) 155 | -------------------------------------------------------------------------------- /examples/palmer_penguins/src/main.py: -------------------------------------------------------------------------------- 1 | import greenhouse_clock 2 | import data_sourcing 3 | import data_splitting 4 | import data_preprocessing 5 | import feature_engineering 6 | import eda_monitoring 7 | import modeling 8 | import performance_monitoring 9 | 10 | from prefect import Flow, task, context 11 | 12 | import pandas as pd 13 | 14 | # Pandas options for better shell display 15 | pd.set_option("display.max_rows", 100) 16 | pd.set_option("display.max_columns", None) 17 | pd.set_option("display.width", None) 18 | 19 | start_time = greenhouse_clock.get_time() 20 | 21 | 22 | @task 23 | def sourcing(): 24 | 25 | return data_sourcing.get() 26 | 27 | 28 | @task 29 | def cleansing(df): 30 | 31 | return data_preprocessing.clean(df) 32 | 33 | 34 | @task 35 | def normalizing(df): 36 | 37 | return data_preprocessing.normalize(df) 38 | 39 | 40 | @task(nout=3) 41 | def splitting(df): 42 | 43 | return data_splitting.split(df) 44 | 45 | 46 | @task(nout=3) 47 | def one_hot(train, valid, test, cols): 48 | 49 | logger = context.get("logger") 50 | 51 | logger.info(train) 52 | 53 | train_hot, valid_hot, test_hot = feature_engineering.one_hot_encoding( 54 | train=train, 55 | valid=valid, 56 | test=test, 57 | cols=cols, 58 | ) 59 | 60 | train = train.join(train_hot) 61 | valid = valid.join(valid_hot) 62 | test = test.join(test_hot) 63 | 64 | logger.info(train) 65 | 66 | return train, valid, test 67 | 68 | 69 | @task(nout=3) 70 | def imputation(train, valid, test, cols, imputation_method): 71 | 72 | logger = context.get("logger") 73 | 74 | # Find rows where the numerical variables are nan 75 | mask = train[cols].isna() 76 | 77 | logger.info(train[mask]) 78 | 79 | train_imp, valid_imp, test_imp = feature_engineering.numerical_missing_imputation( 80 | train=train, 81 | valid=valid, 82 | test=test, 83 | cols=cols, 84 | imputation_method=imputation_method, 85 | ) 86 | 87 | train = train.join(train_imp, rsuffix="_imputed") 88 | valid = valid.join(valid_imp, rsuffix="_imputed") 89 | test = test.join(test_imp, rsuffix="_imputed") 90 | 91 | logger.info(train[mask]) 92 | 93 | return train, valid, test 94 | 95 | 96 | @task 97 | def eda(df, path, preffix, suffix): 98 | 99 | eda_monitoring.export_eda_report(df=df, path=path, preffix=preffix, suffix=suffix) 100 | 101 | pass 102 | 103 | 104 | @task(nout=5) 105 | def model(train, valid, test, obs, y_col, x_col): 106 | 107 | mo = modeling.model() 108 | 109 | mo.fit(train=train, y_col=y_col, x_col=x_col) 110 | 111 | lst = list(mo.transform_sets(train=train, valid=valid, test=test)) 112 | 113 | lst.append(mo.transform_new(obs=obs)) 114 | 115 | return lst 116 | 117 | 118 | @task 119 | def threshold(y_true, y_score): 120 | 121 | return performance_monitoring.optimal_threshold(y_true=y_true, y_score=y_score) 122 | 123 | 124 | @task 125 | def 
performance(y_true, y_score, best_hyperparams, path, opt_thr, suffix): 126 | 127 | return performance_monitoring.report_performance( 128 | y_true=y_true, 129 | y_score=y_score, 130 | best_hyperparams=best_hyperparams, 131 | path=path, 132 | opt_thr=opt_thr, 133 | suffix=suffix, 134 | ) 135 | 136 | 137 | @task 138 | def binarize(binary_map, series): 139 | 140 | return series.map(binary_map) 141 | 142 | 143 | @task 144 | def print_out(s): 145 | 146 | print(s) 147 | 148 | pass 149 | 150 | 151 | @task 152 | def df_to_csv(df, filename): 153 | 154 | df.to_csv(filename) 155 | 156 | pass 157 | 158 | 159 | # Define prefect flow 160 | with Flow("greenhouse") as flow: 161 | 162 | df = sourcing() 163 | df = cleansing(df) 164 | df = normalizing(df) 165 | train, valid, test = splitting(df) 166 | 167 | # eda( 168 | # df=train, 169 | # path="monitor/", 170 | # preffix=start_time, 171 | # suffix="before_feat_eng" 172 | # ) 173 | 174 | # Categorical 175 | cat_cols = [ 176 | "sex", 177 | ] 178 | 179 | train, valid, test = one_hot( 180 | train=train, 181 | valid=valid, 182 | test=test, 183 | cols=cat_cols, 184 | ) 185 | 186 | # Numerical 187 | num_cols = [ 188 | "bill_length_mm", 189 | "bill_depth_mm", 190 | "flipper_length_mm", 191 | "body_mass_g", 192 | ] 193 | 194 | train, valid, test = imputation( 195 | train=train, 196 | valid=valid, 197 | test=test, 198 | cols=num_cols, 199 | imputation_method="median", 200 | ) 201 | 202 | # eda( 203 | # df=train, 204 | # path="monitor/", 205 | # preffix=start_time, 206 | # suffix="after_feat_eng" 207 | # ) 208 | 209 | y_col = ["species"] 210 | 211 | x_col = [ 212 | "sex_male", 213 | "sex_female", 214 | "sex_na", 215 | "bill_length_mm_imputed", 216 | "bill_depth_mm_imputed", 217 | "flipper_length_mm_imputed", 218 | "body_mass_g_imputed", 219 | ] 220 | 221 | # `obs=test` just as an example here. 222 | # It should be actually new data, unseen by the model. 
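    # In particular, new observations must go through the same cleansing,
    # normalizing, and feature engineering steps as the training data,
    # because `modeling.model.transform_new` selects the engineered columns
    # listed in `x_col` (e.g. `sex_male`, `bill_length_mm_imputed`).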
223 | train, valid, test, best_hyperparams, new = model( 224 | train=train, 225 | valid=valid, 226 | test=test, 227 | obs=test, 228 | y_col=y_col, 229 | x_col=x_col, 230 | ) 231 | 232 | path = "data/" 233 | filename = path + "{}_predict_new.csv".format(start_time) 234 | 235 | df_to_csv(df=new, filename=filename) 236 | 237 | # Obtain the optimal threshold of 238 | # class 0 vs 1+2 239 | # from the training set 240 | opt_thr = threshold(y_true=train["actual"], y_score=train["prob_0"]) 241 | 242 | # class 0 --> 1 243 | # class 1 or class 2 --> 0 244 | 245 | binary_map = { 246 | 0: 1, 247 | 1: 0, 248 | 2: 0, 249 | } 250 | 251 | # Performance report over training set 252 | performance( 253 | y_true=binarize(binary_map=binary_map, series=train["actual"]), 254 | y_score=train["prob_0"], 255 | best_hyperparams=best_hyperparams, 256 | path="monitor/", 257 | opt_thr=opt_thr, 258 | suffix="_train", 259 | ) 260 | 261 | # Performance report over validation set 262 | performance( 263 | y_true=binarize(binary_map=binary_map, series=valid["actual"]), 264 | y_score=valid["prob_0"], 265 | best_hyperparams=best_hyperparams, 266 | path="monitor/", 267 | opt_thr=opt_thr, 268 | suffix="_valid", 269 | ) 270 | 271 | # Performance report over test set 272 | performance( 273 | y_true=binarize(binary_map=binary_map, series=test["actual"]), 274 | y_score=test["prob_0"], 275 | best_hyperparams=best_hyperparams, 276 | path="monitor/", 277 | opt_thr=opt_thr, 278 | suffix="_test", 279 | ) 280 | 281 | 282 | if __name__ == "__main__": 283 | 284 | # Run prefect flow 285 | flow.run() 286 | 287 | # Export flow as a PDF 288 | flow.visualize(filename="flow/prefect_flow") 289 | -------------------------------------------------------------------------------- /notebooks/relative_path_imports.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "57843254", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import sys\n", 12 | "module_path = os.path.abspath(os.path.join('..'))\n", 13 | "if module_path not in sys.path:\n", 14 | " sys.path.append(module_path)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "id": "07b4ecce", 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from src import data_sourcing\n", 25 | "from src import data_splitting\n", 26 | "from src import data_preprocessing\n", 27 | "from src import feature_engineering\n", 28 | "from src import monitoring\n", 29 | "from src import modeling" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "id": "f0c1d56e", 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/html": [ 41 | "
\n", 42 | "\n", 55 | "\n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | "
bill_length_mmbill_depth_mmflipper_length_mmbody_mass_gsexspecies
039.118.7181.03750.0maleAdelie
139.517.4186.03800.0femaleAdelie
240.318.0195.03250.0femaleAdelie
3NaNNaNNaNNaNNaNAdelie
436.719.3193.03450.0femaleAdelie
.....................
33955.819.8207.04000.0maleChinstrap
34043.518.1202.03400.0femaleChinstrap
34149.618.2193.03775.0maleChinstrap
34250.819.0210.04100.0maleChinstrap
34350.218.7198.03775.0femaleChinstrap
\n", 169 | "

344 rows × 6 columns

\n", 170 | "
" 171 | ], 172 | "text/plain": [ 173 | " bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex \\\n", 174 | "0 39.1 18.7 181.0 3750.0 male \n", 175 | "1 39.5 17.4 186.0 3800.0 female \n", 176 | "2 40.3 18.0 195.0 3250.0 female \n", 177 | "3 NaN NaN NaN NaN NaN \n", 178 | "4 36.7 19.3 193.0 3450.0 female \n", 179 | ".. ... ... ... ... ... \n", 180 | "339 55.8 19.8 207.0 4000.0 male \n", 181 | "340 43.5 18.1 202.0 3400.0 female \n", 182 | "341 49.6 18.2 193.0 3775.0 male \n", 183 | "342 50.8 19.0 210.0 4100.0 male \n", 184 | "343 50.2 18.7 198.0 3775.0 female \n", 185 | "\n", 186 | " species \n", 187 | "0 Adelie \n", 188 | "1 Adelie \n", 189 | "2 Adelie \n", 190 | "3 Adelie \n", 191 | "4 Adelie \n", 192 | ".. ... \n", 193 | "339 Chinstrap \n", 194 | "340 Chinstrap \n", 195 | "341 Chinstrap \n", 196 | "342 Chinstrap \n", 197 | "343 Chinstrap \n", 198 | "\n", 199 | "[344 rows x 6 columns]" 200 | ] 201 | }, 202 | "execution_count": 3, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "data_sourcing.get()" 209 | ] 210 | } 211 | ], 212 | "metadata": { 213 | "kernelspec": { 214 | "display_name": "Python 3", 215 | "language": "python", 216 | "name": "python3" 217 | }, 218 | "language_info": { 219 | "codemirror_mode": { 220 | "name": "ipython", 221 | "version": 3 222 | }, 223 | "file_extension": ".py", 224 | "mimetype": "text/x-python", 225 | "name": "python", 226 | "nbconvert_exporter": "python", 227 | "pygments_lexer": "ipython3", 228 | "version": "3.9.2" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 5 233 | } 234 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](/images/greenhouse_github_card_v02.png) 2 | 3 | # py-greenhouse 4 | 5 | A containerized Python framework for a better Data X development workflow. Where X = Science, Engineering, Analytics, etc. 6 | 7 | The name "Greenhouse" is a metaphor. A greenhouse is a structure made of glass to grow plants despite of external conditions such as a cold winter. Likewise, the Greenhouse framework builds a standalone container for Rust developmet which is fully transparent to the user. 8 | 9 | [Watch an overview clip on Twitch! 🖥️💜🖥️💜](https://www.twitch.tv/videos/1013368507) 10 | 11 | ![](/images/greenhouse_architecture_v03.png) 12 | 13 | 14 | # But what is a template? 15 | 16 | `py-greenhouse` is a GitHub template, not a package. This means that you will work on a copy of this project and you will replace placeholders by code that fits your own purposes. 17 | 18 | If you just want to use the Greenhouse template for your new cool Data X or Machine Learning project, please choose the option ["Use this Template"](https://github.com/felipepenha/py-greenhouse/generate). 19 | 20 | The current version of `py-greenhouse` uses the [Palmer Penguins dataset](https://github.com/mcnakhaee/palmerpenguins) called via an API (see [`src/data_sourcing.py`](https://github.com/felipepenha/py-greenhouse/blob/main/src/data_sourcing.py)). You may use other datasets, coming from different sources, and you may need to setup keys for cloud environment access, all of which are not covered here. 
21 | 22 | 23 | # Local OS Requirements 24 | 25 | These are requirements for your local machine, ideally a Debian Linux OS: 26 | 27 | ## - [docker](https://docs.docker.com/engine/install/) 28 | 29 | Follow the [instructions in the docker docs](https://docs.docker.com/engine/install/linux-postinstall/) to ensure that $USER has root access to docker. 30 | 31 | ## - [docker-compose](https://docs.docker.com/compose/install/) 32 | 33 | ## - VS Code 34 | 35 | In your local machine: 36 | 37 | 1. [install VS Code](https://code.visualstudio.com/docs/setup/linux), 38 | 39 | 2. install the [`ms-vscode-remote.remote-containers`](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension locally, 40 | 41 | A pop-up will open up asking if you would like to reload the workspace in the container: 42 | 43 | ![](/images/Screenshot_from_2021-03-07_18-31-36_VS-Code.png) 44 | 45 | After choosing "Reopen in Container", VS Code will open the "bash" docker-compose service in the greenhouse container, as specified in the manifest `.devcontainer.json`. 46 | 47 | Notice that VS Code will run intilization commands that may take some time to process. 48 | 49 | VS Code will already include the [`ms-python.python`](https://marketplace.visualstudio.com/items?itemName=ms-python.python) extension, without the need to install it in your own local machine. You may add any other extensions that you may need in your Python project in the configuration file `.devcontainer.json` . 50 | 51 | ## - [git](https://git-scm.com/download/linux) 52 | 53 | ``` 54 | sudo apt-get git 55 | ``` 56 | 57 | ## - make 58 | 59 | ``` 60 | sudo apt-get update 61 | sudo apt-get install build-essential 62 | ``` 63 | 64 | ## - awk 65 | ## - tee 66 | ## - touch 67 | 68 | ## - python3 69 | 70 | ``` 71 | sudo apt-get update 72 | sudo apt-get install python3 73 | ``` 74 | 75 | ## - pip3 76 | 77 | ``` 78 | sudo apt-get update 79 | sudo apt-get install python3-pip 80 | ``` 81 | 82 | ## - pre-commit 83 | 84 | ``` 85 | pip3 install pre-commit 86 | ``` 87 | 88 | In the main directory of the project where there is already a `.git/` subdirectory: 89 | 90 | ``` 91 | pre-commit install 92 | pre-commit migrate-config 93 | pre-commit autoupdate 94 | ``` 95 | 96 | The main directory may be either the locally cloned py-greenhouse or a project based on the github template. 97 | 98 | Alternatively, simply run in the terminal `make install-requirements`, to install the `pre-commit` Python package. 99 | 100 | ## - [dvc](https://dvc.org/doc/install/linux) 101 | 102 | ``` 103 | pip3 install dvc 104 | ``` 105 | 106 | ## Do I need to install any other requirements? 107 | 108 | No. After installing the basic local requirements described above, you are all set to run everything else inside a Docker container. 109 | 110 | # Quick Start 111 | 112 | This is a template repository. [Follow this link for instructions to create a repository from a template](https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/creating-a-repository-from-a-template#creating-a-repository-from-a-template). 113 | 114 | 115 | First, make sure `make`, `docker` and `docker-compose` are installed in your system. 116 | 117 | 118 | The greenhouse dev work is performed via `make` commands. 119 | 120 | 121 | To see the most up to date list of available commands run 122 | 123 | ```bash 124 | $ make help 125 | 126 | USAGE 127 | 128 | make 129 | Include 'sudo' when necessary. 
130 | To avoid using sudo, follow the steps in 131 | https://docs.docker.com/engine/install/linux-postinstall/ 132 | 133 | 134 | COMMANDS 135 | 136 | add-commit git add, pre-commit, and commit 137 | bash bash REPL (Read-Eval-Print loop), suitable for debugging 138 | build build image using cache 139 | build-no-cache build image from scratch, and not from cache 140 | docs show the src modules documentation on the browser 141 | fastapi starts up fastapi 142 | jupyter access Python through the Jupyter Notebook 143 | pre-commit early run of pre-commit git hooks 144 | python3 access Python through the REPL (Read-Eval-Print loop) 145 | release release on dev branch. Be sure to update version.toml before running this operation 146 | run run src/main.py 147 | test run all tests using pytest (from within the container) 148 | 149 | ``` 150 | 151 | 152 | To build your greenhouse (as it is), you first need to run: 153 | 154 | ```bash 155 | $ make build-no-cache 156 | ``` 157 | 158 | 159 | To access Jupyter in your local browser: 160 | 161 | ```bash 162 | $ make jupyter 163 | 164 | Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). 165 | 166 | To access the notebook, open this file in a browser: 167 | file:///root/.local/share/jupyter/runtime/nbserver-1-open.html 168 | Or copy and paste one of these URLs: 169 | http://...:8888/lab?token=... 170 | ``` 171 | 172 | 173 | Next, you simply need to follow the instructions printed out on your own terminal. 174 | 175 | 176 | In the generic example above, I would paste the following on my browser: 177 | 178 | ```bash 179 | http://...:8888/lab?token=... 180 | ``` 181 | 182 | 183 | Any changes made in the files within the Jupyter interface, for example saved changes in `.rs`, `.ipynb`, and `.py` files, will be reflected in the original files you store locally, and vice-versa. This is ensured by the fact that the whole greenhouse directory is set as a `volume` in the `docker-compose.yml` configuration file. 184 | 185 | 186 | You may also choose to run code using the REPL (Read-Eval-Print loop) in the terminal by running: 187 | 188 | ```bash 189 | $ make python3 190 | ``` 191 | 192 | 193 | Now, you are ready to start developing Python code by creating new `.py` files in the `/src` directory. 194 | 195 | 196 | During development phase, you can normally test out new code in a Jupyter Notebook. 197 | 198 | Check out additional notebooks in the `/notebooks` directory (`.ipynb` files with preffix `example_`). 199 | 200 | 201 | # Greenhouse Structure 202 | 203 | ```bash 204 | . 205 | ├── conftest.py 206 | ├── CONTRIBUTING.md 207 | ├── docker-compose.yml 208 | ├── Dockerfile 209 | ├── notebooks 210 | ├── flow 211 | ├── images 212 | ├── LICENSE 213 | ├── logs 214 | ├── Makefile 215 | ├── monitor 216 | ├── README.md 217 | ├── requirements.txt 218 | ├── src 219 | │ ├── data_preprocessing.py 220 | │ ├── data_sourcing.py 221 | │ ├── data_splitting.py 222 | │ ├── eda_monitoring.py 223 | │ ├── feature_engineering.py 224 | │ ├── greenhouse_clock.py 225 | │ ├── main.py 226 | │ ├── modeling.py 227 | │ ├── performance_monitoring.py 228 | ├── tests 229 | │ ├── test_data_sourcing.py 230 | │ ├── test_data_splitting.py 231 | │ └── test_feature_engineering.py 232 | └── version.toml 233 | ``` 234 | 235 | Highlights: 236 | 237 | * `notebooks/`: notebooks, usually Jupyter Notebooks not in production 238 | * `logs/`: dated logs, usually `.txt` files 239 | * `monitor/`: files exported for monitoring purposes (data, model performance, etc). 
usually `.html` or `.json`. 240 | * `flow/`: flow diagram as provided by `prefect` 241 | * `requirements.txt`: pip3 requirements for your project 242 | * `src/`: source directory for your Python project 243 | * `src/main.py`: main file where flow is defined 244 | * `test/`: tests of Python code. All tests will run automatically as pre-commit git hooks, in the container. 245 | * `version.toml`: information about your project, such as the version number to be used in the git tag pushed to the repo with `make release`. 246 | 247 | 248 | 249 | # Adding External Dependencies 250 | 251 | You need to include any external dependencies to the `requirements.txt` file in addition to the default list provided here. 252 | 253 | 254 | ## Continuous Integration / Continuous Delivery (CI/CD) 255 | 256 | Follow the instructins in [CONTRIBUTING.md](https://github.com/felipepenha/rust-greenhouse/blob/main/CONTRIBUTING.md). Be sure to update `version.toml` before each new release on the `dev` branch. 257 | 258 | ![](/images/greenhouse_architecture_gitops.png) 259 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------