├── .dockerignore
├── .env
├── .gitignore
├── README.md
├── build
    └── mlflow
    │   ├── Dockerfile
    │   └── requirements.txt
├── docker-compose.yml
├── example.env
├── requirements.txt
├── run_pipeline.py
└── src
    └── pipelines
        └── example_training_pipeline.py


/.dockerignore:
--------------------------------------------------------------------------------
1 | notebook
2 | mlartifacts
3 | mlruns
4 | 
5 | README.md


--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
 1 | # Mlflow
 2 | MLFLOW_TRACKING_URI=http://mlflow:5000 # using "mflow" as hostname of service in docker compose
 3 | 
 4 | # for Artifacts storage
 5 | MLFLOW_S3_ENDPOINT="http://minio:9000"# # minio server to connect as storage artifacts
 6 | AWS_ACCESS_KEY_ID="minio_root_user" # just use minio username
 7 | AWS_SECRET_ACCESS_KEY="minio_root_password" # just use minio password
 8 | 
 9 | # Minio or S3
10 | MINIO_ROOT_USER=minio_root_user
11 | MINIO_ROOT_PASSWORD=minio_root_password
12 | MINIO_STORAGE_USE_HTTPS=false
13 | 
14 | MINIO_FIRST_BUCKET=mybucket
15 | 
16 | # POSTGRESS
17 | POSTGRESS_USER=user_pg
18 | POSTGRESS_PASSWORD=pass_pg
19 | POSTGRESS_MLFLOW_DB=mlflow_db
20 | 
21 | # PG ADMIN
22 | PGADMIN_DEFAULT_EMAIL=admin@example.com
23 | PGADMIN_DEFAULT_PASSWORD=admin


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | _venv
 2 | 
 3 | # .gitignore
 4 | # .dockerignore
 5 | # .env
 6 | 
 7 | __pycache__
 8 | 
 9 | mlartifacts
10 | mlruns


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ### Template MLOps and Data Science Project
 2 | 
 3 | ---
 4 | 
 5 | - **MLFlow** for Model Tracking and Management.
 6 | - **Minio** Open Source S3 Storage for storing Artifacts Model Machine Learning.
 7 | - Implement **PostgreSQL** as Database for Storing Metadata MLFLow.
 8 | 
 9 | ---
10 | 
11 | #### How to Run:
12 | 
13 | make virtual environment for python:
14 | 
15 | ```console
16 | python -m venv _venv
17 | ```
18 | 
19 | activate virtual environment:
20 | 
21 | ```console
22 | ./_venv/Scripts/activate
23 | ```
24 | 
25 | Install requirements:
26 | 
27 | ```markdown
28 | pip install -r requirements.txt
29 | ```
30 | 
31 | Running Docker compose:
32 | 
33 | ```markdown
34 | docker compose up -d
35 | ```
36 | 
37 | after all services in docker compose is already running, then open other terminal and try running pipeline in local machine with use
38 | 
39 | ```markdown
40 | python run_pipeline.py
41 | ```
42 | 
43 | After all steps done you can check it:
44 | 
45 | - **Minio:** http://localhost:9000
46 | - **PG Admin:** http://localhost:5050
47 | - **MLFlow Dashboard:** http://localhost:5000
48 | 
49 | If you don't know user and password you can check configuration in **.env** or **docker-compose.yml**
50 | 


--------------------------------------------------------------------------------
/build/mlflow/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.12-slim
2 | 
3 | WORKDIR /tmp
4 | 
5 | COPY requirements.txt /tmp
6 | 
7 | RUN pip install --no-cache-dir -r /tmp/requirements.txt 


--------------------------------------------------------------------------------
/build/mlflow/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3==1.35.63
2 | mlflow==2.20.3
3 | psycopg2-binary==2.9.10


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: '3.8'
 2 | 
 3 | services:
 4 |   postgres:
 5 |     image: postgres:15
 6 |     container_name: postgres_db
 7 |     restart: always
 8 |     environment:
 9 |       POSTGRES_USER: ${POSTGRESS_USER}
10 |       POSTGRES_PASSWORD: ${POSTGRESS_PASSWORD}
11 |       POSTGRES_DB: ${POSTGRESS_MLFLOW_DB}
12 |     ports:
13 |       - "5432:5432"
14 |     volumes:
15 |       - postgres_data:/var/lib/postgresql/data
16 | 
17 |   pgadmin:
18 |     image: dpage/pgadmin4
19 |     container_name: pgadmin
20 |     restart: always
21 |     environment:
22 |       PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL}
23 |       PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD}
24 |     ports:
25 |       - "5050:80"
26 |     depends_on:
27 |       - postgres
28 | 
29 |   minio:
30 |     image: minio/minio
31 |     container_name: minio
32 |     restart: always
33 |     environment:
34 |       MINIO_ROOT_USER: ${MINIO_ROOT_USER} 
35 |       MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD} 
36 |       MINIO_STORAGE_USE_HTTPS: ${MINIO_STORAGE_USE_HTTPS}
37 |     ports:
38 |       - "9000:9000"
39 |       # MinIO Console is available at http://localhost:9001
40 |       - "9001:9001"
41 |     volumes:
42 |       - minio_data:/data
43 |     command: server --console-address ":9001" /data
44 | 
45 |   # Create a bucket named "bucket" if it doesn't exist
46 |   minio_bucket:
47 |     image: minio/mc
48 |     container_name: create-minio-bucket
49 |     depends_on:
50 |       - minio
51 |     entrypoint: >
52 |         /bin/sh -c "
53 |           until (/usr/bin/mc alias set minio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD}) do sleep 1; 
54 |           done;
55 |           /usr/bin/mc mb minio/mybucket;
56 |           /usr/bin/mc policy set public minio/mybucket;
57 |           exit 0;
58 |         "
59 | 
60 |   mlflow:
61 |     build: ./build/mlflow
62 |     container_name: mlflow
63 |     restart: always
64 |     environment:
65 |       MLFLOW_TRACKING_URI: ${MLFLOW_TRACKING_URI:-http://mlflow:5000} 
66 |       MLFLOW_S3_ENDPOINT_URL: http://minio:9000 
67 |       AWS_ACCESS_KEY_ID: ${MINIO_ROOT_USER}
68 |       AWS_SECRET_ACCESS_KEY: ${MINIO_ROOT_PASSWORD}
69 |     ports:
70 |       - "5000:5000"          
71 |     command: > 
72 |       mlflow server 
73 |       --backend-store-uri postgresql://${POSTGRESS_USER}:${POSTGRESS_PASSWORD}@postgres:5432/${POSTGRESS_MLFLOW_DB}
74 |       --artifacts-destination s3://mybucket/mlflow-artifacts
75 |       --serve-artifacts
76 |       --port 5000
77 |       --host 0.0.0.0
78 |     depends_on:
79 |       - postgres
80 |       - minio
81 |       - minio_bucket
82 | 
83 | 
84 | volumes:
85 |   postgres_data:
86 |   minio_data:
87 | 
88 | 


--------------------------------------------------------------------------------
/example.env:
--------------------------------------------------------------------------------
 1 | # Mlflow
 2 | MLFLOW_TRACKING_URI="http://mlflow:5000 # using "mflow" as hostname of service in docker compose"
 3 | 
 4 | # for Artifacts storage
 5 | MLFLOW_S3_ENDPOINT="http://minio:9000"# # minio server to connect as storage artifacts
 6 | AWS_ACCESS_KEY_ID="minio_root_user" # just use minio username
 7 | AWS_SECRET_ACCESS_KEY="minio_root_password" # just use minio password
 8 | 
 9 | # Minio or S3
10 | MINIO_ROOT_USER=minio_root_user
11 | MINIO_ROOT_PASSWORD=minio_root_password
12 | MINIO_STORAGE_USE_HTTPS=false
13 | 
14 | MINIO_FIRST_BUCKET=mybucket
15 | 
16 | # POSTGRESS
17 | POSTGRESS_USER=user_pg
18 | POSTGRESS_PASSWORD=pass_pg
19 | POSTGRESS_MLFLOW_DB=mlflow_db
20 | 
21 | # PG ADMIN
22 | PGADMIN_DEFAULT_EMAIL=admin@example.com
23 | PGADMIN_DEFAULT_PASSWORD=admin


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rezacrown/mlops-datascience-template/7a06597f41b5110c6460f42afc3e4e989865f1f4/requirements.txt


--------------------------------------------------------------------------------
/run_pipeline.py:
--------------------------------------------------------------------------------
 1 | from src.pipelines.example_training_pipeline import example_pipeline
 2 | 
 3 | import mlflow
 4 | 
 5 | 
 6 | def main():
 7 |     example_pipeline()
 8 | 
 9 | 
10 | if __name__ == "__main__":
11 |     # adjust with mlflow server url
12 |     mlflow.set_tracking_uri("http://localhost:5000")
13 | 
14 |     with mlflow.start_run():
15 |         main()
16 | 


--------------------------------------------------------------------------------
/src/pipelines/example_training_pipeline.py:
--------------------------------------------------------------------------------
 1 | # import dataset
 2 | from sklearn.datasets import load_breast_cancer
 3 | 
 4 | # for preprocessing
 5 | from sklearn.model_selection import train_test_split
 6 | from sklearn.preprocessing import StandardScaler
 7 | 
 8 | # model impoort
 9 | from sklearn.linear_model import LogisticRegression
10 | 
11 | # model metrics
12 | from sklearn.metrics import accuracy_score
13 | 
14 | 
15 | # zenml for pipeline standarianization
16 | from zenml import step, pipeline
17 | 
18 | # mlflow for logging and save artifacts
19 | import mlflow
20 | 
21 | # type casting
22 | from typing import Any
23 | 
24 | 
25 | @step
26 | def load_or_ingest_data_step() -> dict[str, Any]:
27 |     # load the dataset
28 |     data = load_breast_cancer()
29 | 
30 |     X, y = data.data, data.target
31 | 
32 |     data = {"X": X, "y": y}
33 | 
34 |     return data
35 | 
36 | 
37 | @step
38 | def split_train_test_data_step(data) -> dict[str, Any]:
39 |     # Split dataset into training and test sets
40 |     X_train, X_test, y_train, y_test = train_test_split(
41 |         data["X"], data["y"], test_size=0.2, random_state=42
42 |     )
43 | 
44 |     split_data = {
45 |         "X_train": X_train,
46 |         "X_test": X_test,
47 |         "y_train": y_train,
48 |         "y_test": y_test,
49 |     }
50 | 
51 |     return split_data
52 | 
53 | 
54 | @step
55 | def train_model_step(data):
56 | 
57 |     # feature engineering data
58 |     sc = StandardScaler()
59 |     X_train = sc.fit_transform(data["X_train"], data["y_train"])
60 | 
61 |     # model Logisctic regression instance
62 |     model = LogisticRegression()
63 | 
64 |     # model train
65 |     model.fit(X=X_train, y=data["y_train"])
66 | 
67 |     return model
68 | 
69 | 
70 | @step
71 | def evaluate_model_step(model, data):
72 |     # get prediction
73 |     y_pred = model.predict(data["X_test"])
74 | 
75 |     # Evaluate model
76 |     accuracy = accuracy_score(data["y_test"], y_pred)
77 |     print(f"Accuracy of Model Logistic Regression: {accuracy:.4f}")
78 | 
79 |     # log to mlflow
80 |     mlflow.log_param("Accuracy", accuracy)
81 | 
82 | 
83 | @pipeline(enable_cache=False)
84 | def example_pipeline():
85 |     mlflow.autolog()
86 | 
87 |     # step 1 load or ingest
88 |     data = load_or_ingest_data_step()
89 | 
90 |     # step 2 split dataset into training and test
91 |     splited_data = split_train_test_data_step(data)
92 | 
93 |     # step 3 feature engineering and train model
94 |     model = train_model_step(splited_data)
95 | 
96 |     # step 4 prediction and evaluation model
97 |     evaluate_model_step(model, splited_data)
98 | 


--------------------------------------------------------------------------------