├── .dockerignore
├── .env
├── .gitignore
├── README.md
├── build
│   └── mlflow
│       ├── Dockerfile
│       └── requirements.txt
├── docker-compose.yml
├── example.env
├── requirements.txt
├── run_pipeline.py
└── src
    └── pipelines
        └── example_training_pipeline.py

/.dockerignore:
--------------------------------------------------------------------------------
notebook
mlartifacts
mlruns

README.md
--------------------------------------------------------------------------------
/.env:
--------------------------------------------------------------------------------
# MLflow
MLFLOW_TRACKING_URI=http://mlflow:5000 # "mlflow" is the hostname of the service in docker compose

# Artifact storage
MLFLOW_S3_ENDPOINT_URL="http://minio:9000" # MinIO server used as artifact storage
AWS_ACCESS_KEY_ID="minio_root_user" # same as the MinIO root user
AWS_SECRET_ACCESS_KEY="minio_root_password" # same as the MinIO root password

# MinIO (S3-compatible storage)
MINIO_ROOT_USER=minio_root_user
MINIO_ROOT_PASSWORD=minio_root_password
MINIO_STORAGE_USE_HTTPS=false

MINIO_FIRST_BUCKET=mybucket

# PostgreSQL
POSTGRESS_USER=user_pg
POSTGRESS_PASSWORD=pass_pg
POSTGRESS_MLFLOW_DB=mlflow_db

# pgAdmin
PGADMIN_DEFAULT_EMAIL=admin@example.com
PGADMIN_DEFAULT_PASSWORD=admin
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
_venv

# .gitignore
# .dockerignore
# .env

__pycache__

mlartifacts
mlruns
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### MLOps and Data Science Project Template

---

- **MLflow** for model tracking and management.
- **MinIO** as open-source, S3-compatible storage for machine learning model artifacts.
- **PostgreSQL** as the backend database for MLflow metadata.

---

#### How to Run:

Create a Python virtual environment:

```console
python -m venv _venv
```

Activate the virtual environment (Windows; on Linux/macOS use `source _venv/bin/activate`):

```console
./_venv/Scripts/activate
```

Install the requirements:

```console
pip install -r requirements.txt
```

Start the services with Docker Compose:

```console
docker compose up -d
```

Once all services in the Docker Compose stack are running, open another terminal and run the pipeline on your local machine with:

```console
python run_pipeline.py
```

After all steps are done, you can check the results at:

- **MinIO Console:** http://localhost:9001 (the S3 API itself is on port 9000)
- **pgAdmin:** http://localhost:5050
- **MLflow Dashboard:** http://localhost:5000

If you don't know the usernames and passwords, check the configuration in **.env** or **docker-compose.yml**.
--------------------------------------------------------------------------------
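Beyond the browser checks listed in the README, the object storage can also be verified against the S3-compatible API directly. The sketch below is not a file in this template; it assumes the default credentials and bucket name from `example.env`, the port mapping from `docker-compose.yml`, and `boto3` installed in the local environment.

```python
# verify_bucket.py — hypothetical helper for a quick sanity check.
# Lists the buckets on the local MinIO instance via its S3-compatible API.
import boto3

s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9000",         # MinIO S3 API port from docker-compose.yml
    aws_access_key_id="minio_root_user",          # MINIO_ROOT_USER from example.env
    aws_secret_access_key="minio_root_password",  # MINIO_ROOT_PASSWORD from example.env
)

# "mybucket" should appear here once the minio_bucket init container has run
print([bucket["Name"] for bucket in s3.list_buckets()["Buckets"]])
```

If `mybucket` shows up in the output, the `minio_bucket` init container has done its job and MLflow artifact uploads should work.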
/build/mlflow/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.12-slim

WORKDIR /tmp

COPY requirements.txt /tmp

RUN pip install --no-cache-dir -r /tmp/requirements.txt
--------------------------------------------------------------------------------
/build/mlflow/requirements.txt:
--------------------------------------------------------------------------------
boto3==1.35.63
mlflow==2.20.3
psycopg2-binary==2.9.10
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3.8'

services:
  postgres:
    image: postgres:15
    container_name: postgres_db
    restart: always
    environment:
      POSTGRES_USER: ${POSTGRESS_USER}
      POSTGRES_PASSWORD: ${POSTGRESS_PASSWORD}
      POSTGRES_DB: ${POSTGRESS_MLFLOW_DB}
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data

  pgadmin:
    image: dpage/pgadmin4
    container_name: pgadmin
    restart: always
    environment:
      PGADMIN_DEFAULT_EMAIL: ${PGADMIN_DEFAULT_EMAIL}
      PGADMIN_DEFAULT_PASSWORD: ${PGADMIN_DEFAULT_PASSWORD}
    ports:
      - "5050:80"
    depends_on:
      - postgres

  minio:
    image: minio/minio
    container_name: minio
    restart: always
    environment:
      MINIO_ROOT_USER: ${MINIO_ROOT_USER}
      MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD}
      MINIO_STORAGE_USE_HTTPS: ${MINIO_STORAGE_USE_HTTPS}
    ports:
      - "9000:9000"
      # MinIO Console is available at http://localhost:9001
      - "9001:9001"
    volumes:
      - minio_data:/data
    command: server --console-address ":9001" /data

  # Create a bucket named "mybucket" if it doesn't exist
  minio_bucket:
    image: minio/mc
    container_name: create-minio-bucket
    depends_on:
      - minio
    entrypoint: >
      /bin/sh -c "
      until (/usr/bin/mc alias set minio http://minio:9000 ${MINIO_ROOT_USER} ${MINIO_ROOT_PASSWORD}) do sleep 1;
      done;
      /usr/bin/mc mb minio/mybucket;
      /usr/bin/mc policy set public minio/mybucket;
      exit 0;
      "

  mlflow:
    build: ./build/mlflow
    container_name: mlflow
    restart: always
    environment:
      MLFLOW_TRACKING_URI: ${MLFLOW_TRACKING_URI:-http://mlflow:5000}
      MLFLOW_S3_ENDPOINT_URL: http://minio:9000
      AWS_ACCESS_KEY_ID: ${MINIO_ROOT_USER}
      AWS_SECRET_ACCESS_KEY: ${MINIO_ROOT_PASSWORD}
    ports:
      - "5000:5000"
    command: >
      mlflow server
      --backend-store-uri postgresql://${POSTGRESS_USER}:${POSTGRESS_PASSWORD}@postgres:5432/${POSTGRESS_MLFLOW_DB}
      --artifacts-destination s3://mybucket/mlflow-artifacts
      --serve-artifacts
      --port 5000
      --host 0.0.0.0
    depends_on:
      - postgres
      - minio
      - minio_bucket


volumes:
  postgres_data:
  minio_data:
--------------------------------------------------------------------------------
/example.env:
--------------------------------------------------------------------------------
# MLflow
MLFLOW_TRACKING_URI=http://mlflow:5000 # "mlflow" is the hostname of the service in docker compose

# Artifact storage
MLFLOW_S3_ENDPOINT_URL="http://minio:9000" # MinIO server used as artifact storage
AWS_ACCESS_KEY_ID="minio_root_user" # same as the MinIO root user
AWS_SECRET_ACCESS_KEY="minio_root_password" # same as the MinIO root password

# MinIO (S3-compatible storage)
MINIO_ROOT_USER=minio_root_user
MINIO_ROOT_PASSWORD=minio_root_password
MINIO_STORAGE_USE_HTTPS=false

MINIO_FIRST_BUCKET=mybucket

# PostgreSQL
POSTGRESS_USER=user_pg
POSTGRESS_PASSWORD=pass_pg
POSTGRESS_MLFLOW_DB=mlflow_db

# pgAdmin
PGADMIN_DEFAULT_EMAIL=admin@example.com
PGADMIN_DEFAULT_PASSWORD=admin
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rezacrown/mlops-datascience-template/7a06597f41b5110c6460f42afc3e4e989865f1f4/requirements.txt
--------------------------------------------------------------------------------
/run_pipeline.py:
--------------------------------------------------------------------------------
from src.pipelines.example_training_pipeline import example_pipeline

import mlflow


def main():
    example_pipeline()


if __name__ == "__main__":
    # adjust to match your MLflow server URL
    mlflow.set_tracking_uri("http://localhost:5000")

    with mlflow.start_run():
        main()
--------------------------------------------------------------------------------
/src/pipelines/example_training_pipeline.py:
--------------------------------------------------------------------------------
# dataset import
from sklearn.datasets import load_breast_cancer

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# model import
from sklearn.linear_model import LogisticRegression

# model metrics
from sklearn.metrics import accuracy_score


# ZenML for pipeline standardization
from zenml import step, pipeline

# MLflow for logging and saving artifacts
import mlflow

# type hints
from typing import Any


@step
def load_or_ingest_data_step() -> dict[str, Any]:
    # load the dataset
    data = load_breast_cancer()

    X, y = data.data, data.target

    data = {"X": X, "y": y}

    return data


@step
def split_train_test_data_step(data) -> dict[str, Any]:
    # split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        data["X"], data["y"], test_size=0.2, random_state=42
    )

"X_train": X_train, 46 | "X_test": X_test, 47 | "y_train": y_train, 48 | "y_test": y_test, 49 | } 50 | 51 | return split_data 52 | 53 | 54 | @step 55 | def train_model_step(data): 56 | 57 | # feature engineering data 58 | sc = StandardScaler() 59 | X_train = sc.fit_transform(data["X_train"], data["y_train"]) 60 | 61 | # model Logisctic regression instance 62 | model = LogisticRegression() 63 | 64 | # model train 65 | model.fit(X=X_train, y=data["y_train"]) 66 | 67 | return model 68 | 69 | 70 | @step 71 | def evaluate_model_step(model, data): 72 | # get prediction 73 | y_pred = model.predict(data["X_test"]) 74 | 75 | # Evaluate model 76 | accuracy = accuracy_score(data["y_test"], y_pred) 77 | print(f"Accuracy of Model Logistic Regression: {accuracy:.4f}") 78 | 79 | # log to mlflow 80 | mlflow.log_param("Accuracy", accuracy) 81 | 82 | 83 | @pipeline(enable_cache=False) 84 | def example_pipeline(): 85 | mlflow.autolog() 86 | 87 | # step 1 load or ingest 88 | data = load_or_ingest_data_step() 89 | 90 | # step 2 split dataset into training and test 91 | splited_data = split_train_test_data_step(data) 92 | 93 | # step 3 feature engineering and train model 94 | model = train_model_step(splited_data) 95 | 96 | # step 4 prediction and evaluation model 97 | evaluate_model_step(model, splited_data) 98 | --------------------------------------------------------------------------------