├── backend ├── requirements.txt ├── __pycache__ │ └── main.cpython-310.pyc ├── Dockerfile └── main.py ├── streamlit_app ├── requirements.txt ├── Dockerfile └── app.py ├── ml_service ├── requirements.txt ├── tests │ └── test_predictor.py ├── train.py └── main.py ├── mlflow └── Dockerfile ├── debezium-connector-config.json ├── postgres └── init.sql ├── docker-compose.yml ├── README.md └── add_cars.sh /backend/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.104.1 2 | uvicorn==0.24.0 3 | sqlalchemy==2.0.23 4 | psycopg2-binary==2.9.9 5 | pydantic==2.5.2 -------------------------------------------------------------------------------- /streamlit_app/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.31.0 2 | pandas==2.0.3 3 | requests==2.31.0 4 | plotly==5.18.0 5 | scikit-learn==1.3.0 -------------------------------------------------------------------------------- /backend/__pycache__/main.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stefen-Taime/car-price-predictor/HEAD/backend/__pycache__/main.cpython-310.pyc -------------------------------------------------------------------------------- /ml_service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==2.0.3 2 | scikit-learn==1.3.0 3 | mlflow==2.8.0 4 | kafka-python==2.0.2 5 | psycopg2-binary==2.9.9 6 | boto3==1.28.0 7 | python-dotenv==1.0.0 8 | pytest==7.4.0 9 | numpy>=1.24.0 10 | requests>=2.31.0 -------------------------------------------------------------------------------- /mlflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y \ 5 | curl \ 6 | wget \ 7 | mc \ 8 | && apt-get clean \ 9 | && rm -rf /var/lib/apt/lists/* 10 
| 11 | RUN pip install --no-cache-dir \ 12 | mlflow==2.8.1 \ 13 | psycopg2-binary \ 14 | boto3 \ 15 | pymysql 16 | 17 | EXPOSE 5000 18 | 19 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 20 | CMD curl --fail http://localhost:5000/health || exit 1 -------------------------------------------------------------------------------- /backend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | WORKDIR /app 4 | 5 | # Install system dependencies 6 | RUN apt-get update && \ 7 | apt-get install -y \ 8 | curl \ 9 | && apt-get clean \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | COPY requirements.txt . 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | COPY . . 16 | 17 | EXPOSE 8000 18 | 19 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 20 | CMD curl --fail http://localhost:8000/health || exit 1 21 | 22 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] -------------------------------------------------------------------------------- /streamlit_app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y \ 7 | wget \ 8 | && apt-get clean \ 9 | && rm -rf /var/lib/apt/lists/* 10 | 11 | COPY requirements.txt . 12 | RUN pip install --no-cache-dir -r requirements.txt 13 | 14 | COPY . . 
15 | 16 | EXPOSE 8501 17 | 18 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 19 | CMD wget --no-verbose --tries=1 --spider http://localhost:8501 || exit 1 20 | 21 | ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"] -------------------------------------------------------------------------------- /debezium-connector-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cars-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "tasks.max": "1", 6 | "database.hostname": "postgres", 7 | "database.port": "5432", 8 | "database.user": "postgres", 9 | "database.password": "postgres123", 10 | "database.dbname": "cars_db", 11 | "database.server.name": "cars", 12 | "topic.prefix": "cars-db", 13 | "schema.include.list": "public", 14 | "table.include.list": "public.listings", 15 | "plugin.name": "pgoutput" 16 | } 17 | } -------------------------------------------------------------------------------- /ml_service/tests/test_predictor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | import numpy as np 4 | from main import CarPricePredictor 5 | 6 | @pytest.fixture 7 | def predictor(): 8 | return CarPricePredictor() 9 | 10 | def test_preprocess_data(predictor): 11 | predictor.load_model_and_artifacts() 12 | 13 | test_data = { 14 | "model": "Fiesta", 15 | "year": 2020, 16 | "transmission": "Manual", 17 | "fuelType": "Petrol", 18 | "mileage": 10000, 19 | "tax": 150, 20 | "mpg": 50.0, 21 | "engineSize": 1.0 22 | } 23 | 24 | result = predictor.preprocess_data(test_data) 25 | 26 | assert isinstance(result, pd.DataFrame) 27 | assert not result.isnull().values.any() -------------------------------------------------------------------------------- /postgres/init.sql: 
-------------------------------------------------------------------------------- 1 | -- Create databases 2 | CREATE DATABASE mlflow; 3 | CREATE DATABASE cars_db; 4 | 5 | -- Connect to mlflow database and set up permissions 6 | \c mlflow; 7 | GRANT ALL PRIVILEGES ON DATABASE mlflow TO postgres; 8 | CREATE SCHEMA IF NOT EXISTS public; 9 | GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO postgres; 10 | GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO postgres; 11 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL PRIVILEGES ON TABLES TO postgres; 12 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL PRIVILEGES ON SEQUENCES TO postgres; 13 | 14 | -- Connect to cars_db and set up the listings table and permissions 15 | \c cars_db; 16 | 17 | -- Create the listings table 18 | CREATE TABLE IF NOT EXISTS listings ( 19 | id SERIAL PRIMARY KEY, 20 | model VARCHAR(100), 21 | year INTEGER, 22 | price DECIMAL, 23 | transmission VARCHAR(50), 24 | mileage INTEGER, 25 | fuelType VARCHAR(50), 26 | tax DECIMAL, 27 | mpg DECIMAL, 28 | engineSize DECIMAL, 29 | predicted_price DECIMAL 30 | ); 31 | 32 | -- Grant permissions for cars_db 33 | GRANT ALL PRIVILEGES ON DATABASE cars_db TO postgres; 34 | GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO postgres; 35 | GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO postgres; 36 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL PRIVILEGES ON TABLES TO postgres; 37 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL PRIVILEGES ON SEQUENCES TO postgres; -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | # PostgreSQL - Source Database 5 | postgres: 6 | image: postgres:15 7 | environment: 8 | POSTGRES_DB: cars_db 9 | POSTGRES_USER: postgres 10 | POSTGRES_PASSWORD: postgres123 11 | command: ["postgres", "-c", "wal_level=logical"] 12 | 
ports: 13 | - "5432:5432" 14 | volumes: 15 | - postgres_data:/var/lib/postgresql/data 16 | - ./postgres/init.sql:/docker-entrypoint-initdb.d/init.sql 17 | healthcheck: 18 | test: ["CMD-SHELL", "pg_isready -U postgres"] 19 | interval: 10s 20 | timeout: 5s 21 | retries: 5 22 | 23 | 24 | # Zookeeper 25 | zookeeper: 26 | image: confluentinc/cp-zookeeper:7.4.0 27 | environment: 28 | ZOOKEEPER_CLIENT_PORT: 2181 29 | ZOOKEEPER_TICK_TIME: 2000 30 | ports: 31 | - "2181:2181" 32 | healthcheck: 33 | test: echo srvr | nc zookeeper 2181 || exit 1 34 | interval: 10s 35 | timeout: 5s 36 | retries: 5 37 | 38 | # Kafka 39 | kafka: 40 | image: confluentinc/cp-kafka:7.4.0 41 | depends_on: 42 | zookeeper: 43 | condition: service_healthy 44 | ports: 45 | - "9092:9092" 46 | environment: 47 | KAFKA_BROKER_ID: 1 48 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 49 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 50 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 51 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 52 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 53 | healthcheck: 54 | test: ["CMD-SHELL", "kafka-topics --bootstrap-server localhost:9092 --list"] 55 | interval: 30s 56 | timeout: 10s 57 | retries: 3 58 | 59 | # Debezium Connect 60 | connect: 61 | image: debezium/connect:2.4 62 | depends_on: 63 | kafka: 64 | condition: service_healthy 65 | postgres: 66 | condition: service_healthy 67 | ports: 68 | - "8083:8083" 69 | environment: 70 | BOOTSTRAP_SERVERS: kafka:29092 71 | GROUP_ID: "1" 72 | CONFIG_STORAGE_TOPIC: connect_configs 73 | OFFSET_STORAGE_TOPIC: connect_offsets 74 | STATUS_STORAGE_TOPIC: connect_statuses 75 | KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 76 | VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 77 | healthcheck: 78 | test: ["CMD", "curl", "-f", "http://localhost:8083/"] 79 | interval: 30s 80 | timeout: 10s 81 | retries: 3 82 | 83 | # MinIO (S3-compatible storage) 84 | 
minio: 85 | image: minio/minio 86 | ports: 87 | - "9000:9000" 88 | - "9001:9001" 89 | environment: 90 | MINIO_ROOT_USER: minio 91 | MINIO_ROOT_PASSWORD: minio123 92 | command: server /data --console-address ":9001" 93 | volumes: 94 | - minio_data:/data 95 | healthcheck: 96 | test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] 97 | interval: 30s 98 | timeout: 20s 99 | retries: 3 100 | 101 | # MLflow 102 | mlflow: 103 | build: 104 | context: ./mlflow 105 | dockerfile: Dockerfile 106 | ports: 107 | - "5000:5000" 108 | environment: 109 | MLFLOW_S3_ENDPOINT_URL: http://minio:9000 110 | AWS_ACCESS_KEY_ID: minio 111 | AWS_SECRET_ACCESS_KEY: minio123 112 | depends_on: 113 | minio: 114 | condition: service_healthy 115 | postgres: 116 | condition: service_healthy 117 | healthcheck: 118 | test: curl --fail http://localhost:5000/health || exit 1 119 | interval: 30s 120 | timeout: 10s 121 | retries: 5 122 | start_period: 30s 123 | command: | 124 | sh -c ' 125 | mc config host add minio http://minio:9000 minio minio123 && 126 | mc mb minio/mlflow || true && 127 | mlflow server \ 128 | --backend-store-uri postgresql://postgres:postgres123@postgres:5432/mlflow \ 129 | --default-artifact-root s3://mlflow/ \ 130 | --host 0.0.0.0 \ 131 | --port 5000 \ 132 | --serve-artifacts 133 | ' 134 | 135 | # Redpanda Console (Kafka Web UI) 136 | kafka-ui: 137 | image: redpandadata/console:v2.4.3 138 | ports: 139 | - "8080:8080" 140 | depends_on: 141 | kafka: 142 | condition: service_healthy 143 | environment: 144 | KAFKA_BROKERS: kafka:29092 145 | SERVER_LISTENPORT: 8080 146 | AUTH_PROVIDER: none 147 | CONNECT_ENABLED: "true" 148 | CONNECT_CLUSTERS_NAME: "kafka-connect" 149 | CONNECT_CLUSTERS_URL: "http://connect:8083" 150 | healthcheck: 151 | test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080"] 152 | interval: 30s 153 | timeout: 10s 154 | retries: 5 155 | start_period: 30s 156 | 157 | # Adminer Console (Postgres Web UI) 158 | adminer: 159 | 
image: adminer:latest 160 | ports: 161 | - "8081:8080" 162 | depends_on: 163 | postgres: 164 | condition: service_healthy 165 | environment: 166 | ADMINER_DEFAULT_SERVER: postgres 167 | ADMINER_DESIGN: pepa-linha 168 | ADMINER_DEFAULT_DB: cars_db 169 | ADMINER_DEFAULT_USER: postgres 170 | ADMINER_DEFAULT_PASSWORD: postgres123 171 | restart: always 172 | 173 | volumes: 174 | postgres_data: 175 | minio_data: 176 | 177 | networks: 178 | default: 179 | driver: bridge -------------------------------------------------------------------------------- /ml_service/train.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import mlflow 3 | import boto3 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.ensemble import RandomForestRegressor 6 | from sklearn.preprocessing import StandardScaler, LabelEncoder 7 | from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error 8 | import numpy as np 9 | import os 10 | import pickle 11 | import logging 12 | from pathlib import Path 13 | from botocore.client import Config 14 | import warnings 15 | 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | warnings.filterwarnings('ignore', category=UserWarning) 21 | warnings.filterwarnings('ignore', category=FutureWarning) 22 | 23 | def setup_minio(): 24 | """Setup MinIO connection and ensure bucket exists""" 25 | try: 26 | s3_client = boto3.client( 27 | 's3', 28 | endpoint_url='http://localhost:9000', 29 | aws_access_key_id='minio', 30 | aws_secret_access_key='minio123', 31 | config=Config(signature_version='s3v4'), 32 | region_name='us-east-1' 33 | ) 34 | 35 | try: 36 | s3_client.head_bucket(Bucket='mlflow') 37 | logger.info("MLflow bucket exists") 38 | except: 39 | s3_client.create_bucket(Bucket='mlflow') 40 | logger.info("Created MLflow bucket") 41 | 42 | except Exception as e: 43 | logger.error(f"Error setting up MinIO: {str(e)}") 44 | 
raise 45 | 46 | def prepare_data(df): 47 | """Prepare and preprocess the data""" 48 | numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns 49 | for col in numeric_columns: 50 | df[col] = df[col].astype('float64') 51 | 52 | X = df.drop('price', axis=1) 53 | y = df['price'] 54 | 55 | label_encoders = {} 56 | categorical_columns = ['model', 'transmission', 'fuelType'] 57 | for column in categorical_columns: 58 | label_encoders[column] = LabelEncoder() 59 | X[column] = label_encoders[column].fit_transform(X[column]) 60 | 61 | return X, y, label_encoders 62 | 63 | def train_model(): 64 | mlflow.set_tracking_uri("http://localhost:5000") 65 | 66 | os.environ['AWS_ACCESS_KEY_ID'] = 'minio' 67 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123' 68 | os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://localhost:9000' 69 | 70 | setup_minio() 71 | mlflow.set_experiment("car-price-prediction") 72 | 73 | current_dir = Path(__file__).parent.parent 74 | data_path = current_dir / 'data' / 'ford.csv' 75 | 76 | logger.info(f"Loading data from {data_path}") 77 | if not data_path.exists(): 78 | raise FileNotFoundError(f"Data file not found at {data_path}") 79 | 80 | df = pd.read_csv(data_path) 81 | 82 | X, y, label_encoders = prepare_data(df) 83 | 84 | encoder_path = Path(__file__).parent / 'label_encoders.pkl' 85 | with open(encoder_path, 'wb') as f: 86 | pickle.dump(label_encoders, f) 87 | 88 | numerical_features = ['year', 'mileage', 'tax', 'mpg', 'engineSize'] 89 | scaler = StandardScaler() 90 | 91 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 92 | 93 | X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features]) 94 | X_test[numerical_features] = scaler.transform(X_test[numerical_features]) 95 | 96 | scaler_path = Path(__file__).parent / 'scaler.pkl' 97 | with open(scaler_path, 'wb') as f: 98 | pickle.dump(scaler, f) 99 | 100 | with mlflow.start_run(): 101 | params = { 102 | 'n_estimators': 100, 103 | 
'max_depth': 10, 104 | 'min_samples_split': 2, 105 | 'min_samples_leaf': 1, 106 | 'random_state': 42 107 | } 108 | 109 | logger.info("Training Random Forest model...") 110 | rf = RandomForestRegressor(**params) 111 | rf.fit(X_train, y_train) 112 | 113 | y_pred = rf.predict(X_test) 114 | metrics = { 115 | 'rmse': np.sqrt(mean_squared_error(y_test, y_pred)), 116 | 'mae': mean_absolute_error(y_test, y_pred), 117 | 'r2': r2_score(y_test, y_pred) 118 | } 119 | 120 | logger.info(f"Model metrics: {metrics}") 121 | 122 | mlflow.log_params(params) 123 | mlflow.log_metrics(metrics) 124 | 125 | signature = mlflow.models.signature.infer_signature(X_train, rf.predict(X_train)) 126 | 127 | mlflow.sklearn.log_model( 128 | rf, 129 | "model", 130 | registered_model_name="car_price_predictor", 131 | signature=signature 132 | ) 133 | 134 | mlflow.log_artifact(str(encoder_path)) 135 | mlflow.log_artifact(str(scaler_path)) 136 | 137 | logger.info("Model and artifacts logged successfully") 138 | 139 | client = mlflow.tracking.MlflowClient() 140 | model_version = client.search_model_versions("name='car_price_predictor'")[0] 141 | 142 | if model_version.current_stage != "Production": 143 | client.set_registered_model_alias( 144 | name="car_price_predictor", 145 | alias="production", 146 | version=model_version.version 147 | ) 148 | logger.info("Model set as production version") 149 | 150 | if __name__ == "__main__": 151 | try: 152 | train_model() 153 | except Exception as e: 154 | logger.error(f"Training failed: {str(e)}") 155 | raise -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Car Price Predictor 2 | 3 | A machine learning system for predicting car prices using MLflow and Streamlit. This project implements a complete pipeline for predicting car prices with an interactive web interface and real-time visualization. 
4 | 5 | ## Architecture 6 | 7 | - **Frontend**: Streamlit application for data visualization and interaction 8 | - **Backend**: FastAPI REST API for data management 9 | - **ML Pipeline**: MLflow for model management and serving 10 | - **Storage**: PostgreSQL for data storage, MinIO for model artifacts 11 | 12 | ```mermaid 13 | graph TB 14 | UI[Streamlit UI:8501] --> API[FastAPI:8000] 15 | API --> DB[(PostgreSQL:5432)] 16 | DB --> DEB[Debezium:8083] 17 | DEB --> KAFKA[Kafka:9092] 18 | KAFKA --> ML[ML Service] 19 | ML --> MLFLOW[MLflow:5000] 20 | MLFLOW --> MINIO[(MinIO:9000)] 21 | ML --> KAFKA 22 | KAFKA --> API 23 | API --> DB 24 | 25 | ADMIN[Adminer:8081] --> DB 26 | KAFKAUI[Kafka UI:8080] --> KAFKA 27 | ZK[Zookeeper:2181] --> KAFKA 28 | 29 | subgraph "User Interface" 30 | UI 31 | ADMIN 32 | KAFKAUI 33 | end 34 | 35 | subgraph "Storage" 36 | DB 37 | MINIO 38 | end 39 | 40 | subgraph "Processing" 41 | KAFKA 42 | DEB 43 | ML 44 | MLFLOW 45 | ZK 46 | end 47 | 48 | style UI fill:#2563eb,stroke:#1d4ed8,color:#fff 49 | style API fill:#2563eb,stroke:#1d4ed8,color:#fff 50 | style DB fill:#059669,stroke:#047857,color:#fff 51 | style MINIO fill:#059669,stroke:#047857,color:#fff 52 | style KAFKA fill:#4b5563,stroke:#374151,color:#fff 53 | style ML fill:#7c3aed,stroke:#6d28d9,color:#fff 54 | style MLFLOW fill:#7c3aed,stroke:#6d28d9,color:#fff 55 | 56 | ``` 57 | 58 | 59 | ## Prerequisites 60 | 61 | - Docker and Docker Compose 62 | - Python 3.9+ (for local development) 63 | 64 | ## Quick Start 65 | 66 | 1. Clone the repository: 67 | ```bash 68 | git clone https://github.com/Stefen-Taime/car-price-predictor 69 | cd car-price-predictor 70 | ``` 71 | 72 | 2. Start the services: 73 | ```bash 74 | docker-compose up --build 75 | ``` 76 | 77 | 3. Train the initial model: 78 | ```bash 79 | cd ml_service 80 | python train.py 81 | ``` 82 | 83 | 4. Start the FastAPI backend: 84 | ```bash 85 | cd backend 86 | uvicorn main:app --reload 87 | ``` 88 | 89 | 5. 
Start the Streamlit frontend: 90 | ```bash 91 | cd streamlit_app 92 | streamlit run app.py 93 | ``` 94 | 95 | 6. Access the applications: 96 | - Streamlit UI: http://localhost:8501 97 | - FastAPI Docs: http://localhost:8000/docs 98 | - MLflow UI: http://localhost:5000 99 | - MinIO Console: http://localhost:9001 100 | - Kafka UI: http://localhost:8080 101 | 102 | ## Project Structure 103 | 104 | ``` 105 | . 106 | ├── backend/ # FastAPI backend service 107 | │ ├── main.py # Main API application 108 | │ └── requirements.txt # Python dependencies 109 | ├── data/ # Training data 110 | │ └── ford.csv # Sample car data 111 | ├── ml_service/ # ML training and prediction service 112 | │ ├── train.py # Model training script 113 | │ └── main.py # Prediction service 114 | ├── mlflow/ # MLflow service configuration 115 | ├── postgres/ # PostgreSQL initialization scripts 116 | ├── streamlit_app/ # Streamlit frontend application 117 | │ ├── app.py # Main Streamlit application 118 | │ └── requirements.txt # Python dependencies 119 | ├── docker-compose.yml # Docker services configuration 120 | └── README.md 121 | ``` 122 | 123 | ## Features 124 | 125 | - Real-time car price predictions 126 | - Interactive data visualization with Streamlit 127 | - RESTful API with FastAPI 128 | - ML model versioning and tracking with MLflow 129 | - Beautiful charts with Plotly 130 | - Scalable architecture 131 | - API documentation with Swagger UI 132 | 133 | ## Development 134 | 135 | ### Backend Development 136 | 137 | ```bash 138 | cd backend 139 | pip install -r requirements.txt 140 | uvicorn main:app --reload --host 0.0.0.0 --port 8000 141 | ``` 142 | 143 | ### Frontend Development 144 | 145 | ```bash 146 | cd streamlit_app 147 | pip install -r requirements.txt 148 | streamlit run app.py 149 | ``` 150 | 151 | ### ML Service Development 152 | 153 | ```bash 154 | cd ml_service 155 | pip install -r requirements.txt 156 | python train.py 157 | ``` 158 | 159 | ## API Documentation 160 | 161 | The API documentation is
available at `http://localhost:8000/docs` when the backend service is running. The following endpoints are available: 162 | 163 | - `GET /cars`: List all cars 164 | - `POST /cars`: Add a new car 165 | - `GET /cars/{car_id}`: Get car details 166 | - `GET /health`: Check service health 167 | 168 | ## Data Flow 169 | 170 | 1. User submits car data through Streamlit interface 171 | 2. Data is sent to FastAPI backend 172 | 3. ML service makes predictions using MLflow 173 | 4. Results are stored in PostgreSQL 174 | 5. Updated data is displayed in Streamlit UI 175 | 176 | ## Technologies Used 177 | 178 | - **Backend**: 179 | - FastAPI for REST API 180 | - Pydantic for data validation 181 | - SQLAlchemy for database ORM 182 | 183 | - **Frontend**: 184 | - Streamlit for UI 185 | - Plotly for data visualization 186 | - Pandas for data manipulation 187 | 188 | - **ML Pipeline**: 189 | - MLflow for model management 190 | - scikit-learn for ML models 191 | - PostgreSQL for data storage 192 | - MinIO for artifact storage 193 | 194 | ## Contributing 195 | 196 | 1. Fork the repository 197 | 2. Create your feature branch (`git checkout -b feature/amazing-feature`) 198 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`) 199 | 4. Push to the branch (`git push origin feature/amazing-feature`) 200 | 5. 
Open a Pull Request 201 | 202 | 203 | ## Acknowledgments 204 | 205 | - Ford Used Car Dataset 206 | - MLflow for ML model management 207 | - FastAPI for the backend API 208 | - Streamlit for the interactive UI 209 | -------------------------------------------------------------------------------- /add_cars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fiesta variants 4 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 5 | -d '{"model":"Fiesta","year":2020,"price":12500,"transmission":"Manual","mileage":25000,"fuelType":"Petrol","tax":150,"mpg":55.4,"engineSize":1.0}' 6 | echo -e "\nFiesta 1.0 added" 7 | 8 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 9 | -d '{"model":"Fiesta","year":2021,"price":14500,"transmission":"Automatic","mileage":18000,"fuelType":"Petrol","tax":155,"mpg":52.3,"engineSize":1.1}' 10 | echo -e "\nFiesta 1.1 added" 11 | 12 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 13 | -d '{"model":"Fiesta ST","year":2022,"price":22000,"transmission":"Manual","mileage":8000,"fuelType":"Petrol","tax":165,"mpg":45.6,"engineSize":1.5}' 14 | echo -e "\nFiesta ST added" 15 | 16 | # Focus variants 17 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 18 | -d '{"model":"Focus","year":2021,"price":18500,"transmission":"Manual","mileage":15000,"fuelType":"Petrol","tax":165,"mpg":50.2,"engineSize":1.5}' 19 | echo -e "\nFocus 1.5 added" 20 | 21 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 22 | -d '{"model":"Focus","year":2022,"price":21000,"transmission":"Automatic","mileage":12000,"fuelType":"Diesel","tax":155,"mpg":58.8,"engineSize":2.0}' 23 | echo -e "\nFocus Diesel added" 24 | 25 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 26 | -d '{"model":"Focus 
ST","year":2023,"price":34000,"transmission":"Manual","mileage":5000,"fuelType":"Petrol","tax":185,"mpg":35.7,"engineSize":2.3}' 27 | echo -e "\nFocus ST added" 28 | 29 | # Puma variants 30 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 31 | -d '{"model":"Puma","year":2023,"price":22500,"transmission":"Manual","mileage":5000,"fuelType":"Petrol","tax":155,"mpg":52.3,"engineSize":1.0}' 32 | echo -e "\nPuma 1.0 added" 33 | 34 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 35 | -d '{"model":"Puma","year":2022,"price":24500,"transmission":"Automatic","mileage":8000,"fuelType":"Hybrid","tax":145,"mpg":58.9,"engineSize":1.0}' 36 | echo -e "\nPuma Hybrid added" 37 | 38 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 39 | -d '{"model":"Puma ST","year":2023,"price":32000,"transmission":"Manual","mileage":3000,"fuelType":"Petrol","tax":170,"mpg":42.8,"engineSize":1.5}' 40 | echo -e "\nPuma ST added" 41 | 42 | # Kuga variants 43 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 44 | -d '{"model":"Kuga","year":2022,"price":28000,"transmission":"Manual","mileage":12000,"fuelType":"Diesel","tax":155,"mpg":54.3,"engineSize":1.5}' 45 | echo -e "\nKuga Diesel added" 46 | 47 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 48 | -d '{"model":"Kuga","year":2023,"price":32000,"transmission":"Automatic","mileage":8000,"fuelType":"Hybrid","tax":145,"mpg":48.7,"engineSize":2.0}' 49 | echo -e "\nKuga Hybrid added" 50 | 51 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 52 | -d '{"model":"Kuga PHEV","year":2023,"price":38000,"transmission":"Automatic","mileage":5000,"fuelType":"Hybrid","tax":0,"mpg":201.8,"engineSize":2.5}' 53 | echo -e "\nKuga PHEV added" 54 | 55 | # Mustang variants 56 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 57 | -d 
'{"model":"Mustang","year":2021,"price":45000,"transmission":"Manual","mileage":12000,"fuelType":"Petrol","tax":580,"mpg":25.7,"engineSize":5.0}' 58 | echo -e "\nMustang GT Manual added" 59 | 60 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 61 | -d '{"model":"Mustang","year":2022,"price":48000,"transmission":"Automatic","mileage":8000,"fuelType":"Petrol","tax":580,"mpg":24.8,"engineSize":5.0}' 62 | echo -e "\nMustang GT Auto added" 63 | 64 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 65 | -d '{"model":"Mustang Mach-E","year":2023,"price":55000,"transmission":"Automatic","mileage":5000,"fuelType":"Electric","tax":0,"mpg":379.0,"engineSize":0.0}' 66 | echo -e "\nMustang Mach-E added" 67 | 68 | # Explorer variants 69 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 70 | -d '{"model":"Explorer","year":2022,"price":58000,"transmission":"Automatic","mileage":15000,"fuelType":"Petrol","tax":580,"mpg":25.7,"engineSize":3.0}' 71 | echo -e "\nExplorer Petrol added" 72 | 73 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 74 | -d '{"model":"Explorer","year":2023,"price":65000,"transmission":"Automatic","mileage":8000,"fuelType":"Hybrid","tax":150,"mpg":35.3,"engineSize":3.0}' 75 | echo -e "\nExplorer Hybrid added" 76 | 77 | # Ranger variants 78 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 79 | -d '{"model":"Ranger","year":2022,"price":32000,"transmission":"Manual","mileage":20000,"fuelType":"Diesel","tax":290,"mpg":35.3,"engineSize":2.0}' 80 | echo -e "\nRanger Diesel added" 81 | 82 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 83 | -d '{"model":"Ranger Raptor","year":2023,"price":48000,"transmission":"Automatic","mileage":5000,"fuelType":"Diesel","tax":290,"mpg":32.1,"engineSize":3.0}' 84 | echo -e "\nRanger Raptor added" 85 | 86 | # Transit Custom variants 87 | curl 
-X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 88 | -d '{"model":"Transit Custom","year":2022,"price":28000,"transmission":"Manual","mileage":25000,"fuelType":"Diesel","tax":275,"mpg":40.9,"engineSize":2.0}' 89 | echo -e "\nTransit Custom added" 90 | 91 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 92 | -d '{"model":"Transit Custom PHEV","year":2023,"price":42000,"transmission":"Automatic","mileage":8000,"fuelType":"Hybrid","tax":0,"mpg":91.7,"engineSize":1.0}' 93 | echo -e "\nTransit Custom PHEV added" 94 | 95 | # Tourneo variants 96 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 97 | -d '{"model":"Tourneo Custom","year":2022,"price":38000,"transmission":"Automatic","mileage":15000,"fuelType":"Diesel","tax":275,"mpg":38.2,"engineSize":2.0}' 98 | echo -e "\nTourneo Custom added" 99 | 100 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 101 | -d '{"model":"Tourneo Connect","year":2023,"price":32000,"transmission":"Manual","mileage":8000,"fuelType":"Diesel","tax":155,"mpg":45.6,"engineSize":1.5}' 102 | echo -e "\nTourneo Connect added" 103 | 104 | echo -e "\nAll cars have been added successfully!" 
-------------------------------------------------------------------------------- /backend/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, HTTPException, Depends 2 | from fastapi.middleware.cors import CORSMiddleware 3 | from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime 4 | from sqlalchemy.ext.declarative import declarative_base 5 | from sqlalchemy.orm import sessionmaker, Session 6 | from pydantic import BaseModel, Field, validator 7 | from typing import Optional, List, Dict, Any 8 | import os 9 | from datetime import datetime 10 | from kafka import KafkaProducer, KafkaConsumer 11 | import json 12 | import logging 13 | from contextlib import contextmanager 14 | import threading 15 | import time 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | app = FastAPI(title="Car Price Prediction API") 21 | 22 | app.add_middleware( 23 | CORSMiddleware, 24 | allow_origins=["*"], 25 | allow_credentials=True, 26 | allow_methods=["*"], 27 | allow_headers=["*"], 28 | ) 29 | 30 | SQLALCHEMY_DATABASE_URL = "postgresql://postgres:postgres123@localhost:5432/cars_db" 31 | engine = create_engine(SQLALCHEMY_DATABASE_URL) 32 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) 33 | Base = declarative_base() 34 | 35 | KAFKA_BOOTSTRAP_SERVERS = "localhost:9092" 36 | KAFKA_TOPIC_LISTINGS = "cars-db.public.listings" 37 | KAFKA_TOPIC_PREDICTIONS = "cars.public.predictions" 38 | 39 | class Car(Base): 40 | __tablename__ = "listings" 41 | 42 | id = Column(Integer, primary_key=True, index=True) 43 | model = Column(String) 44 | year = Column(Integer) 45 | price = Column(Float) 46 | transmission = Column(String) 47 | mileage = Column(Integer) 48 | fuelType = Column(String) 49 | tax = Column(Float) 50 | mpg = Column(Float) 51 | engineSize = Column(Float) 52 | predicted_price = Column(Float, nullable=True) 53 | created_at = 
Column(DateTime, default=datetime.utcnow) 54 | updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) 55 | 56 | class CarBase(BaseModel): 57 | model: str = Field(..., example="Fiesta") 58 | year: int = Field(..., ge=1900, le=datetime.now().year, example=2019) 59 | price: float = Field(..., gt=0, example=12000) 60 | transmission: str = Field(..., example="Manual") 61 | mileage: int = Field(..., ge=0, example=25000) 62 | fuelType: str = Field(..., example="Petrol") 63 | tax: float = Field(..., ge=0, example=145) 64 | mpg: float = Field(..., ge=0, example=55.4) 65 | engineSize: float = Field(..., gt=0, example=1.0) 66 | 67 | @validator('transmission') 68 | def validate_transmission(cls, v): 69 | allowed = {'Manual', 'Automatic', 'Semi-Auto'} 70 | if v not in allowed: 71 | raise ValueError(f'transmission must be one of {allowed}') 72 | return v 73 | 74 | @validator('fuelType') 75 | def validate_fuel_type(cls, v): 76 | allowed = {'Petrol', 'Diesel', 'Hybrid', 'Electric'} 77 | if v not in allowed: 78 | raise ValueError(f'fuelType must be one of {allowed}') 79 | return v 80 | 81 | class CarCreate(CarBase): 82 | pass 83 | 84 | class CarResponse(CarBase): 85 | id: int 86 | predicted_price: Optional[float] = None 87 | created_at: datetime 88 | updated_at: datetime 89 | 90 | class Config: 91 | from_attributes = True 92 | 93 | @contextmanager 94 | def get_db(): 95 | db = SessionLocal() 96 | try: 97 | yield db 98 | finally: 99 | db.close() 100 | 101 | class KafkaManager: 102 | _producer = None 103 | _consumer = None 104 | _consumer_thread = None 105 | _running = False 106 | 107 | @classmethod 108 | def get_producer(cls): 109 | if cls._producer is None: 110 | cls._producer = KafkaProducer( 111 | bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, 112 | value_serializer=lambda x: json.dumps(x).encode('utf-8') 113 | ) 114 | return cls._producer 115 | 116 | @classmethod 117 | def start_consumer(cls): 118 | if cls._consumer_thread is None: 119 | cls._running = True 
120 | cls._consumer_thread = threading.Thread(target=cls._consume_predictions) 121 | cls._consumer_thread.daemon = True 122 | cls._consumer_thread.start() 123 | 124 | @classmethod 125 | def _consume_predictions(cls): 126 | consumer = KafkaConsumer( 127 | KAFKA_TOPIC_PREDICTIONS, 128 | bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, 129 | value_deserializer=lambda x: json.loads(x.decode('utf-8')), 130 | auto_offset_reset='latest', 131 | enable_auto_commit=True, 132 | group_id='backend-consumer' 133 | ) 134 | 135 | while cls._running: 136 | try: 137 | messages = consumer.poll(timeout_ms=1000) 138 | for topic_partition, msgs in messages.items(): 139 | for message in msgs: 140 | cls._handle_prediction(message.value) 141 | except Exception as e: 142 | logger.error(f"Error consuming messages: {str(e)}") 143 | time.sleep(5) 144 | 145 | consumer.close() 146 | 147 | @classmethod 148 | def _handle_prediction(cls, prediction_data): 149 | try: 150 | with get_db() as db: 151 | car_id = prediction_data.get('id') 152 | predicted_price = prediction_data.get('predicted_price') 153 | 154 | if car_id and predicted_price: 155 | car = db.query(Car).filter(Car.id == car_id).first() 156 | if car: 157 | car.predicted_price = predicted_price 158 | db.commit() 159 | logger.info(f"Updated prediction for car {car_id}: £{predicted_price:,.2f}") 160 | except Exception as e: 161 | logger.error(f"Error handling prediction: {str(e)}") 162 | 163 | # Routes 164 | @app.get("/cars", response_model=List[CarResponse]) 165 | def get_cars(): 166 | with get_db() as db: 167 | cars = db.query(Car).all() 168 | return cars 169 | 170 | @app.post("/cars", response_model=CarResponse) 171 | def create_car(car: CarCreate): 172 | with get_db() as db: 173 | db_car = Car(**car.dict()) 174 | db.add(db_car) 175 | db.commit() 176 | db.refresh(db_car) 177 | 178 | try: 179 | producer = KafkaManager.get_producer() 180 | producer.send(KAFKA_TOPIC_LISTINGS, db_car.__dict__) 181 | producer.flush() 182 | logger.info(f"Sent car 
{db_car.id} to Kafka") 183 | except Exception as e: 184 | logger.error(f"Error sending to Kafka: {str(e)}") 185 | 186 | return db_car 187 | 188 | @app.get("/cars/{car_id}", response_model=CarResponse) 189 | def get_car(car_id: int): 190 | with get_db() as db: 191 | car = db.query(Car).filter(Car.id == car_id).first() 192 | if car is None: 193 | raise HTTPException(status_code=404, detail="Car not found") 194 | return car 195 | 196 | @app.get("/health") 197 | def health_check(): 198 | return { 199 | "status": "healthy", 200 | "timestamp": datetime.utcnow().isoformat(), 201 | "database": "connected" if engine.connect() else "disconnected" 202 | } 203 | 204 | @app.on_event("startup") 205 | async def startup_event(): 206 | Base.metadata.create_all(bind=engine) 207 | KafkaManager.start_consumer() 208 | logger.info("Application started, Kafka consumer running") 209 | 210 | @app.on_event("shutdown") 211 | async def shutdown_event(): 212 | KafkaManager._running = False 213 | if KafkaManager._consumer_thread: 214 | KafkaManager._consumer_thread.join(timeout=5) 215 | if KafkaManager._producer: 216 | KafkaManager._producer.close() 217 | logger.info("Application shutdown complete") 218 | 219 | if __name__ == "__main__": 220 | import uvicorn 221 | uvicorn.run(app, host="0.0.0.0", port=8000) -------------------------------------------------------------------------------- /ml_service/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import mlflow 4 | import pandas as pd 5 | import numpy as np 6 | from kafka import KafkaConsumer, KafkaProducer 7 | from sklearn.preprocessing import StandardScaler, LabelEncoder 8 | import pickle 9 | from dotenv import load_dotenv 10 | import logging 11 | from typing import Optional, Dict, Any 12 | import time 13 | from pathlib import Path 14 | import boto3 15 | from botocore.client import Config 16 | import warnings 17 | 18 | logging.basicConfig( 19 | level=logging.INFO, 20 | 
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 21 | ) 22 | logger = logging.getLogger(__name__) 23 | 24 | warnings.filterwarnings('ignore', category=UserWarning) 25 | warnings.filterwarnings('ignore', category=FutureWarning) 26 | 27 | class CarPricePredictor: 28 | def __init__(self): 29 | self.model = None 30 | self.label_encoders = None 31 | self.scaler = None 32 | self.consumer = None 33 | self.producer = None 34 | 35 | def setup_minio(self): 36 | """Configure MinIO credentials""" 37 | os.environ['AWS_ACCESS_KEY_ID'] = 'minio' 38 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123' 39 | os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://localhost:9000' 40 | 41 | boto3.client( 42 | 's3', 43 | endpoint_url='http://localhost:9000', 44 | aws_access_key_id='minio', 45 | aws_secret_access_key='minio123', 46 | config=Config(signature_version='s3v4'), 47 | region_name='us-east-1' 48 | ) 49 | logger.info("MinIO credentials configured") 50 | 51 | def load_model_and_artifacts(self) -> None: 52 | """Load the ML model and preprocessing artifacts""" 53 | try: 54 | logger.info("Loading model and artifacts...") 55 | 56 | self.setup_minio() 57 | 58 | current_dir = Path(__file__).parent 59 | 60 | encoder_path = current_dir / 'label_encoders.pkl' 61 | with open(encoder_path, "rb") as f: 62 | self.label_encoders = pickle.load(f) 63 | logger.info("Loaded label encoders") 64 | 65 | scaler_path = current_dir / 'scaler.pkl' 66 | with open(scaler_path, "rb") as f: 67 | self.scaler = pickle.load(f) 68 | logger.info("Loaded scaler") 69 | 70 | mlflow.set_tracking_uri("http://localhost:5000") 71 | 72 | max_retries = 5 73 | for i in range(max_retries): 74 | try: 75 | runs = mlflow.search_runs(experiment_ids=["1"]) 76 | if len(runs) > 0: 77 | break 78 | except Exception as e: 79 | if i == max_retries - 1: 80 | raise 81 | logger.warning(f"Failed to connect to MLflow, retrying... 
({i+1}/{max_retries})") 82 | time.sleep(5) 83 | 84 | if len(runs) == 0: 85 | raise Exception("No runs found in MLflow") 86 | 87 | latest_run = runs.sort_values("start_time", ascending=False).iloc[0] 88 | run_id = latest_run.run_id 89 | 90 | logger.info(f"Loading model from run {run_id}") 91 | self.model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model") 92 | logger.info("Model loaded successfully") 93 | 94 | except Exception as e: 95 | logger.error(f"Error loading model and artifacts: {str(e)}") 96 | raise 97 | 98 | def setup_kafka(self) -> None: 99 | """Initialize Kafka consumer and producer""" 100 | try: 101 | KAFKA_BOOTSTRAP_SERVERS = os.getenv('KAFKA_BOOTSTRAP_SERVERS', 'localhost:9092') 102 | 103 | self.consumer = KafkaConsumer( 104 | 'cars-db.public.listings', 105 | bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, 106 | auto_offset_reset='earliest', 107 | enable_auto_commit=True, 108 | group_id='car_price_predictor', 109 | value_deserializer=lambda x: json.loads(x.decode('utf-8')), 110 | consumer_timeout_ms=1000 111 | ) 112 | 113 | self.producer = KafkaProducer( 114 | bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, 115 | value_serializer=lambda x: json.dumps(x).encode('utf-8'), 116 | retries=5 117 | ) 118 | 119 | logger.info("Kafka consumer and producer initialized") 120 | 121 | except Exception as e: 122 | logger.error(f"Error setting up Kafka: {str(e)}") 123 | raise 124 | 125 | def preprocess_data(self, data: Dict[str, Any]) -> pd.DataFrame: 126 | """Preprocess the input data""" 127 | try: 128 | df = pd.DataFrame([data]) 129 | 130 | numeric_columns = ['year', 'mileage', 'tax', 'mpg', 'engineSize'] 131 | for col in numeric_columns: 132 | df[col] = df[col].astype(float) 133 | 134 | categorical_columns = ['model', 'transmission', 'fuelType'] 135 | for column in categorical_columns: 136 | df[column] = self.label_encoders[column].transform(df[column]) 137 | 138 | df[numeric_columns] = self.scaler.transform(df[numeric_columns]) 139 | 140 | return df 141 | 142 | except 
Exception as e: 143 | logger.error(f"Error preprocessing data: {str(e)}") 144 | raise 145 | 146 | def process_message(self, message: Any) -> Optional[Dict[str, Any]]: 147 | """Process incoming Kafka message and return prediction""" 148 | try: 149 | data = message.value if isinstance(message.value, dict) else json.loads(message.value) 150 | 151 | if 'payload' in data and 'after' in data['payload']: 152 | car_data = data['payload']['after'] 153 | else: 154 | car_data = data 155 | 156 | required_fields = ['model', 'year', 'transmission', 'mileage', 157 | 'fuelType', 'tax', 'mpg', 'engineSize'] 158 | 159 | for field in required_fields: 160 | if field not in car_data: 161 | logger.warning(f"Missing required field: {field}") 162 | return None 163 | 164 | processed_data = self.preprocess_data(car_data) 165 | 166 | prediction = self.model.predict(processed_data)[0] 167 | 168 | car_data['predicted_price'] = float(prediction) 169 | car_data['prediction_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S') 170 | 171 | return car_data 172 | 173 | except Exception as e: 174 | logger.error(f"Error processing message: {str(e)}") 175 | return None 176 | 177 | def run(self) -> None: 178 | """Main processing loop""" 179 | logger.info("Starting to consume messages...") 180 | 181 | while True: 182 | try: 183 | messages = self.consumer.poll(timeout_ms=1000) 184 | 185 | for topic_partition, msgs in messages.items(): 186 | for message in msgs: 187 | result = self.process_message(message) 188 | 189 | if result: 190 | self.producer.send('cars.public.predictions', value=result) 191 | self.producer.flush() 192 | 193 | logger.info( 194 | f"Processed car: {result.get('model', 'Unknown')} " 195 | f"({result.get('year', 'Unknown')}). 
" 196 | f"Predicted price: £{result['predicted_price']:,.2f}" 197 | ) 198 | 199 | time.sleep(0.1) 200 | 201 | except KeyboardInterrupt: 202 | logger.info("Stopping the service...") 203 | break 204 | 205 | except Exception as e: 206 | logger.error(f"Error in processing loop: {str(e)}") 207 | time.sleep(5) 208 | 209 | try: 210 | self.consumer.close() 211 | self.producer.close() 212 | logger.info("Kafka connections closed") 213 | except Exception as e: 214 | logger.error(f"Error during cleanup: {str(e)}") 215 | 216 | logger.info("Service stopped") 217 | 218 | def main(): 219 | """Main entry point""" 220 | try: 221 | load_dotenv() 222 | 223 | predictor = CarPricePredictor() 224 | 225 | predictor.load_model_and_artifacts() 226 | 227 | predictor.setup_kafka() 228 | 229 | predictor.run() 230 | 231 | except Exception as e: 232 | logger.error(f"Application failed: {str(e)}") 233 | raise 234 | 235 | finally: 236 | logger.info("Application shutdown complete") 237 | 238 | if __name__ == "__main__": 239 | main() -------------------------------------------------------------------------------- /streamlit_app/app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import requests 4 | import plotly.express as px 5 | from datetime import datetime 6 | import time 7 | 8 | API_URL = "http://localhost:8000" 9 | 10 | st.set_page_config( 11 | page_title="Car Price Predictor", 12 | page_icon="🚗", 13 | layout="wide" 14 | ) 15 | 16 | st.markdown(""" 17 | 41 | """, unsafe_allow_html=True) 42 | 43 | @st.cache_data(ttl=300) 44 | def fetch_cars(): 45 | """Fetch cars data from API with caching""" 46 | try: 47 | st.write("Fetching data from API...") 48 | response = requests.get(f"{API_URL}/cars") 49 | st.write(f"Response status: {response.status_code}") 50 | if response.status_code == 200: 51 | data = pd.DataFrame(response.json()) 52 | st.write(f"Data fetched successfully. 
Shape: {data.shape}") 53 | return data 54 | st.write("Failed to fetch data") 55 | return pd.DataFrame() 56 | except Exception as e: 57 | st.write(f"Error fetching data: {str(e)}") 58 | return pd.DataFrame() 59 | 60 | @st.cache_data(ttl=600) 61 | def get_unique_values(df, column): 62 | """Get unique values from DataFrame column""" 63 | try: 64 | return sorted(df[column].unique().tolist()) if not df.empty else [] 65 | except Exception as e: 66 | st.write(f"Error getting unique values: {str(e)}") 67 | return [] 68 | 69 | def create_plot(df, plot_type): 70 | """Create different types of plots based on the data""" 71 | try: 72 | if df.empty: 73 | return None 74 | 75 | if plot_type == "distribution": 76 | fig = px.histogram( 77 | df, 78 | x="price", 79 | title="Price Distribution", 80 | nbins=30, 81 | labels={"price": "Price ($)", "count": "Number of Cars"} 82 | ) 83 | fig.update_layout(showlegend=False) 84 | 85 | elif plot_type == "scatter": 86 | fig = px.scatter( 87 | df[df['price'].notna()], 88 | x="mileage", 89 | y="price", 90 | color="model", 91 | size="engineSize", 92 | title="Price vs Mileage by Model", 93 | labels={ 94 | "mileage": "Mileage (miles)", 95 | "price": "Price ($)", 96 | "model": "Model", 97 | "engineSize": "Engine Size (L)" 98 | } 99 | ) 100 | 101 | elif plot_type == "box": 102 | fig = px.box( 103 | df[df['price'].notna()], 104 | x="year", 105 | y="price", 106 | title="Price Distribution by Year", 107 | labels={"year": "Year", "price": "Price ($)"} 108 | ) 109 | 110 | elif plot_type == "prediction": 111 | valid_data = df[df['price'].notna() & df['predicted_price'].notna()] 112 | if not valid_data.empty: 113 | fig = px.scatter( 114 | valid_data, 115 | x="price", 116 | y="predicted_price", 117 | color="model", 118 | title="Predicted vs Actual Price", 119 | labels={ 120 | "price": "Actual Price ($)", 121 | "predicted_price": "Predicted Price ($)", 122 | "model": "Model" 123 | } 124 | ) 125 | 126 | min_val = min(valid_data['price'].min(), 
valid_data['predicted_price'].min()) 127 | max_val = max(valid_data['price'].max(), valid_data['predicted_price'].max()) 128 | fig.add_shape( 129 | type='line', 130 | line=dict(dash='dash', color='gray'), 131 | x0=min_val, 132 | y0=min_val, 133 | x1=max_val, 134 | y1=max_val 135 | ) 136 | return fig 137 | return None 138 | 139 | return fig 140 | except Exception as e: 141 | st.write(f"Error creating plot: {str(e)}") 142 | return None 143 | 144 | st.title("🚗 Car Price Predictor") 145 | 146 | with st.sidebar: 147 | st.header("Add New Car") 148 | 149 | with st.form("car_form", clear_on_submit=True): 150 | model = st.text_input( 151 | "Model", 152 | help="Enter the car model (e.g., Ford Fiesta)" 153 | ) 154 | year = st.number_input( 155 | "Year", 156 | min_value=1900, 157 | max_value=datetime.now().year, 158 | value=2020, 159 | help="Select the manufacturing year" 160 | ) 161 | transmission = st.selectbox( 162 | "Transmission", 163 | ["Manual", "Automatic", "Semi-Auto"], 164 | help="Select the transmission type" 165 | ) 166 | fuel_type = st.selectbox( 167 | "Fuel Type", 168 | ["Petrol", "Diesel", "Hybrid", "Electric"], 169 | help="Select the fuel type" 170 | ) 171 | mileage = st.number_input( 172 | "Mileage", 173 | min_value=0, 174 | value=10000, 175 | help="Enter the total mileage in miles" 176 | ) 177 | engine_size = st.number_input( 178 | "Engine Size (L)", 179 | min_value=0.1, 180 | max_value=10.0, 181 | value=1.5, 182 | step=0.1, 183 | help="Enter the engine size in liters" 184 | ) 185 | tax = st.number_input( 186 | "Tax ($)", 187 | min_value=0, 188 | value=150, 189 | help="Enter the annual road tax" 190 | ) 191 | mpg = st.number_input( 192 | "MPG", 193 | min_value=0.0, 194 | value=50.0, 195 | help="Enter the miles per gallon" 196 | ) 197 | price = st.number_input( 198 | "Price ($)", 199 | min_value=0, 200 | value=15000, 201 | help="Enter the car price" 202 | ) 203 | 204 | submitted = st.form_submit_button("Add Car") 205 | 206 | if submitted and model: 207 | 
confirmation = st.checkbox("I confirm all details are correct") 208 | if confirmation: 209 | car_data = { 210 | "model": model, 211 | "year": year, 212 | "transmission": transmission, 213 | "fuelType": fuel_type, 214 | "mileage": mileage, 215 | "engineSize": engine_size, 216 | "tax": tax, 217 | "mpg": mpg, 218 | "price": price 219 | } 220 | 221 | with st.spinner('Adding car...'): 222 | try: 223 | st.write(f"Submitting data: {car_data}") 224 | response = requests.post(f"{API_URL}/cars", json=car_data) 225 | st.write(f"Response status: {response.status_code}") 226 | if response.status_code == 200: 227 | st.success("Car added successfully!") 228 | st.balloons() 229 | fetch_cars.clear() 230 | time.sleep(1) 231 | else: 232 | st.error(f"Failed to add car: {response.text}") 233 | except Exception as e: 234 | st.error(f"Failed to connect to server: {str(e)}") 235 | 236 | df = fetch_cars() 237 | 238 | if not df.empty: 239 | st.subheader("📊 Dashboard Overview") 240 | col1, col2, col3, col4 = st.columns(4) 241 | 242 | with col1: 243 | st.metric("Total Cars", len(df)) 244 | 245 | with col2: 246 | valid_prices = df['price'].dropna() 247 | if not valid_prices.empty: 248 | st.metric("Average Price", f"${valid_prices.mean():,.2f}") 249 | else: 250 | st.metric("Average Price", "N/A") 251 | 252 | with col3: 253 | if 'predicted_price' in df.columns: 254 | valid_predictions = df['predicted_price'].dropna() 255 | if not valid_predictions.empty: 256 | st.metric("Average Predicted Price", 257 | f"${valid_predictions.mean():,.2f}") 258 | else: 259 | st.metric("Average Predicted Price", "Pending") 260 | 261 | with col4: 262 | valid_prices = df['price'].dropna() 263 | if not valid_prices.empty: 264 | st.metric("Price Range", 265 | f"${valid_prices.min():,.0f} - ${valid_prices.max():,.0f}") 266 | else: 267 | st.metric("Price Range", "N/A") 268 | 269 | st.subheader("📈 Price Analysis") 270 | tab1, tab2, tab3, tab4 = st.tabs([ 271 | "Price Distribution", 272 | "Price vs Mileage", 273 | "Price 
by Year", 274 | "Prediction Analysis" 275 | ]) 276 | 277 | with tab1: 278 | fig = create_plot(df, "distribution") 279 | if fig: 280 | st.plotly_chart(fig, use_container_width=True) 281 | 282 | with tab2: 283 | fig = create_plot(df, "scatter") 284 | if fig: 285 | st.plotly_chart(fig, use_container_width=True) 286 | 287 | with tab3: 288 | fig = create_plot(df, "box") 289 | if fig: 290 | st.plotly_chart(fig, use_container_width=True) 291 | 292 | with tab4: 293 | fig = create_plot(df, "prediction") 294 | if fig: 295 | st.plotly_chart(fig, use_container_width=True) 296 | 297 | st.subheader("🚗 Car Listings") 298 | 299 | col1, col2, col3 = st.columns(3) 300 | with col1: 301 | models = get_unique_values(df, 'model') 302 | model_filter = st.multiselect("Filter by Model", options=models) 303 | 304 | with col2: 305 | min_year = int(df['year'].min()) 306 | max_year = int(df['year'].max()) 307 | if min_year == max_year: 308 | year_filter = (min_year, min_year) 309 | st.info(f"Only cars from {min_year}") 310 | else: 311 | year_filter = st.slider( 312 | "Filter by Year", 313 | min_value=min_year, 314 | max_value=max_year, 315 | value=(min_year, max_year) 316 | ) 317 | 318 | with col3: 319 | fuel_types = get_unique_values(df, 'fuelType') 320 | fuel_filter = st.multiselect("Filter by Fuel Type", options=fuel_types) 321 | 322 | filtered_df = df.copy() 323 | if model_filter: 324 | filtered_df = filtered_df[filtered_df['model'].isin(model_filter)] 325 | filtered_df = filtered_df[ 326 | (filtered_df['year'] >= year_filter[0]) & 327 | (filtered_df['year'] <= year_filter[1]) 328 | ] 329 | if fuel_filter: 330 | filtered_df = filtered_df[filtered_df['fuelType'].isin(fuel_filter)] 331 | 332 | if not filtered_df.empty: 333 | display_df = filtered_df[[ 334 | 'model', 'year', 'price', 'predicted_price', 'mileage', 335 | 'transmission', 'fuelType', 'mpg', 'engineSize' 336 | ]].copy() 337 | 338 | st.dataframe( 339 | display_df.style.format({ 340 | 'price': lambda x: f'${x:,.2f}' if pd.notnull(x) 
else 'N/A', 341 | 'predicted_price': lambda x: f'${x:,.2f}' if pd.notnull(x) else 'Pending', 342 | 'engineSize': lambda x: f'{x:.1f}L' if pd.notnull(x) else 'N/A', 343 | 'mpg': lambda x: f'{x:.1f}' if pd.notnull(x) else 'N/A', 344 | 'mileage': lambda x: f'{x:,.0f}' if pd.notnull(x) else 'N/A' 345 | }), 346 | hide_index=True, 347 | use_container_width=True 348 | ) 349 | 350 | csv = filtered_df.to_csv(index=False) 351 | st.download_button( 352 | "📥 Download Data", 353 | csv, 354 | "car_data.csv", 355 | "text/csv", 356 | key='download-csv' 357 | ) 358 | else: 359 | st.info("No cars match the selected filters") 360 | else: 361 | st.info("No cars in the database. Add some cars to get started!") 362 | 363 | st.markdown("---") 364 | st.markdown( 365 | """ 366 |