├── backend ├── requirements.txt ├── __pycache__ │ └── main.cpython-310.pyc ├── Dockerfile └── main.py ├── streamlit_app ├── requirements.txt ├── Dockerfile └── app.py ├── ml_service ├── requirements.txt ├── tests │ └── test_predictor.py ├── train.py └── main.py ├── mlflow └── Dockerfile ├── debezium-connector-config.json ├── postgres └── init.sql ├── docker-compose.yml ├── README.md └── add_cars.sh /backend/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.104.1 2 | uvicorn==0.24.0 3 | sqlalchemy==2.0.23 4 | psycopg2-binary==2.9.9 5 | pydantic==2.5.2 -------------------------------------------------------------------------------- /streamlit_app/requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit==1.31.0 2 | pandas==2.0.3 3 | requests==2.31.0 4 | plotly==5.18.0 5 | scikit-learn==1.3.0 -------------------------------------------------------------------------------- /backend/__pycache__/main.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Stefen-Taime/car-price-predictor/HEAD/backend/__pycache__/main.cpython-310.pyc -------------------------------------------------------------------------------- /ml_service/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==2.0.3 2 | scikit-learn==1.3.0 3 | mlflow==2.8.0 4 | kafka-python==2.0.2 5 | psycopg2-binary==2.9.9 6 | boto3==1.28.0 7 | python-dotenv==1.0.0 8 | pytest==7.4.0 9 | numpy>=1.24.0 10 | requests>=2.31.0 -------------------------------------------------------------------------------- /mlflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | RUN apt-get update && \ 4 | apt-get install -y \ 5 | curl \ 6 | wget \ 7 | mc \ 8 | && apt-get clean \ 9 | && rm -rf /var/lib/apt/lists/* 10 
| 11 | RUN pip install --no-cache-dir \ 12 | mlflow==2.8.1 \ 13 | psycopg2-binary \ 14 | boto3 \ 15 | pymysql 16 | 17 | EXPOSE 5000 18 | 19 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 20 | CMD curl --fail http://localhost:5000/health || exit 1 -------------------------------------------------------------------------------- /backend/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | WORKDIR /app 4 | 5 | # Install system dependencies 6 | RUN apt-get update && \ 7 | apt-get install -y \ 8 | curl \ 9 | && apt-get clean \ 10 | && rm -rf /var/lib/apt/lists/* 11 | 12 | COPY requirements.txt . 13 | RUN pip install --no-cache-dir -r requirements.txt 14 | 15 | COPY . . 16 | 17 | EXPOSE 8000 18 | 19 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 20 | CMD curl --fail http://localhost:8000/health || exit 1 21 | 22 | CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] -------------------------------------------------------------------------------- /streamlit_app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | WORKDIR /app 4 | 5 | RUN apt-get update && \ 6 | apt-get install -y \ 7 | wget \ 8 | && apt-get clean \ 9 | && rm -rf /var/lib/apt/lists/* 10 | 11 | COPY requirements.txt . 12 | RUN pip install --no-cache-dir -r requirements.txt 13 | 14 | COPY . . 
15 | 16 | EXPOSE 8501 17 | 18 | HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ 19 | CMD wget --no-verbose --tries=1 --spider http://localhost:8501 || exit 1 20 | 21 | ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"] -------------------------------------------------------------------------------- /debezium-connector-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cars-connector", 3 | "config": { 4 | "connector.class": "io.debezium.connector.postgresql.PostgresConnector", 5 | "tasks.max": "1", 6 | "database.hostname": "postgres", 7 | "database.port": "5432", 8 | "database.user": "postgres", 9 | "database.password": "postgres123", 10 | "database.dbname": "cars_db", 11 | "database.server.name": "cars", 12 | "topic.prefix": "cars-db", 13 | "schema.include.list": "public", 14 | "table.include.list": "public.listings", 15 | "plugin.name": "pgoutput" 16 | } 17 | } -------------------------------------------------------------------------------- /ml_service/tests/test_predictor.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pandas as pd 3 | import numpy as np 4 | from main import CarPricePredictor 5 | 6 | @pytest.fixture 7 | def predictor(): 8 | return CarPricePredictor() 9 | 10 | def test_preprocess_data(predictor): 11 | predictor.load_model_and_artifacts() 12 | 13 | test_data = { 14 | "model": "Fiesta", 15 | "year": 2020, 16 | "transmission": "Manual", 17 | "fuelType": "Petrol", 18 | "mileage": 10000, 19 | "tax": 150, 20 | "mpg": 50.0, 21 | "engineSize": 1.0 22 | } 23 | 24 | result = predictor.preprocess_data(test_data) 25 | 26 | assert isinstance(result, pd.DataFrame) 27 | assert not result.isnull().values.any() -------------------------------------------------------------------------------- /postgres/init.sql: 
-------------------------------------------------------------------------------- 1 | -- Create databases 2 | CREATE DATABASE mlflow; 3 | CREATE DATABASE cars_db; 4 | 5 | -- Connect to mlflow database and set up permissions 6 | \c mlflow; 7 | GRANT ALL PRIVILEGES ON DATABASE mlflow TO postgres; 8 | CREATE SCHEMA IF NOT EXISTS public; 9 | GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO postgres; 10 | GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO postgres; 11 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL PRIVILEGES ON TABLES TO postgres; 12 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL PRIVILEGES ON SEQUENCES TO postgres; 13 | 14 | -- Connect to cars_db and set up the listings table and permissions 15 | \c cars_db; 16 | 17 | -- Create the listings table 18 | CREATE TABLE IF NOT EXISTS listings ( 19 | id SERIAL PRIMARY KEY, 20 | model VARCHAR(100), 21 | year INTEGER, 22 | price DECIMAL, 23 | transmission VARCHAR(50), 24 | mileage INTEGER, 25 | fuelType VARCHAR(50), 26 | tax DECIMAL, 27 | mpg DECIMAL, 28 | engineSize DECIMAL, 29 | predicted_price DECIMAL 30 | ); 31 | 32 | -- Grant permissions for cars_db 33 | GRANT ALL PRIVILEGES ON DATABASE cars_db TO postgres; 34 | GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO postgres; 35 | GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO postgres; 36 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL PRIVILEGES ON TABLES TO postgres; 37 | ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL PRIVILEGES ON SEQUENCES TO postgres; -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | # PostgreSQL - Source Database 5 | postgres: 6 | image: postgres:15 7 | environment: 8 | POSTGRES_DB: cars_db 9 | POSTGRES_USER: postgres 10 | POSTGRES_PASSWORD: postgres123 11 | command: ["postgres", "-c", "wal_level=logical"] 12 | 
ports: 13 | - "5432:5432" 14 | volumes: 15 | - postgres_data:/var/lib/postgresql/data 16 | - ./postgres/init.sql:/docker-entrypoint-initdb.d/init.sql 17 | healthcheck: 18 | test: ["CMD-SHELL", "pg_isready -U postgres"] 19 | interval: 10s 20 | timeout: 5s 21 | retries: 5 22 | 23 | 24 | # Zookeeper 25 | zookeeper: 26 | image: confluentinc/cp-zookeeper:7.4.0 27 | environment: 28 | ZOOKEEPER_CLIENT_PORT: 2181 29 | ZOOKEEPER_TICK_TIME: 2000 30 | ports: 31 | - "2181:2181" 32 | healthcheck: 33 | test: echo srvr | nc zookeeper 2181 || exit 1 34 | interval: 10s 35 | timeout: 5s 36 | retries: 5 37 | 38 | # Kafka 39 | kafka: 40 | image: confluentinc/cp-kafka:7.4.0 41 | depends_on: 42 | zookeeper: 43 | condition: service_healthy 44 | ports: 45 | - "9092:9092" 46 | environment: 47 | KAFKA_BROKER_ID: 1 48 | KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181 49 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092 50 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 51 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 52 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 53 | healthcheck: 54 | test: ["CMD-SHELL", "kafka-topics --bootstrap-server localhost:9092 --list"] 55 | interval: 30s 56 | timeout: 10s 57 | retries: 3 58 | 59 | # Debezium Connect 60 | connect: 61 | image: debezium/connect:2.4 62 | depends_on: 63 | kafka: 64 | condition: service_healthy 65 | postgres: 66 | condition: service_healthy 67 | ports: 68 | - "8083:8083" 69 | environment: 70 | BOOTSTRAP_SERVERS: kafka:29092 71 | GROUP_ID: "1" 72 | CONFIG_STORAGE_TOPIC: connect_configs 73 | OFFSET_STORAGE_TOPIC: connect_offsets 74 | STATUS_STORAGE_TOPIC: connect_statuses 75 | KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter 76 | VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter 77 | healthcheck: 78 | test: ["CMD", "curl", "-f", "http://localhost:8083/"] 79 | interval: 30s 80 | timeout: 10s 81 | retries: 3 82 | 83 | # MinIO (S3-compatible storage) 84 | 
minio: 85 | image: minio/minio 86 | ports: 87 | - "9000:9000" 88 | - "9001:9001" 89 | environment: 90 | MINIO_ROOT_USER: minio 91 | MINIO_ROOT_PASSWORD: minio123 92 | command: server /data --console-address ":9001" 93 | volumes: 94 | - minio_data:/data 95 | healthcheck: 96 | test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] 97 | interval: 30s 98 | timeout: 20s 99 | retries: 3 100 | 101 | # MLflow 102 | mlflow: 103 | build: 104 | context: ./mlflow 105 | dockerfile: Dockerfile 106 | ports: 107 | - "5000:5000" 108 | environment: 109 | MLFLOW_S3_ENDPOINT_URL: http://minio:9000 110 | AWS_ACCESS_KEY_ID: minio 111 | AWS_SECRET_ACCESS_KEY: minio123 112 | depends_on: 113 | minio: 114 | condition: service_healthy 115 | postgres: 116 | condition: service_healthy 117 | healthcheck: 118 | test: curl --fail http://localhost:5000/health || exit 1 119 | interval: 30s 120 | timeout: 10s 121 | retries: 5 122 | start_period: 30s 123 | command: | 124 | sh -c ' 125 | mc config host add minio http://minio:9000 minio minio123 && 126 | mc mb minio/mlflow || true && 127 | mlflow server \ 128 | --backend-store-uri postgresql://postgres:postgres123@postgres:5432/mlflow \ 129 | --default-artifact-root s3://mlflow/ \ 130 | --host 0.0.0.0 \ 131 | --port 5000 \ 132 | --serve-artifacts 133 | ' 134 | 135 | # Redpanda Console (Kafka Web UI) 136 | kafka-ui: 137 | image: redpandadata/console:v2.4.3 138 | ports: 139 | - "8080:8080" 140 | depends_on: 141 | kafka: 142 | condition: service_healthy 143 | environment: 144 | KAFKA_BROKERS: kafka:29092 145 | SERVER_LISTENPORT: 8080 146 | AUTH_PROVIDER: none 147 | CONNECT_ENABLED: "true" 148 | CONNECT_CLUSTERS_NAME: "kafka-connect" 149 | CONNECT_CLUSTERS_URL: "http://connect:8083" 150 | healthcheck: 151 | test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8080"] 152 | interval: 30s 153 | timeout: 10s 154 | retries: 5 155 | start_period: 30s 156 | 157 | # Adminer Console (Postgres Web UI) 158 | adminer: 159 | 
image: adminer:latest 160 | ports: 161 | - "8081:8080" 162 | depends_on: 163 | postgres: 164 | condition: service_healthy 165 | environment: 166 | ADMINER_DEFAULT_SERVER: postgres 167 | ADMINER_DESIGN: pepa-linha 168 | ADMINER_DEFAULT_DB: cars_db 169 | ADMINER_DEFAULT_USER: postgres 170 | ADMINER_DEFAULT_PASSWORD: postgres123 171 | restart: always 172 | 173 | volumes: 174 | postgres_data: 175 | minio_data: 176 | 177 | networks: 178 | default: 179 | driver: bridge -------------------------------------------------------------------------------- /ml_service/train.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import mlflow 3 | import boto3 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.ensemble import RandomForestRegressor 6 | from sklearn.preprocessing import StandardScaler, LabelEncoder 7 | from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error 8 | import numpy as np 9 | import os 10 | import pickle 11 | import logging 12 | from pathlib import Path 13 | from botocore.client import Config 14 | import warnings 15 | 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | warnings.filterwarnings('ignore', category=UserWarning) 21 | warnings.filterwarnings('ignore', category=FutureWarning) 22 | 23 | def setup_minio(): 24 | """Setup MinIO connection and ensure bucket exists""" 25 | try: 26 | s3_client = boto3.client( 27 | 's3', 28 | endpoint_url='http://localhost:9000', 29 | aws_access_key_id='minio', 30 | aws_secret_access_key='minio123', 31 | config=Config(signature_version='s3v4'), 32 | region_name='us-east-1' 33 | ) 34 | 35 | try: 36 | s3_client.head_bucket(Bucket='mlflow') 37 | logger.info("MLflow bucket exists") 38 | except: 39 | s3_client.create_bucket(Bucket='mlflow') 40 | logger.info("Created MLflow bucket") 41 | 42 | except Exception as e: 43 | logger.error(f"Error setting up MinIO: {str(e)}") 44 | 
raise 45 | 46 | def prepare_data(df): 47 | """Prepare and preprocess the data""" 48 | numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns 49 | for col in numeric_columns: 50 | df[col] = df[col].astype('float64') 51 | 52 | X = df.drop('price', axis=1) 53 | y = df['price'] 54 | 55 | label_encoders = {} 56 | categorical_columns = ['model', 'transmission', 'fuelType'] 57 | for column in categorical_columns: 58 | label_encoders[column] = LabelEncoder() 59 | X[column] = label_encoders[column].fit_transform(X[column]) 60 | 61 | return X, y, label_encoders 62 | 63 | def train_model(): 64 | mlflow.set_tracking_uri("http://localhost:5000") 65 | 66 | os.environ['AWS_ACCESS_KEY_ID'] = 'minio' 67 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123' 68 | os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://localhost:9000' 69 | 70 | setup_minio() 71 | mlflow.set_experiment("car-price-prediction") 72 | 73 | current_dir = Path(__file__).parent.parent 74 | data_path = current_dir / 'data' / 'ford.csv' 75 | 76 | logger.info(f"Loading data from {data_path}") 77 | if not data_path.exists(): 78 | raise FileNotFoundError(f"Data file not found at {data_path}") 79 | 80 | df = pd.read_csv(data_path) 81 | 82 | X, y, label_encoders = prepare_data(df) 83 | 84 | encoder_path = Path(__file__).parent / 'label_encoders.pkl' 85 | with open(encoder_path, 'wb') as f: 86 | pickle.dump(label_encoders, f) 87 | 88 | numerical_features = ['year', 'mileage', 'tax', 'mpg', 'engineSize'] 89 | scaler = StandardScaler() 90 | 91 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) 92 | 93 | X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features]) 94 | X_test[numerical_features] = scaler.transform(X_test[numerical_features]) 95 | 96 | scaler_path = Path(__file__).parent / 'scaler.pkl' 97 | with open(scaler_path, 'wb') as f: 98 | pickle.dump(scaler, f) 99 | 100 | with mlflow.start_run(): 101 | params = { 102 | 'n_estimators': 100, 103 | 
'max_depth': 10, 104 | 'min_samples_split': 2, 105 | 'min_samples_leaf': 1, 106 | 'random_state': 42 107 | } 108 | 109 | logger.info("Training Random Forest model...") 110 | rf = RandomForestRegressor(**params) 111 | rf.fit(X_train, y_train) 112 | 113 | y_pred = rf.predict(X_test) 114 | metrics = { 115 | 'rmse': np.sqrt(mean_squared_error(y_test, y_pred)), 116 | 'mae': mean_absolute_error(y_test, y_pred), 117 | 'r2': r2_score(y_test, y_pred) 118 | } 119 | 120 | logger.info(f"Model metrics: {metrics}") 121 | 122 | mlflow.log_params(params) 123 | mlflow.log_metrics(metrics) 124 | 125 | signature = mlflow.models.signature.infer_signature(X_train, rf.predict(X_train)) 126 | 127 | mlflow.sklearn.log_model( 128 | rf, 129 | "model", 130 | registered_model_name="car_price_predictor", 131 | signature=signature 132 | ) 133 | 134 | mlflow.log_artifact(str(encoder_path)) 135 | mlflow.log_artifact(str(scaler_path)) 136 | 137 | logger.info("Model and artifacts logged successfully") 138 | 139 | client = mlflow.tracking.MlflowClient() 140 | model_version = client.search_model_versions("name='car_price_predictor'")[0] 141 | 142 | if model_version.current_stage != "Production": 143 | client.set_registered_model_alias( 144 | name="car_price_predictor", 145 | alias="production", 146 | version=model_version.version 147 | ) 148 | logger.info("Model set as production version") 149 | 150 | if __name__ == "__main__": 151 | try: 152 | train_model() 153 | except Exception as e: 154 | logger.error(f"Training failed: {str(e)}") 155 | raise -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Car Price Predictor 2 | 3 | A machine learning system for predicting car prices using MLflow and Streamlit. This project implements a complete pipeline for predicting car prices with an interactive web interface and real-time visualization. 
4 | 5 | ## Architecture 6 | 7 | - **Frontend**: Streamlit application for data visualization and interaction 8 | - **Backend**: FastAPI REST API for data management 9 | - **ML Pipeline**: MLflow for model management and serving 10 | - **Storage**: PostgreSQL for data storage, MinIO for model artifacts 11 | 12 | ```mermaid 13 | graph TB 14 | UI[Streamlit UI:8501] --> API[FastAPI:8000] 15 | API --> DB[(PostgreSQL:5432)] 16 | DB --> DEB[Debezium:8083] 17 | DEB --> KAFKA[Kafka:9092] 18 | KAFKA --> ML[ML Service] 19 | ML --> MLFLOW[MLflow:5000] 20 | MLFLOW --> MINIO[(MinIO:9000)] 21 | ML --> KAFKA 22 | KAFKA --> API 23 | API --> DB 24 | 25 | ADMIN[Adminer:8081] --> DB 26 | KAFKAUI[Kafka UI:8080] --> KAFKA 27 | ZK[Zookeeper:2181] --> KAFKA 28 | 29 | subgraph "User Interface" 30 | UI 31 | ADMIN 32 | KAFKAUI 33 | end 34 | 35 | subgraph "Storage" 36 | DB 37 | MINIO 38 | end 39 | 40 | subgraph "Processing" 41 | KAFKA 42 | DEB 43 | ML 44 | MLFLOW 45 | ZK 46 | end 47 | 48 | style UI fill:#2563eb,stroke:#1d4ed8,color:#fff 49 | style API fill:#2563eb,stroke:#1d4ed8,color:#fff 50 | style DB fill:#059669,stroke:#047857,color:#fff 51 | style MINIO fill:#059669,stroke:#047857,color:#fff 52 | style KAFKA fill:#4b5563,stroke:#374151,color:#fff 53 | style ML fill:#7c3aed,stroke:#6d28d9,color:#fff 54 | style MLFLOW fill:#7c3aed,stroke:#6d28d9,color:#fff 55 | 56 | ``` 57 | 58 | 59 | ## Prerequisites 60 | 61 | - Docker and Docker Compose 62 | - Python 3.9+ (for local development) 63 | 64 | ## Quick Start 65 | 66 | 1. Clone the repository: 67 | ```bash 68 | git clone https://github.com/Stefen-Taime/car-price-predictor 69 | cd car-price-predictor 70 | ``` 71 | 72 | 2. Start the services: 73 | ```bash 74 | docker-compose up --build 75 | ``` 76 | 77 | 3. Train the initial model: 78 | ```bash 79 | cd ml_service 80 | python train.py 81 | ``` 82 | 83 | 4. Start the FastAPI backend: 84 | ```bash 85 | cd backend 86 | uvicorn main:app --reload 87 | ``` 88 | 89 | 5. 
Start the Streamlit frontend: 90 | ```bash 91 | cd streamlit_app 92 | streamlit run app.py 93 | ``` 94 | 95 | 6. Access the applications: 96 | - Streamlit UI: http://localhost:8501 97 | - FastAPI Docs: http://localhost:8000/docs 98 | - MLflow UI: http://localhost:5000 99 | - MinIO Console: http://localhost:9001 100 | - Kafka UI: http://localhost:8080 101 | 102 | ## Project Structure 103 | 104 | ``` 105 | . 106 | ├── backend/ # FastAPI backend service 107 | │ ├── main.py # Main API application 108 | │ └── requirements.txt # Python dependencies 109 | ├── data/ # Training data 110 | │ └── ford.csv # Sample car data 111 | ├── ml_service/ # ML training and prediction service 112 | │ ├── train.py # Model training script 113 | │ └── main.py # Prediction service 114 | ├── mlflow/ # MLflow service configuration 115 | ├── postgres/ # PostgreSQL initialization scripts 116 | ├── streamlit_app/ # Streamlit frontend application 117 | │ ├── app.py # Main Streamlit application 118 | │ └── requirements.txt # Python dependencies 119 | ├── docker-compose.yml # Docker services configuration 120 | └── README.md 121 | ``` 122 | 123 | ## Features 124 | 125 | - Real-time car price predictions 126 | - Interactive data visualization with Streamlit 127 | - RESTful API with FastAPI 128 | - ML model versioning and tracking with MLflow 129 | - Beautiful charts with Plotly 130 | - Scalable architecture 131 | - API documentation with Swagger UI 132 | 133 | ## Development 134 | 135 | ### Backend Development 136 | 137 | ```bash 138 | cd backend 139 | pip install -r requirements.txt 140 | uvicorn main:app --reload --host 0.0.0.0 --port 8000 141 | ``` 142 | 143 | ### Frontend Development 144 | 145 | ```bash 146 | cd streamlit_app 147 | pip install -r requirements.txt 148 | streamlit run app.py 149 | ``` 150 | 151 | ### ML Service Development 152 | 153 | ```bash 154 | cd ml_service 155 | pip install -r requirements.txt 156 | python train.py 157 | ``` 158 | 159 | ## API Documentation 160 | 161 | The API documentation is
available at `http://localhost:8000/docs` when the backend service is running. The following endpoints are available: 162 | 163 | - `GET /cars`: List all cars 164 | - `POST /cars`: Add a new car 165 | - `GET /cars/{car_id}`: Get car details 166 | - `GET /health`: Check service health 167 | 168 | ## Data Flow 169 | 170 | 1. User submits car data through Streamlit interface 171 | 2. Data is sent to FastAPI backend 172 | 3. ML service makes predictions using MLflow 173 | 4. Results are stored in PostgreSQL 174 | 5. Updated data is displayed in Streamlit UI 175 | 176 | ## Technologies Used 177 | 178 | - **Backend**: 179 | - FastAPI for REST API 180 | - Pydantic for data validation 181 | - SQLAlchemy for database ORM 182 | 183 | - **Frontend**: 184 | - Streamlit for UI 185 | - Plotly for data visualization 186 | - Pandas for data manipulation 187 | 188 | - **ML Pipeline**: 189 | - MLflow for model management 190 | - scikit-learn for ML models 191 | - PostgreSQL for data storage 192 | - MinIO for artifact storage 193 | 194 | ## Contributing 195 | 196 | 1. Fork the repository 197 | 2. Create your feature branch (`git checkout -b feature/amazing-feature`) 198 | 3. Commit your changes (`git commit -m 'Add some amazing feature'`) 199 | 4. Push to the branch (`git push origin feature/amazing-feature`) 200 | 5. 
Open a Pull Request 201 | 202 | 203 | ## Acknowledgments 204 | 205 | - Ford Used Car Dataset 206 | - MLflow for ML model management 207 | - FastAPI for the backend API 208 | - Streamlit for the interactive UI 209 | -------------------------------------------------------------------------------- /add_cars.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Fiesta variants 4 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 5 | -d '{"model":"Fiesta","year":2020,"price":12500,"transmission":"Manual","mileage":25000,"fuelType":"Petrol","tax":150,"mpg":55.4,"engineSize":1.0}' 6 | echo -e "\nFiesta 1.0 added" 7 | 8 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 9 | -d '{"model":"Fiesta","year":2021,"price":14500,"transmission":"Automatic","mileage":18000,"fuelType":"Petrol","tax":155,"mpg":52.3,"engineSize":1.1}' 10 | echo -e "\nFiesta 1.1 added" 11 | 12 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 13 | -d '{"model":"Fiesta ST","year":2022,"price":22000,"transmission":"Manual","mileage":8000,"fuelType":"Petrol","tax":165,"mpg":45.6,"engineSize":1.5}' 14 | echo -e "\nFiesta ST added" 15 | 16 | # Focus variants 17 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 18 | -d '{"model":"Focus","year":2021,"price":18500,"transmission":"Manual","mileage":15000,"fuelType":"Petrol","tax":165,"mpg":50.2,"engineSize":1.5}' 19 | echo -e "\nFocus 1.5 added" 20 | 21 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 22 | -d '{"model":"Focus","year":2022,"price":21000,"transmission":"Automatic","mileage":12000,"fuelType":"Diesel","tax":155,"mpg":58.8,"engineSize":2.0}' 23 | echo -e "\nFocus Diesel added" 24 | 25 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 26 | -d '{"model":"Focus 
ST","year":2023,"price":34000,"transmission":"Manual","mileage":5000,"fuelType":"Petrol","tax":185,"mpg":35.7,"engineSize":2.3}' 27 | echo -e "\nFocus ST added" 28 | 29 | # Puma variants 30 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 31 | -d '{"model":"Puma","year":2023,"price":22500,"transmission":"Manual","mileage":5000,"fuelType":"Petrol","tax":155,"mpg":52.3,"engineSize":1.0}' 32 | echo -e "\nPuma 1.0 added" 33 | 34 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 35 | -d '{"model":"Puma","year":2022,"price":24500,"transmission":"Automatic","mileage":8000,"fuelType":"Hybrid","tax":145,"mpg":58.9,"engineSize":1.0}' 36 | echo -e "\nPuma Hybrid added" 37 | 38 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 39 | -d '{"model":"Puma ST","year":2023,"price":32000,"transmission":"Manual","mileage":3000,"fuelType":"Petrol","tax":170,"mpg":42.8,"engineSize":1.5}' 40 | echo -e "\nPuma ST added" 41 | 42 | # Kuga variants 43 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 44 | -d '{"model":"Kuga","year":2022,"price":28000,"transmission":"Manual","mileage":12000,"fuelType":"Diesel","tax":155,"mpg":54.3,"engineSize":1.5}' 45 | echo -e "\nKuga Diesel added" 46 | 47 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 48 | -d '{"model":"Kuga","year":2023,"price":32000,"transmission":"Automatic","mileage":8000,"fuelType":"Hybrid","tax":145,"mpg":48.7,"engineSize":2.0}' 49 | echo -e "\nKuga Hybrid added" 50 | 51 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 52 | -d '{"model":"Kuga PHEV","year":2023,"price":38000,"transmission":"Automatic","mileage":5000,"fuelType":"Hybrid","tax":0,"mpg":201.8,"engineSize":2.5}' 53 | echo -e "\nKuga PHEV added" 54 | 55 | # Mustang variants 56 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 57 | -d 
'{"model":"Mustang","year":2021,"price":45000,"transmission":"Manual","mileage":12000,"fuelType":"Petrol","tax":580,"mpg":25.7,"engineSize":5.0}' 58 | echo -e "\nMustang GT Manual added" 59 | 60 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 61 | -d '{"model":"Mustang","year":2022,"price":48000,"transmission":"Automatic","mileage":8000,"fuelType":"Petrol","tax":580,"mpg":24.8,"engineSize":5.0}' 62 | echo -e "\nMustang GT Auto added" 63 | 64 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 65 | -d '{"model":"Mustang Mach-E","year":2023,"price":55000,"transmission":"Automatic","mileage":5000,"fuelType":"Electric","tax":0,"mpg":379.0,"engineSize":0.0}' 66 | echo -e "\nMustang Mach-E added" 67 | 68 | # Explorer variants 69 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 70 | -d '{"model":"Explorer","year":2022,"price":58000,"transmission":"Automatic","mileage":15000,"fuelType":"Petrol","tax":580,"mpg":25.7,"engineSize":3.0}' 71 | echo -e "\nExplorer Petrol added" 72 | 73 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 74 | -d '{"model":"Explorer","year":2023,"price":65000,"transmission":"Automatic","mileage":8000,"fuelType":"Hybrid","tax":150,"mpg":35.3,"engineSize":3.0}' 75 | echo -e "\nExplorer Hybrid added" 76 | 77 | # Ranger variants 78 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 79 | -d '{"model":"Ranger","year":2022,"price":32000,"transmission":"Manual","mileage":20000,"fuelType":"Diesel","tax":290,"mpg":35.3,"engineSize":2.0}' 80 | echo -e "\nRanger Diesel added" 81 | 82 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 83 | -d '{"model":"Ranger Raptor","year":2023,"price":48000,"transmission":"Automatic","mileage":5000,"fuelType":"Diesel","tax":290,"mpg":32.1,"engineSize":3.0}' 84 | echo -e "\nRanger Raptor added" 85 | 86 | # Transit Custom variants 87 | curl 
-X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 88 | -d '{"model":"Transit Custom","year":2022,"price":28000,"transmission":"Manual","mileage":25000,"fuelType":"Diesel","tax":275,"mpg":40.9,"engineSize":2.0}' 89 | echo -e "\nTransit Custom added" 90 | 91 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 92 | -d '{"model":"Transit Custom PHEV","year":2023,"price":42000,"transmission":"Automatic","mileage":8000,"fuelType":"Hybrid","tax":0,"mpg":91.7,"engineSize":1.0}' 93 | echo -e "\nTransit Custom PHEV added" 94 | 95 | # Tourneo variants 96 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 97 | -d '{"model":"Tourneo Custom","year":2022,"price":38000,"transmission":"Automatic","mileage":15000,"fuelType":"Diesel","tax":275,"mpg":38.2,"engineSize":2.0}' 98 | echo -e "\nTourneo Custom added" 99 | 100 | curl -X POST "http://localhost:8000/cars" -H "Content-Type: application/json" \ 101 | -d '{"model":"Tourneo Connect","year":2023,"price":32000,"transmission":"Manual","mileage":8000,"fuelType":"Diesel","tax":155,"mpg":45.6,"engineSize":1.5}' 102 | echo -e "\nTourneo Connect added" 103 | 104 | echo -e "\nAll cars have been added successfully!" 
-------------------------------------------------------------------------------- /backend/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, HTTPException, Depends 2 | from fastapi.middleware.cors import CORSMiddleware 3 | from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime 4 | from sqlalchemy.ext.declarative import declarative_base 5 | from sqlalchemy.orm import sessionmaker, Session 6 | from pydantic import BaseModel, Field, validator 7 | from typing import Optional, List, Dict, Any 8 | import os 9 | from datetime import datetime 10 | from kafka import KafkaProducer, KafkaConsumer 11 | import json 12 | import logging 13 | from contextlib import contextmanager 14 | import threading 15 | import time 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | logger = logging.getLogger(__name__) 19 | 20 | app = FastAPI(title="Car Price Prediction API") 21 | 22 | app.add_middleware( 23 | CORSMiddleware, 24 | allow_origins=["*"], 25 | allow_credentials=True, 26 | allow_methods=["*"], 27 | allow_headers=["*"], 28 | ) 29 | 30 | SQLALCHEMY_DATABASE_URL = "postgresql://postgres:postgres123@localhost:5432/cars_db" 31 | engine = create_engine(SQLALCHEMY_DATABASE_URL) 32 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) 33 | Base = declarative_base() 34 | 35 | KAFKA_BOOTSTRAP_SERVERS = "localhost:9092" 36 | KAFKA_TOPIC_LISTINGS = "cars-db.public.listings" 37 | KAFKA_TOPIC_PREDICTIONS = "cars.public.predictions" 38 | 39 | class Car(Base): 40 | __tablename__ = "listings" 41 | 42 | id = Column(Integer, primary_key=True, index=True) 43 | model = Column(String) 44 | year = Column(Integer) 45 | price = Column(Float) 46 | transmission = Column(String) 47 | mileage = Column(Integer) 48 | fuelType = Column(String) 49 | tax = Column(Float) 50 | mpg = Column(Float) 51 | engineSize = Column(Float) 52 | predicted_price = Column(Float, nullable=True) 53 | created_at = 
Column(DateTime, default=datetime.utcnow) 54 | updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) 55 | 56 | class CarBase(BaseModel): 57 | model: str = Field(..., example="Fiesta") 58 | year: int = Field(..., ge=1900, le=datetime.now().year, example=2019) 59 | price: float = Field(..., gt=0, example=12000) 60 | transmission: str = Field(..., example="Manual") 61 | mileage: int = Field(..., ge=0, example=25000) 62 | fuelType: str = Field(..., example="Petrol") 63 | tax: float = Field(..., ge=0, example=145) 64 | mpg: float = Field(..., ge=0, example=55.4) 65 | engineSize: float = Field(..., gt=0, example=1.0) 66 | 67 | @validator('transmission') 68 | def validate_transmission(cls, v): 69 | allowed = {'Manual', 'Automatic', 'Semi-Auto'} 70 | if v not in allowed: 71 | raise ValueError(f'transmission must be one of {allowed}') 72 | return v 73 | 74 | @validator('fuelType') 75 | def validate_fuel_type(cls, v): 76 | allowed = {'Petrol', 'Diesel', 'Hybrid', 'Electric'} 77 | if v not in allowed: 78 | raise ValueError(f'fuelType must be one of {allowed}') 79 | return v 80 | 81 | class CarCreate(CarBase): 82 | pass 83 | 84 | class CarResponse(CarBase): 85 | id: int 86 | predicted_price: Optional[float] = None 87 | created_at: datetime 88 | updated_at: datetime 89 | 90 | class Config: 91 | from_attributes = True 92 | 93 | @contextmanager 94 | def get_db(): 95 | db = SessionLocal() 96 | try: 97 | yield db 98 | finally: 99 | db.close() 100 | 101 | class KafkaManager: 102 | _producer = None 103 | _consumer = None 104 | _consumer_thread = None 105 | _running = False 106 | 107 | @classmethod 108 | def get_producer(cls): 109 | if cls._producer is None: 110 | cls._producer = KafkaProducer( 111 | bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, 112 | value_serializer=lambda x: json.dumps(x).encode('utf-8') 113 | ) 114 | return cls._producer 115 | 116 | @classmethod 117 | def start_consumer(cls): 118 | if cls._consumer_thread is None: 119 | cls._running = True 
120 | cls._consumer_thread = threading.Thread(target=cls._consume_predictions) 121 | cls._consumer_thread.daemon = True 122 | cls._consumer_thread.start() 123 | 124 | @classmethod 125 | def _consume_predictions(cls): 126 | consumer = KafkaConsumer( 127 | KAFKA_TOPIC_PREDICTIONS, 128 | bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, 129 | value_deserializer=lambda x: json.loads(x.decode('utf-8')), 130 | auto_offset_reset='latest', 131 | enable_auto_commit=True, 132 | group_id='backend-consumer' 133 | ) 134 | 135 | while cls._running: 136 | try: 137 | messages = consumer.poll(timeout_ms=1000) 138 | for topic_partition, msgs in messages.items(): 139 | for message in msgs: 140 | cls._handle_prediction(message.value) 141 | except Exception as e: 142 | logger.error(f"Error consuming messages: {str(e)}") 143 | time.sleep(5) 144 | 145 | consumer.close() 146 | 147 | @classmethod 148 | def _handle_prediction(cls, prediction_data): 149 | try: 150 | with get_db() as db: 151 | car_id = prediction_data.get('id') 152 | predicted_price = prediction_data.get('predicted_price') 153 | 154 | if car_id and predicted_price: 155 | car = db.query(Car).filter(Car.id == car_id).first() 156 | if car: 157 | car.predicted_price = predicted_price 158 | db.commit() 159 | logger.info(f"Updated prediction for car {car_id}: £{predicted_price:,.2f}") 160 | except Exception as e: 161 | logger.error(f"Error handling prediction: {str(e)}") 162 | 163 | # Routes 164 | @app.get("/cars", response_model=List[CarResponse]) 165 | def get_cars(): 166 | with get_db() as db: 167 | cars = db.query(Car).all() 168 | return cars 169 | 170 | @app.post("/cars", response_model=CarResponse) 171 | def create_car(car: CarCreate): 172 | with get_db() as db: 173 | db_car = Car(**car.dict()) 174 | db.add(db_car) 175 | db.commit() 176 | db.refresh(db_car) 177 | 178 | try: 179 | producer = KafkaManager.get_producer() 180 | producer.send(KAFKA_TOPIC_LISTINGS, db_car.__dict__) 181 | producer.flush() 182 | logger.info(f"Sent car 
{db_car.id} to Kafka") 183 | except Exception as e: 184 | logger.error(f"Error sending to Kafka: {str(e)}") 185 | 186 | return db_car 187 | 188 | @app.get("/cars/{car_id}", response_model=CarResponse) 189 | def get_car(car_id: int): 190 | with get_db() as db: 191 | car = db.query(Car).filter(Car.id == car_id).first() 192 | if car is None: 193 | raise HTTPException(status_code=404, detail="Car not found") 194 | return car 195 | 196 | @app.get("/health") 197 | def health_check(): 198 | return { 199 | "status": "healthy", 200 | "timestamp": datetime.utcnow().isoformat(), 201 | "database": "connected" if engine.connect() else "disconnected" 202 | } 203 | 204 | @app.on_event("startup") 205 | async def startup_event(): 206 | Base.metadata.create_all(bind=engine) 207 | KafkaManager.start_consumer() 208 | logger.info("Application started, Kafka consumer running") 209 | 210 | @app.on_event("shutdown") 211 | async def shutdown_event(): 212 | KafkaManager._running = False 213 | if KafkaManager._consumer_thread: 214 | KafkaManager._consumer_thread.join(timeout=5) 215 | if KafkaManager._producer: 216 | KafkaManager._producer.close() 217 | logger.info("Application shutdown complete") 218 | 219 | if __name__ == "__main__": 220 | import uvicorn 221 | uvicorn.run(app, host="0.0.0.0", port=8000) -------------------------------------------------------------------------------- /ml_service/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import mlflow 4 | import pandas as pd 5 | import numpy as np 6 | from kafka import KafkaConsumer, KafkaProducer 7 | from sklearn.preprocessing import StandardScaler, LabelEncoder 8 | import pickle 9 | from dotenv import load_dotenv 10 | import logging 11 | from typing import Optional, Dict, Any 12 | import time 13 | from pathlib import Path 14 | import boto3 15 | from botocore.client import Config 16 | import warnings 17 | 18 | logging.basicConfig( 19 | level=logging.INFO, 20 | 
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 21 | ) 22 | logger = logging.getLogger(__name__) 23 | 24 | warnings.filterwarnings('ignore', category=UserWarning) 25 | warnings.filterwarnings('ignore', category=FutureWarning) 26 | 27 | class CarPricePredictor: 28 | def __init__(self): 29 | self.model = None 30 | self.label_encoders = None 31 | self.scaler = None 32 | self.consumer = None 33 | self.producer = None 34 | 35 | def setup_minio(self): 36 | """Configure MinIO credentials""" 37 | os.environ['AWS_ACCESS_KEY_ID'] = 'minio' 38 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123' 39 | os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://localhost:9000' 40 | 41 | boto3.client( 42 | 's3', 43 | endpoint_url='http://localhost:9000', 44 | aws_access_key_id='minio', 45 | aws_secret_access_key='minio123', 46 | config=Config(signature_version='s3v4'), 47 | region_name='us-east-1' 48 | ) 49 | logger.info("MinIO credentials configured") 50 | 51 | def load_model_and_artifacts(self) -> None: 52 | """Load the ML model and preprocessing artifacts""" 53 | try: 54 | logger.info("Loading model and artifacts...") 55 | 56 | self.setup_minio() 57 | 58 | current_dir = Path(__file__).parent 59 | 60 | encoder_path = current_dir / 'label_encoders.pkl' 61 | with open(encoder_path, "rb") as f: 62 | self.label_encoders = pickle.load(f) 63 | logger.info("Loaded label encoders") 64 | 65 | scaler_path = current_dir / 'scaler.pkl' 66 | with open(scaler_path, "rb") as f: 67 | self.scaler = pickle.load(f) 68 | logger.info("Loaded scaler") 69 | 70 | mlflow.set_tracking_uri("http://localhost:5000") 71 | 72 | max_retries = 5 73 | for i in range(max_retries): 74 | try: 75 | runs = mlflow.search_runs(experiment_ids=["1"]) 76 | if len(runs) > 0: 77 | break 78 | except Exception as e: 79 | if i == max_retries - 1: 80 | raise 81 | logger.warning(f"Failed to connect to MLflow, retrying... 
({i+1}/{max_retries})") 82 | time.sleep(5) 83 | 84 | if len(runs) == 0: 85 | raise Exception("No runs found in MLflow") 86 | 87 | latest_run = runs.sort_values("start_time", ascending=False).iloc[0] 88 | run_id = latest_run.run_id 89 | 90 | logger.info(f"Loading model from run {run_id}") 91 | self.model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model") 92 | logger.info("Model loaded successfully") 93 | 94 | except Exception as e: 95 | logger.error(f"Error loading model and artifacts: {str(e)}") 96 | raise 97 | 98 | def setup_kafka(self) -> None: 99 | """Initialize Kafka consumer and producer""" 100 | try: 101 | KAFKA_BOOTSTRAP_SERVERS = os.getenv('KAFKA_BOOTSTRAP_SERVERS', 'localhost:9092') 102 | 103 | self.consumer = KafkaConsumer( 104 | 'cars-db.public.listings', 105 | bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, 106 | auto_offset_reset='earliest', 107 | enable_auto_commit=True, 108 | group_id='car_price_predictor', 109 | value_deserializer=lambda x: json.loads(x.decode('utf-8')), 110 | consumer_timeout_ms=1000 111 | ) 112 | 113 | self.producer = KafkaProducer( 114 | bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS, 115 | value_serializer=lambda x: json.dumps(x).encode('utf-8'), 116 | retries=5 117 | ) 118 | 119 | logger.info("Kafka consumer and producer initialized") 120 | 121 | except Exception as e: 122 | logger.error(f"Error setting up Kafka: {str(e)}") 123 | raise 124 | 125 | def preprocess_data(self, data: Dict[str, Any]) -> pd.DataFrame: 126 | """Preprocess the input data""" 127 | try: 128 | df = pd.DataFrame([data]) 129 | 130 | numeric_columns = ['year', 'mileage', 'tax', 'mpg', 'engineSize'] 131 | for col in numeric_columns: 132 | df[col] = df[col].astype(float) 133 | 134 | categorical_columns = ['model', 'transmission', 'fuelType'] 135 | for column in categorical_columns: 136 | df[column] = self.label_encoders[column].transform(df[column]) 137 | 138 | df[numeric_columns] = self.scaler.transform(df[numeric_columns]) 139 | 140 | return df 141 | 142 | except 
Exception as e: 143 | logger.error(f"Error preprocessing data: {str(e)}") 144 | raise 145 | 146 | def process_message(self, message: Any) -> Optional[Dict[str, Any]]: 147 | """Process incoming Kafka message and return prediction""" 148 | try: 149 | data = message.value if isinstance(message.value, dict) else json.loads(message.value) 150 | 151 | if 'payload' in data and 'after' in data['payload']: 152 | car_data = data['payload']['after'] 153 | else: 154 | car_data = data 155 | 156 | required_fields = ['model', 'year', 'transmission', 'mileage', 157 | 'fuelType', 'tax', 'mpg', 'engineSize'] 158 | 159 | for field in required_fields: 160 | if field not in car_data: 161 | logger.warning(f"Missing required field: {field}") 162 | return None 163 | 164 | processed_data = self.preprocess_data(car_data) 165 | 166 | prediction = self.model.predict(processed_data)[0] 167 | 168 | car_data['predicted_price'] = float(prediction) 169 | car_data['prediction_timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S') 170 | 171 | return car_data 172 | 173 | except Exception as e: 174 | logger.error(f"Error processing message: {str(e)}") 175 | return None 176 | 177 | def run(self) -> None: 178 | """Main processing loop""" 179 | logger.info("Starting to consume messages...") 180 | 181 | while True: 182 | try: 183 | messages = self.consumer.poll(timeout_ms=1000) 184 | 185 | for topic_partition, msgs in messages.items(): 186 | for message in msgs: 187 | result = self.process_message(message) 188 | 189 | if result: 190 | self.producer.send('cars.public.predictions', value=result) 191 | self.producer.flush() 192 | 193 | logger.info( 194 | f"Processed car: {result.get('model', 'Unknown')} " 195 | f"({result.get('year', 'Unknown')}). 
" 196 | f"Predicted price: £{result['predicted_price']:,.2f}" 197 | ) 198 | 199 | time.sleep(0.1) 200 | 201 | except KeyboardInterrupt: 202 | logger.info("Stopping the service...") 203 | break 204 | 205 | except Exception as e: 206 | logger.error(f"Error in processing loop: {str(e)}") 207 | time.sleep(5) 208 | 209 | try: 210 | self.consumer.close() 211 | self.producer.close() 212 | logger.info("Kafka connections closed") 213 | except Exception as e: 214 | logger.error(f"Error during cleanup: {str(e)}") 215 | 216 | logger.info("Service stopped") 217 | 218 | def main(): 219 | """Main entry point""" 220 | try: 221 | load_dotenv() 222 | 223 | predictor = CarPricePredictor() 224 | 225 | predictor.load_model_and_artifacts() 226 | 227 | predictor.setup_kafka() 228 | 229 | predictor.run() 230 | 231 | except Exception as e: 232 | logger.error(f"Application failed: {str(e)}") 233 | raise 234 | 235 | finally: 236 | logger.info("Application shutdown complete") 237 | 238 | if __name__ == "__main__": 239 | main() -------------------------------------------------------------------------------- /streamlit_app/app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import requests 4 | import plotly.express as px 5 | from datetime import datetime 6 | import time 7 | 8 | API_URL = "http://localhost:8000" 9 | 10 | st.set_page_config( 11 | page_title="Car Price Predictor", 12 | page_icon="🚗", 13 | layout="wide" 14 | ) 15 | 16 | st.markdown(""" 17 | 41 | """, unsafe_allow_html=True) 42 | 43 | @st.cache_data(ttl=300) 44 | def fetch_cars(): 45 | """Fetch cars data from API with caching""" 46 | try: 47 | st.write("Fetching data from API...") 48 | response = requests.get(f"{API_URL}/cars") 49 | st.write(f"Response status: {response.status_code}") 50 | if response.status_code == 200: 51 | data = pd.DataFrame(response.json()) 52 | st.write(f"Data fetched successfully. 
Shape: {data.shape}") 53 | return data 54 | st.write("Failed to fetch data") 55 | return pd.DataFrame() 56 | except Exception as e: 57 | st.write(f"Error fetching data: {str(e)}") 58 | return pd.DataFrame() 59 | 60 | @st.cache_data(ttl=600) 61 | def get_unique_values(df, column): 62 | """Get unique values from DataFrame column""" 63 | try: 64 | return sorted(df[column].unique().tolist()) if not df.empty else [] 65 | except Exception as e: 66 | st.write(f"Error getting unique values: {str(e)}") 67 | return [] 68 | 69 | def create_plot(df, plot_type): 70 | """Create different types of plots based on the data""" 71 | try: 72 | if df.empty: 73 | return None 74 | 75 | if plot_type == "distribution": 76 | fig = px.histogram( 77 | df, 78 | x="price", 79 | title="Price Distribution", 80 | nbins=30, 81 | labels={"price": "Price ($)", "count": "Number of Cars"} 82 | ) 83 | fig.update_layout(showlegend=False) 84 | 85 | elif plot_type == "scatter": 86 | fig = px.scatter( 87 | df[df['price'].notna()], 88 | x="mileage", 89 | y="price", 90 | color="model", 91 | size="engineSize", 92 | title="Price vs Mileage by Model", 93 | labels={ 94 | "mileage": "Mileage (miles)", 95 | "price": "Price ($)", 96 | "model": "Model", 97 | "engineSize": "Engine Size (L)" 98 | } 99 | ) 100 | 101 | elif plot_type == "box": 102 | fig = px.box( 103 | df[df['price'].notna()], 104 | x="year", 105 | y="price", 106 | title="Price Distribution by Year", 107 | labels={"year": "Year", "price": "Price ($)"} 108 | ) 109 | 110 | elif plot_type == "prediction": 111 | valid_data = df[df['price'].notna() & df['predicted_price'].notna()] 112 | if not valid_data.empty: 113 | fig = px.scatter( 114 | valid_data, 115 | x="price", 116 | y="predicted_price", 117 | color="model", 118 | title="Predicted vs Actual Price", 119 | labels={ 120 | "price": "Actual Price ($)", 121 | "predicted_price": "Predicted Price ($)", 122 | "model": "Model" 123 | } 124 | ) 125 | 126 | min_val = min(valid_data['price'].min(), 
valid_data['predicted_price'].min()) 127 | max_val = max(valid_data['price'].max(), valid_data['predicted_price'].max()) 128 | fig.add_shape( 129 | type='line', 130 | line=dict(dash='dash', color='gray'), 131 | x0=min_val, 132 | y0=min_val, 133 | x1=max_val, 134 | y1=max_val 135 | ) 136 | return fig 137 | return None 138 | 139 | return fig 140 | except Exception as e: 141 | st.write(f"Error creating plot: {str(e)}") 142 | return None 143 | 144 | st.title("🚗 Car Price Predictor") 145 | 146 | with st.sidebar: 147 | st.header("Add New Car") 148 | 149 | with st.form("car_form", clear_on_submit=True): 150 | model = st.text_input( 151 | "Model", 152 | help="Enter the car model (e.g., Ford Fiesta)" 153 | ) 154 | year = st.number_input( 155 | "Year", 156 | min_value=1900, 157 | max_value=datetime.now().year, 158 | value=2020, 159 | help="Select the manufacturing year" 160 | ) 161 | transmission = st.selectbox( 162 | "Transmission", 163 | ["Manual", "Automatic", "Semi-Auto"], 164 | help="Select the transmission type" 165 | ) 166 | fuel_type = st.selectbox( 167 | "Fuel Type", 168 | ["Petrol", "Diesel", "Hybrid", "Electric"], 169 | help="Select the fuel type" 170 | ) 171 | mileage = st.number_input( 172 | "Mileage", 173 | min_value=0, 174 | value=10000, 175 | help="Enter the total mileage in miles" 176 | ) 177 | engine_size = st.number_input( 178 | "Engine Size (L)", 179 | min_value=0.1, 180 | max_value=10.0, 181 | value=1.5, 182 | step=0.1, 183 | help="Enter the engine size in liters" 184 | ) 185 | tax = st.number_input( 186 | "Tax ($)", 187 | min_value=0, 188 | value=150, 189 | help="Enter the annual road tax" 190 | ) 191 | mpg = st.number_input( 192 | "MPG", 193 | min_value=0.0, 194 | value=50.0, 195 | help="Enter the miles per gallon" 196 | ) 197 | price = st.number_input( 198 | "Price ($)", 199 | min_value=0, 200 | value=15000, 201 | help="Enter the car price" 202 | ) 203 | 204 | submitted = st.form_submit_button("Add Car") 205 | 206 | if submitted and model: 207 | 
confirmation = st.checkbox("I confirm all details are correct") 208 | if confirmation: 209 | car_data = { 210 | "model": model, 211 | "year": year, 212 | "transmission": transmission, 213 | "fuelType": fuel_type, 214 | "mileage": mileage, 215 | "engineSize": engine_size, 216 | "tax": tax, 217 | "mpg": mpg, 218 | "price": price 219 | } 220 | 221 | with st.spinner('Adding car...'): 222 | try: 223 | st.write(f"Submitting data: {car_data}") 224 | response = requests.post(f"{API_URL}/cars", json=car_data) 225 | st.write(f"Response status: {response.status_code}") 226 | if response.status_code == 200: 227 | st.success("Car added successfully!") 228 | st.balloons() 229 | fetch_cars.clear() 230 | time.sleep(1) 231 | else: 232 | st.error(f"Failed to add car: {response.text}") 233 | except Exception as e: 234 | st.error(f"Failed to connect to server: {str(e)}") 235 | 236 | df = fetch_cars() 237 | 238 | if not df.empty: 239 | st.subheader("📊 Dashboard Overview") 240 | col1, col2, col3, col4 = st.columns(4) 241 | 242 | with col1: 243 | st.metric("Total Cars", len(df)) 244 | 245 | with col2: 246 | valid_prices = df['price'].dropna() 247 | if not valid_prices.empty: 248 | st.metric("Average Price", f"${valid_prices.mean():,.2f}") 249 | else: 250 | st.metric("Average Price", "N/A") 251 | 252 | with col3: 253 | if 'predicted_price' in df.columns: 254 | valid_predictions = df['predicted_price'].dropna() 255 | if not valid_predictions.empty: 256 | st.metric("Average Predicted Price", 257 | f"${valid_predictions.mean():,.2f}") 258 | else: 259 | st.metric("Average Predicted Price", "Pending") 260 | 261 | with col4: 262 | valid_prices = df['price'].dropna() 263 | if not valid_prices.empty: 264 | st.metric("Price Range", 265 | f"${valid_prices.min():,.0f} - ${valid_prices.max():,.0f}") 266 | else: 267 | st.metric("Price Range", "N/A") 268 | 269 | st.subheader("📈 Price Analysis") 270 | tab1, tab2, tab3, tab4 = st.tabs([ 271 | "Price Distribution", 272 | "Price vs Mileage", 273 | "Price 
by Year", 274 | "Prediction Analysis" 275 | ]) 276 | 277 | with tab1: 278 | fig = create_plot(df, "distribution") 279 | if fig: 280 | st.plotly_chart(fig, use_container_width=True) 281 | 282 | with tab2: 283 | fig = create_plot(df, "scatter") 284 | if fig: 285 | st.plotly_chart(fig, use_container_width=True) 286 | 287 | with tab3: 288 | fig = create_plot(df, "box") 289 | if fig: 290 | st.plotly_chart(fig, use_container_width=True) 291 | 292 | with tab4: 293 | fig = create_plot(df, "prediction") 294 | if fig: 295 | st.plotly_chart(fig, use_container_width=True) 296 | 297 | st.subheader("🚗 Car Listings") 298 | 299 | col1, col2, col3 = st.columns(3) 300 | with col1: 301 | models = get_unique_values(df, 'model') 302 | model_filter = st.multiselect("Filter by Model", options=models) 303 | 304 | with col2: 305 | min_year = int(df['year'].min()) 306 | max_year = int(df['year'].max()) 307 | if min_year == max_year: 308 | year_filter = (min_year, min_year) 309 | st.info(f"Only cars from {min_year}") 310 | else: 311 | year_filter = st.slider( 312 | "Filter by Year", 313 | min_value=min_year, 314 | max_value=max_year, 315 | value=(min_year, max_year) 316 | ) 317 | 318 | with col3: 319 | fuel_types = get_unique_values(df, 'fuelType') 320 | fuel_filter = st.multiselect("Filter by Fuel Type", options=fuel_types) 321 | 322 | filtered_df = df.copy() 323 | if model_filter: 324 | filtered_df = filtered_df[filtered_df['model'].isin(model_filter)] 325 | filtered_df = filtered_df[ 326 | (filtered_df['year'] >= year_filter[0]) & 327 | (filtered_df['year'] <= year_filter[1]) 328 | ] 329 | if fuel_filter: 330 | filtered_df = filtered_df[filtered_df['fuelType'].isin(fuel_filter)] 331 | 332 | if not filtered_df.empty: 333 | display_df = filtered_df[[ 334 | 'model', 'year', 'price', 'predicted_price', 'mileage', 335 | 'transmission', 'fuelType', 'mpg', 'engineSize' 336 | ]].copy() 337 | 338 | st.dataframe( 339 | display_df.style.format({ 340 | 'price': lambda x: f'${x:,.2f}' if pd.notnull(x) 
else 'N/A', 341 | 'predicted_price': lambda x: f'${x:,.2f}' if pd.notnull(x) else 'Pending', 342 | 'engineSize': lambda x: f'{x:.1f}L' if pd.notnull(x) else 'N/A', 343 | 'mpg': lambda x: f'{x:.1f}' if pd.notnull(x) else 'N/A', 344 | 'mileage': lambda x: f'{x:,.0f}' if pd.notnull(x) else 'N/A' 345 | }), 346 | hide_index=True, 347 | use_container_width=True 348 | ) 349 | 350 | csv = filtered_df.to_csv(index=False) 351 | st.download_button( 352 | "📥 Download Data", 353 | csv, 354 | "car_data.csv", 355 | "text/csv", 356 | key='download-csv' 357 | ) 358 | else: 359 | st.info("No cars match the selected filters") 360 | else: 361 | st.info("No cars in the database. Add some cars to get started!") 362 | 363 | st.markdown("---") 364 | st.markdown( 365 | """ 366 |