├── rightmove
│   ├── backend
│   │   ├── __init__.py
│   │   ├── app
│   │   │   ├── __init__.py
│   │   │   ├── models
│   │   │   │   ├── __init__.py
│   │   │   │   ├── coordinates.py
│   │   │   │   ├── property.py
│   │   │   │   └── pricing_category.py
│   │   │   ├── data_processing
│   │   │   │   ├── __init__.py
│   │   │   │   ├── DataPreprocessor.py
│   │   │   │   └── walk_score_processing.py
│   │   │   └── main.py
│   │   ├── requirements.txt
│   │   ├── Dockerfile
│   │   ├── fastapi.yaml
│   │   └── tests
│   │       └── integration_test.py
│   ├── dashboard
│   │   ├── streamlit
│   │   │   ├── __init__.py
│   │   │   ├── data_processing
│   │   │   │   ├── __init__.py
│   │   │   │   └── processing.py
│   │   │   ├── pages
│   │   │   │   ├── 05_WordCloud.py
│   │   │   │   ├── 04_MachineLearning.py
│   │   │   │   ├── 03_WalkScore.py
│   │   │   │   └── 02_Price.py
│   │   │   └── 01_LandingPage.py
│   │   ├── requirements.txt
│   │   └── Dockerfile
│   ├── data_ingestion
│   │   ├── rightmove_scraper
│   │   │   ├── rightmove_scraper
│   │   │   │   ├── __init__.py
│   │   │   │   ├── spiders
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── rightmove.py
│   │   │   │   ├── items.py
│   │   │   │   ├── pipelines.py
│   │   │   │   ├── settings.py
│   │   │   │   └── middlewares.py
│   │   │   ├── requirements.txt
│   │   │   ├── scrapyd.conf
│   │   │   ├── scrapy.cfg
│   │   │   ├── setup.py
│   │   │   └── Dockerfile
│   │   └── scrapy.yaml
│   ├── orchestration
│   │   └── airflow_app
│   │       ├── dags
│   │       │   └── rightmove
│   │       │       ├── data_processing
│   │       │       │   ├── __init__.py
│   │       │       │   ├── data_processor.py
│   │       │       │   ├── rightmove_processing.py
│   │       │       │   └── metric_extraction.py
│   │       │       ├── visualization_data.py
│   │       │       ├── rightmove_ingest.py
│   │       │       ├── train_model.py
│   │       │       └── ml_monitoring.py
│   │       ├── requirements.txt
│   │       ├── setup.py
│   │       └── Dockerfile
│   ├── mlflow
│   │   ├── requirements.txt
│   │   └── Dockerfile
│   └── monitoring
│       └── config
│           ├── grafana_datasources.yaml
│           └── grafana_dashboards.yaml
├── config.yaml
├── static
│   └── images
│       ├── mlops_pipeline.png
│       ├── model_monitoring.png
│       ├── scrapy_monitoring.png
│       ├── Rightmove extraction.png
│       ├── model_training_pipeline.png
│       └── Processing_pipeline_rightmove.png
├── infrastructure
│   ├── aws
│   │   ├── variables.tf
│   │   ├── main.tf
│   │   └── database.tf
│   └── gcp
│       ├── bucket.tf
│       └── main.tf
├── setup.py
├── .gitignore
├── notebooks
│   ├── data_processing
│   │   └── process_boundaries.py
│   ├── data_ingestion
│   │   ├── fetch_outcodes.ipynb
│   │   ├── .ipynb_checkpoints
│   │   │   └── fetch_outcodes-checkpoint.ipynb
│   │   └── scrapy_connection.ipynb
│   ├── data_storage
│   │   ├── mongo_integration.ipynb
│   │   └── .ipynb_checkpoints
│   │       └── mongo_integration-checkpoint.ipynb
│   └── resources
│       └── data
│           ├── property_1.json
│           ├── property.json
│           └── .ipynb_checkpoints
│               └── property-checkpoint.json
├── README.md
├── requirements.txt
└── docker-compose.yaml

/rightmove/backend/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/rightmove/backend/app/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/rightmove/backend/app/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/rightmove/dashboard/streamlit/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/rightmove/backend/app/data_processing/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/rightmove/dashboard/streamlit/data_processing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/data_processing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rightmove/mlflow/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | psycopg2 3 | google-cloud-storage 4 | boto3 -------------------------------------------------------------------------------- /rightmove/dashboard/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | streamlit 3 | numpy 4 | requests 5 | google-cloud-storage 6 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: config 5 | data: 6 | mongo-url: "mongodb://mongodb:27017/" 7 | -------------------------------------------------------------------------------- /static/images/mlops_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/mlops_pipeline.png -------------------------------------------------------------------------------- /static/images/model_monitoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/model_monitoring.png -------------------------------------------------------------------------------- /static/images/scrapy_monitoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/scrapy_monitoring.png -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | pymongo 3 | scrapyd 4 | scrapyd-client 5 | beautifulsoup4 6 | psycopg2-binary -------------------------------------------------------------------------------- /static/images/Rightmove extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/Rightmove extraction.png -------------------------------------------------------------------------------- /static/images/model_training_pipeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/model_training_pipeline.png -------------------------------------------------------------------------------- /infrastructure/aws/variables.tf: -------------------------------------------------------------------------------- 1 | variable "db_password" { 2 | description = "The database admin password" 3 | type = string 4 | sensitive = true 5 | } 6 | -------------------------------------------------------------------------------- /static/images/Processing_pipeline_rightmove.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/Processing_pipeline_rightmove.png -------------------------------------------------------------------------------- /infrastructure/gcp/bucket.tf: -------------------------------------------------------------------------------- 1 | resource "google_storage_bucket" "rightmove-artifacts-ml" { 2 | name = "rightmove-artifacts-ml" 3 | location = "europe-west2" 4 | } 5 | -------------------------------------------------------------------------------- /rightmove/backend/app/models/coordinates.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ValidationError, validator 2 | 3 | 4 | class Coordinates(BaseModel): 5 | longitude: float 6 | latitude: float 7 | -------------------------------------------------------------------------------- /rightmove/backend/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | pydantic 3 | uvicorn 4 | scipy 5 | pandas 6 | numpy 7 | pymongo 8 | mlflow==2.10.2 9 | pytest 10 | psycopg2 11 | google-cloud-storage 12 | scikit-learn==1.3.2 -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/scrapyd.conf: -------------------------------------------------------------------------------- 1 | [scrapyd] 2 | bind_address= 0.0.0.0 3 | http_port = 6800 4 | eggs_dir = /scrapyd/eggs 5 | logs_dir = /scrapyd/logs 6 | items_dir = /scrapyd/items 7 | dbs_dir = /scrapyd/dbs -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='rightmove_scraper', 5 | version='0.1', 6 | package_dir={'': 'src'}, # Tells setuptools that packages are under src 7 | packages=find_packages(where='src'), 8 | ) -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/requirements.txt: -------------------------------------------------------------------------------- 1 | awscli 2 | requests 3 | beautifulsoup4 4 | pendulum 5 | apache-airflow 6 | pymongo 7 | scikit-learn 8 | psycopg2-binary 9 | pandas 10 | numpy 11 | mlflow 12 | apache-beam 13 | evidently 14 | scipy 15 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="data_processing", 5 | version="0.1.0", 6 | packages=find_packages(), 7 | description="Data processing module for Airflow pipelines", 8 | ) 9 | 
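--------------------------------------------------------------------------------
[Usage sketch, not a file from the repository] The Coordinates model in
/rightmove/backend/app/models/coordinates.py is the request body for the backend's
/walk_score endpoint. A minimal illustration of how pydantic validates it, assuming
it is run from rightmove/backend/ so the app package is importable:

    from pydantic import ValidationError
    from app.models.coordinates import Coordinates

    # Numeric strings are coerced to float by pydantic.
    coords = Coordinates(longitude="-0.1276", latitude=51.5072)
    print(coords.dict())  # {'longitude': -0.1276, 'latitude': 51.5072}

    # Non-numeric input is rejected with a ValidationError.
    try:
        Coordinates(longitude="not-a-number", latitude=51.5072)
    except ValidationError as exc:
        print(exc)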
-------------------------------------------------------------------------------- /rightmove/backend/app/models/property.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ValidationError, validator 2 | 3 | 4 | class Property(BaseModel): 5 | bedrooms: float 6 | bathrooms: float 7 | longitude: float 8 | latitude: float 9 | walk_score: float 10 | -------------------------------------------------------------------------------- /infrastructure/aws/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 4.16" 6 | } 7 | } 8 | 9 | required_version = ">= 1.2.0" 10 | } 11 | 12 | provider "aws" { 13 | region = "eu-west-2" 14 | } 15 | -------------------------------------------------------------------------------- /rightmove/backend/Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | FROM python:3.9 3 | 4 | # 5 | WORKDIR /code 6 | 7 | # 8 | COPY requirements.txt /code/requirements.txt 9 | 10 | # 11 | RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt 12 | 13 | # 14 | COPY app /code/app 15 | 16 | # 17 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class RightmoveScraperItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = rightmove_scraper.settings 8 | 9 | [deploy:development] 10 | url = http://localhost:6800/ 11 | project = rightmove_scraper 12 | 13 | 14 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapyd-deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name="rightmove_scraper", 7 | version="1.0", 8 | packages=find_packages(), 9 | entry_points={"scrapy": ["settings = rightmove_scraper.settings"]}, 10 | package_data={"rightmove_scraper": ["resources/data/*.csv"]}, 11 | ) 12 | -------------------------------------------------------------------------------- /infrastructure/aws/database.tf: -------------------------------------------------------------------------------- 1 | resource "aws_db_instance" "realestate-database" { 2 | # These fields are examples; modify them according to your existing resource's configuration 3 | allocated_storage = 20 4 | engine = "postgres" 5 | engine_version = "12.3" 6 | instance_class = "db.t3.micro" 7 | username = "postgres" 8 | password = var.db_password 9 | } 10 | 
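--------------------------------------------------------------------------------
[Usage sketch, not a file from the repository] /rightmove/backend/app/models/property.py
defines the request body for the backend's /predict endpoint, and /rightmove/backend/Dockerfile
serves the API with uvicorn on port 8000. A minimal request sketch, assuming the image is
running locally with port 8000 published and a model registered in MLflow as app/main.py
expects:

    import requests

    payload = {
        "bedrooms": 2,
        "bathrooms": 1,
        "longitude": -0.06,
        "latitude": 51.53,
        "walk_score": 85,
    }

    resp = requests.post("http://localhost:8000/predict", json=payload, timeout=30)
    resp.raise_for_status()
    print(resp.json())  # e.g. {"prediction": <estimated yearly rent>}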
--------------------------------------------------------------------------------
/rightmove/dashboard/Dockerfile:
--------------------------------------------------------------------------------
# app/Dockerfile

FROM python:3.9-slim

WORKDIR /code

COPY requirements.txt /code/requirements.txt

RUN pip3 install -r /code/requirements.txt

COPY streamlit /code/app

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

# The Streamlit entry script is the landing page at the root of the streamlit/ app
ENTRYPOINT ["streamlit", "run", "app/01_LandingPage.py", "--server.port=8501", "--server.address=0.0.0.0"]
--------------------------------------------------------------------------------
/rightmove/backend/app/models/pricing_category.py:
--------------------------------------------------------------------------------
from pydantic import BaseModel, validator


class PricingCategory(BaseModel):
    category: str

    # Optional: validator to provide a more specific error message.
    # Raise ValueError here; pydantic wraps it into a ValidationError for the caller.
    @validator("category")
    def check_category(cls, v):
        if v not in ["Cheap", "Average", "Expensive"]:
            raise ValueError('Pricing must be "Cheap", "Average", or "Expensive"')
        return v
--------------------------------------------------------------------------------
/rightmove/monitoring/config/grafana_datasources.yaml:
--------------------------------------------------------------------------------
# config file version
apiVersion: 1

# list of datasources to insert/update
# available in the database
datasources:
  - name: grafana-postgresql-datasource
    type: postgres
    access: proxy
    url: realestate-database.czkkjkojmucd.eu-west-2.rds.amazonaws.com:5432
    database: monitoring
    user: postgres
    secureJsonData:
      password: 'postgres'
    jsonData:
      sslmode: 'require'
--------------------------------------------------------------------------------
/infrastructure/gcp/main.tf:
--------------------------------------------------------------------------------
terraform {
  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "4.51.0"
    }
  }
}

provider "google" {
  credentials = file("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/credentials/airflow-service-account.json")

  project = "personal-projects-411616"
  region  = "europe-west2"
  zone    = "europe-west2-a"
}

#resource "google_compute_network" "vpc_network" {
#  name = "terraform-network"
#}
--------------------------------------------------------------------------------
/rightmove/mlflow/Dockerfile:
--------------------------------------------------------------------------------
# Start from a base image with Python installed
FROM python:3.9

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file into the container
COPY requirements.txt /app

# Install mlflow and dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Expose the default MLflow server port
EXPOSE 5001

# Start the MLflow server when the container starts
CMD mlflow server --backend-store-uri $MLFLOW_BACKEND_STORE_URI --default-artifact-root $MLFLOW_ARTIFACTS_DESTINATION --host 0.0.0.0 --port 5001
--------------------------------------------------------------------------------
/rightmove/data_ingestion/rightmove_scraper/Dockerfile:
-------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.8-slim 3 | 4 | # Copy the requirements file into the container at /usr/src/app 5 | COPY requirements.txt ./ 6 | 7 | # Install any needed packages specified in requirements.txt 8 | # (Assuming requirements.txt includes scrapy and scrapyd) 9 | RUN pip install --no-cache-dir -r requirements.txt 10 | 11 | RUN mkdir /etc/scrapyd 12 | RUN mkdir -p /scrapyd/logs 13 | 14 | # Make port 6800 available to the world outside this container 15 | # (scrapyd default port) 16 | EXPOSE 6800 17 | 18 | COPY . . 19 | 20 | # Run scrapyd when the container launches 21 | CMD ["scrapyd"] -------------------------------------------------------------------------------- /rightmove/monitoring/config/grafana_dashboards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'My Dashboards' # A friendly name for this provisioning configuration 5 | orgId: 1 # The ID of the Org in Grafana where you want to provision the dashboards 6 | folder: '' # The name of the folder where you want these dashboards to appear. Leave empty for the General folder. 7 | type: file # The type of the provider. In this case, 'file' for file-based provisioning. 8 | disableDeletion: false # Whether Grafana should delete dashboards not in the JSON files. 9 | updateIntervalSeconds: 10 # How often Grafana will scan for changed dashboard files. 10 | options: 11 | path: /var/lib/grafana/dashboards 12 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/scrapy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: scrapy-deployment 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: scrapy 10 | template: 11 | metadata: 12 | labels: 13 | app: scrapy 14 | spec: 15 | containers: 16 | - name: scrapy 17 | image: 18 | ports: 19 | - containerPort: 6800 20 | env: 21 | - name: MONGO_URL 22 | value: "mongodb://:27017/" 23 | 24 | --- 25 | 26 | apiVersion: v1 27 | kind: Service 28 | metadata: 29 | name: scrapy-service 30 | spec: 31 | type: NodePort 32 | selector: 33 | app: scrapy 34 | ports: 35 | - protocol: TCP 36 | port: 6800 37 | targetPort: 6800 -------------------------------------------------------------------------------- /rightmove/backend/fastapi.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: fastapi-deployment 5 | labels: 6 | app: fastapi 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: fastapi 12 | template: 13 | metadata: 14 | labels: 15 | app: fastapi 16 | spec: 17 | containers: 18 | - name: webapp 19 | image: alexgirardet123/fastapi:latest 20 | ports: 21 | - containerPort: 80 22 | env: 23 | - name: MONGO_DB_URL 24 | valueFrom: 25 | configMapKeyRef: 26 | name: mongo-config 27 | key: mongo-url 28 | 29 | --- 30 | 31 | apiVersion: v1 32 | kind: Service 33 | metadata: 34 | name: fastapi-service # End point to access fastapi 35 | spec: 36 | type: NodePort # External Service type 37 | selector: # Selects the pods to forward the requests to. Forwards to pods by their label. 38 | app: fastapi 39 | ports: 40 | - protocol: TCP 41 | port: 80 42 | targetPort: 80 # The port of the pods that belong to the service. 
The target port should be the same as container port 43 | nodePort: 30100 # Port to access the service from outside the cluster -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | 10 | # Pyenv 11 | rightmove_env 12 | 13 | # Packages # 14 | ############ 15 | # it's better to unpack these files and commit the raw source because packages can contain binary data 16 | *.7z 17 | *.dmg 18 | *.gz 19 | *.iso 20 | *.jar 21 | *.rar 22 | *.tar 23 | *.zip 24 | 25 | # Logs and databases # 26 | ###################### 27 | *.log 28 | *.sql 29 | *.sqlite 30 | 31 | # OS generated files # 32 | ###################### 33 | .DS_Store 34 | .DS_Store? 35 | ._* 36 | .Spotlight-V100 37 | .Trashes 38 | ehthumbs.db 39 | Thumbs.db 40 | 41 | # Editor directories and files # 42 | ################################ 43 | .idea 44 | *.swp 45 | *.swo 46 | *.sublime-workspace 47 | *.sublime-project 48 | .vscode/ 49 | *.code-workspace 50 | 51 | # build outputs # 52 | ################# 53 | bin/ 54 | obj/ 55 | out/ 56 | build/ 57 | dist/ 58 | *.dmg 59 | *.exe 60 | *.msi 61 | *.deb 62 | *.rpm 63 | *.tgz 64 | *.pkg 65 | 66 | # Dependencies # 67 | ################ 68 | # Node.js dependencies 69 | node_modules/ 70 | 71 | # Python # 72 | ########## 73 | # Byte-compiled / optimized / DLL files 74 | __pycache__/ 75 | *.py[cod] 76 | *$py.class 77 | *.egg-info 78 | *.benchmarks 79 | *.pytest_cache 80 | *.env 81 | -------------------------------------------------------------------------------- /rightmove/backend/tests/integration_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | 4 | # Define the URL of your batch prediction endpoint 5 | url = "http://127.0.0.1:8000/batch-predict" 6 | 7 | # Create a DataFrame with your test properties. This should match the structure expected by your API. 8 | # For example, if your Property model expects 'size' and 'location', your DataFrame should reflect that. 
9 | df = pd.read_csv( 10 | "gs://rightmove-artifacts-ml/data/2024-02-17-14-18-14/test.csv", index_col=0 11 | ) 12 | 13 | df = df[["bedrooms", "bathrooms", "longitude", "latitude", "walk_score"]] 14 | 15 | print(df.head()) 16 | 17 | # Convert the DataFrame to a list of dictionaries 18 | properties_list = df.to_dict("records") 19 | 20 | 21 | # Make a POST request to the batch prediction endpoint 22 | response = requests.post(url, json=properties_list) 23 | 24 | # Check the status code to ensure the request was successful 25 | assert response.status_code == 200 26 | 27 | # Convert the response to JSON and retrieve the predictions 28 | predictions = response.json().get("predictions") 29 | 30 | # Perform any additional checks you need on the predictions 31 | # For example, check the number of predictions matches the number of input properties 32 | assert len(predictions) == len(properties_list) 33 | 34 | print("Predictions:", predictions) 35 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:latest 2 | 3 | ENV AIRFLOW_HOME=/opt/airflow 4 | 5 | USER root 6 | RUN apt-get update -qq && apt-get install vim -qqq 7 | # git gcc g++ -qqq 8 | 9 | COPY requirements.txt . 10 | 11 | USER $AIRFLOW_UID 12 | RUN pip install --no-cache-dir -r requirements.txt 13 | 14 | USER root 15 | 16 | # Ref: https://airflow.apache.org/docs/docker-stack/recipes.html 17 | 18 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] 19 | 20 | ARG CLOUD_SDK_VERSION=322.0.0 21 | ENV GCLOUD_HOME=/home/google-cloud-sdk 22 | 23 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}" 24 | 25 | RUN apt-get update && apt-get install -y libpq-dev && rm -rf /var/lib/apt/lists/* 26 | 27 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \ 28 | && TMP_DIR="$(mktemp -d)" \ 29 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \ 30 | && mkdir -p "${GCLOUD_HOME}" \ 31 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \ 32 | && "${GCLOUD_HOME}/install.sh" \ 33 | --bash-completion=false \ 34 | --path-update=false \ 35 | --usage-reporting=false \ 36 | --quiet \ 37 | && rm -rf "${TMP_DIR}" \ 38 | && gcloud --version 39 | 40 | WORKDIR $AIRFLOW_HOME 41 | USER $AIRFLOW_UID -------------------------------------------------------------------------------- /notebooks/data_processing/process_boundaries.py: -------------------------------------------------------------------------------- 1 | from pyrosm import OSM, get_data 2 | import pandas as pd 3 | import requests 4 | import os 5 | pd.options.mode.chained_assignment = None 6 | import tempfile 7 | 8 | 9 | def process_geodata(): 10 | # file_path = f"../resources/boundary_date/data/{file_name}" 11 | file_name = "greater-london-latest.osm.pbf" 12 | file_path = "/Users/alexander.girardet/Code/Personal/projects/rightmove_project/data/greater-london-latest.osm.pbf" 13 | print(f"processing: {file_name}") 14 | 15 | osm = OSM(file_path) 16 | 17 | boundary_name = file_name.split(".osm")[0] 18 | 19 | boundary_df = osm.get_boundaries() 20 | boundary_df = boundary_df.rename(columns={"id": "boundary_id"}) 21 | 22 | output_filename = f'geodata/{boundary_name}.geojson' # Specifying the path in writable storage 23 | boundary_df.to_file(output_filename, driver='GeoJSON') 24 | print(f"Loaded 
{output_filename}") 25 | 26 | # already_processed = os.listdir("geodata") 27 | # processed_names = [name.split(".geojson")[0] for name in already_processed] 28 | # files = os.listdir("../resources/boundary_date/data") 29 | 30 | # for file_name in files: 31 | # name = file_name.split(".osm.pbf")[0] 32 | # if name not in processed_names: 33 | # if name != "scotland-latest": 34 | # try: 35 | # process_geodata(file_name) 36 | # except: 37 | # print(f"Failed to process: {name}") 38 | process_geodata() -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | from pymongo import MongoClient 10 | import os 11 | import datetime 12 | 13 | # MONGO_URL = "mongodb://mongodb:27017/" 14 | MONGO_URI = os.environ.get("MONGO_URI") 15 | 16 | 17 | class RightmoveScraperPipeline: 18 | def __init__(self): 19 | self.batch = [] 20 | 21 | self.client = MongoClient(MONGO_URI) 22 | db = self.client["rightmove"] 23 | self.collection = db["properties"] 24 | 25 | def process_item(self, item, spider): 26 | """ 27 | Sending items to MongoDB in batches to reduce I/O operations 28 | """ 29 | 30 | item["extraction_timestamp"] = datetime.datetime.utcnow().timestamp() 31 | 32 | self.batch.append(item) 33 | 34 | if len(self.batch) >= 50: # Batch size of file 35 | self.collection.insert_many(self.batch) 36 | self.batch = [] 37 | 38 | return item 39 | 40 | def close_spider(self, spider): 41 | print("SPIDER CLOSING...") 42 | 43 | if len(self.batch) > 0: 44 | self.collection.insert_many(self.batch) # Send remaining items 45 | 46 | self.client.close() 47 | -------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/pages/05_WordCloud.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import requests 3 | import pydeck as pdk 4 | from wordcloud import WordCloud 5 | import pandas as pd 6 | import geopandas 7 | import matplotlib.pyplot as plt 8 | import json 9 | 10 | 11 | @st.cache_data 12 | def load_data(): 13 | df = pd.read_parquet( 14 | "gs://rightmove-artifacts-ml/streamlit_data/2024-02-27-12-32-07/data.parquet" 15 | ) # 16 | return df 17 | 18 | 19 | df = load_data() 20 | 21 | 22 | def generate_wordcloud(text): 23 | wordcloud = WordCloud(width=800, height=400, background_color="white").generate( 24 | text 25 | ) 26 | fig, ax = plt.subplots(figsize=(10, 5)) 27 | ax.imshow(wordcloud) 28 | ax.axis("off") 29 | return fig 30 | 31 | 32 | def fetch_corpus(category, df): 33 | category_df = df[df["price_category"] == category] 34 | combined_text = " ".join(category_df["text"].tolist()) 35 | return combined_text 36 | 37 | 38 | st.title("Wordcloud Generator") 39 | 40 | # Category selection 41 | category = st.selectbox("Select a category:", ("Expensive", "Cheap", "Average")) 42 | 43 | corpus = fetch_corpus(category, df) 44 | 45 | # Implement word filter mechanism to accept multiple words 46 | filter_words = st.text_input( 47 | "Enter words to filter out (separated by commas) and regenerate wordcloud:" 48 | ) 49 | 50 | if filter_words: 51 | # Split 
the filter_words by commas, strip spaces, and convert to lowercase for case-insensitive comparison 52 | filter_words_list = [word.strip().lower() for word in filter_words.split(",")] 53 | # Filter out the words 54 | filtered_corpus = " ".join( 55 | [word for word in corpus.split() if word.lower() not in filter_words_list] 56 | ) 57 | else: 58 | # If no filter words are provided, use the original corpus 59 | filtered_corpus = corpus 60 | 61 | # Display the wordcloud 62 | st.write("Generated Wordcloud:") 63 | fig = generate_wordcloud(filtered_corpus) # Generate wordcloud with filtered corpus 64 | st.pyplot(fig) # Display the figure 65 | -------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/data_processing/processing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | class DataPreprocessor: 5 | def __init__(self, with_text=False, with_binary=False): 6 | self.with_text = with_text 7 | self.with_binary = with_binary 8 | 9 | @staticmethod 10 | def convert_frequencies(x): 11 | frequency = x["frequency"] 12 | price = x["amount"] 13 | 14 | if frequency == "monthly": 15 | return price * 12 16 | elif frequency == "weekly": 17 | return (price / 7) * 365 18 | elif frequency == "daily": 19 | return price * 365 20 | elif frequency == "quarterly": 21 | return price * 4 22 | else: # Yearly 23 | return price 24 | 25 | @staticmethod 26 | def remove_anomalies(df, percentile_threshold=0.99): 27 | percentile_thresholds = df[["price", "bedrooms", "bathrooms"]].quantile( 28 | percentile_threshold 29 | ) 30 | 31 | filtered_df = df[ 32 | (df["price"] <= percentile_thresholds["price"]) 33 | & (df["bedrooms"] <= percentile_thresholds["bedrooms"]) 34 | & (df["bathrooms"] <= percentile_thresholds["bathrooms"]) 35 | ] 36 | return filtered_df 37 | 38 | @staticmethod 39 | def merge_text(x): 40 | summary, feature_list = x[0], x[1] 41 | feature_list_joined = ", ".join(feature_list) if feature_list else "" 42 | return feature_list_joined + " , " + summary 43 | 44 | @staticmethod 45 | def label_price(price): 46 | if price < 8000: 47 | return "Cheap" 48 | elif price < 20_000: 49 | return "Average" 50 | else: 51 | return "Expensive" 52 | 53 | def preprocess_properties(self, df): 54 | df["longitude"] = df["location"].apply(lambda x: x["longitude"]) 55 | df["latitude"] = df["location"].apply(lambda x: x["latitude"]) 56 | df["price"] = df["price"].apply(self.convert_frequencies) 57 | 58 | if self.with_text: 59 | df["text"] = df[["summary", "feature_list"]].apply(self.merge_text, axis=1) 60 | if self.with_binary: 61 | df["commercial"] = df["commercial"].apply(lambda x: 1 if x else 0) 62 | df["development"] = df["development"].apply(lambda x: 1 if x else 0) 63 | df["students"] = df["students"].apply(lambda x: 1 if x else 0) 64 | 65 | df["price_category"] = df["price"].apply(self.label_price) 66 | df["listingUpdateReason"] = df["listingUpdate"].apply( 67 | lambda x: x["listingUpdateReason"] 68 | ) 69 | df["firstVisibleDate"] = pd.to_datetime(df["firstVisibleDate"], utc=True) 70 | df = self.remove_anomalies(df) 71 | df = df.drop(columns=["location", "_id", "listingUpdate"]) 72 | return df 73 | 74 | @staticmethod 75 | def preprocess_walk_score(df): 76 | df = df.drop_duplicates(subset=["id"]) 77 | df["walk_score"] = df["scores"].apply(lambda x: int(x["walk_score"])) 78 | df = df.drop(columns=["_id", "scores"]) 79 | return df 
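--------------------------------------------------------------------------------
[Usage sketch, not a file from the repository] The DataPreprocessor above expects raw
property documents as they come out of MongoDB, with nested price, location and
listingUpdate fields. A minimal illustration with a single made-up record, assuming it
is run from rightmove/dashboard/streamlit/ so the data_processing package is importable:

    import pandas as pd
    from data_processing.processing import DataPreprocessor

    raw = pd.DataFrame([{
        "_id": "mongo-object-id",                      # placeholder; dropped during preprocessing
        "id": 123456,
        "price": {"amount": 2500, "frequency": "monthly"},
        "location": {"longitude": -0.06, "latitude": 51.53},
        "bedrooms": 2,
        "bathrooms": 1,
        "summary": "Bright two bed flat close to the station.",
        "feature_list": ["Balcony", "Dishwasher"],
        "listingUpdate": {"listingUpdateReason": "new"},
        "firstVisibleDate": "2024-02-01T10:00:00Z",
    }])

    pre = DataPreprocessor(with_text=True, with_binary=False)
    clean = pre.preprocess_properties(raw)

    # The monthly price is annualised (2500 * 12 = 30000), which label_price
    # classifies as "Expensive".
    print(clean[["price", "price_category", "longitude", "latitude", "text"]])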
-------------------------------------------------------------------------------- /rightmove/backend/app/data_processing/DataPreprocessor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class DataPreprocessor: 5 | def __init__(self): 6 | pass 7 | 8 | @staticmethod 9 | def convert_frequencies(x): 10 | frequency = x["frequency"] 11 | price = x["amount"] 12 | 13 | if frequency == "monthly": 14 | return price * 12 15 | elif frequency == "weekly": 16 | return (price / 7) * 365 17 | elif frequency == "daily": 18 | return price * 365 19 | elif frequency == "quarterly": 20 | return price * 4 21 | else: # Yearly 22 | return price 23 | 24 | @staticmethod 25 | def remove_anomalies(df, percentile_threshold=0.99): 26 | percentile_thresholds = df[["price", "bedrooms", "bathrooms"]].quantile( 27 | percentile_threshold 28 | ) 29 | 30 | # Filter the dataset to remove anomalies above the 98th percentile 31 | filtered_df = df[ 32 | (df["price"] <= percentile_thresholds["price"]) 33 | & (df["bedrooms"] <= percentile_thresholds["bedrooms"]) 34 | & (df["bathrooms"] <= percentile_thresholds["bathrooms"]) 35 | ] 36 | return filtered_df 37 | 38 | @staticmethod 39 | def merge_text(x): 40 | summary, feature_list = x[0], x[1] 41 | feature_list_joined = ", ".join(feature_list) if feature_list else "" 42 | return feature_list_joined + " , " + summary 43 | 44 | def preprocess_properties_with_binary(self, df): 45 | df["longitude"] = df["location"].apply(lambda x: x["longitude"]) 46 | df["latitude"] = df["location"].apply(lambda x: x["latitude"]) 47 | df = df.drop(columns=["location"]) 48 | df["price"] = df["price"].apply(self.convert_frequencies) 49 | df["commercial"] = df["commercial"].apply(lambda x: 1 if x else 0) 50 | df["development"] = df["development"].apply(lambda x: 1 if x else 0) 51 | df["students"] = df["students"].apply(lambda x: 1 if x else 0) 52 | df["text"] = df[["summary", "feature_list"]].apply(self.merge_text, axis=1) 53 | df = self.remove_anomalies(df) 54 | return df 55 | 56 | @staticmethod 57 | def label_price(price): 58 | if price < 8000: 59 | return "Cheap" 60 | elif price < 20_000: 61 | return "Average" 62 | else: 63 | return "Expensive" 64 | 65 | def preprocess_properties(self, df): 66 | df["longitude"] = df["location"].apply(lambda x: x["longitude"]) 67 | df["latitude"] = df["location"].apply(lambda x: x["latitude"]) 68 | df = df.drop(columns=["location", "_id"]) 69 | df["price"] = df["price"].apply(self.convert_frequencies) 70 | df["text"] = df[["summary", "feature_list"]].apply(self.merge_text, axis=1) 71 | df["price_category"] = df["price"].apply(self.label_price) 72 | df["listingUpdateReason"] = df["listingUpdate"].apply( 73 | lambda x: x["listingUpdateReason"] 74 | ) 75 | df["firstVisibleDate"] = pd.to_datetime(df["firstVisibleDate"], utc=True) 76 | df = self.remove_anomalies(df) 77 | return df 78 | 79 | @staticmethod 80 | def preprocess_walk_score(df): 81 | df = df.drop_duplicates(subset=["id"]) 82 | df["walk_score"] = df["scores"].apply(lambda x: int(x["walk_score"])) 83 | df = df.drop(columns=["_id", "scores"]) 84 | return df 85 | 86 | @staticmethod 87 | def merge_dataframes(df, walk_df): 88 | merged_df = df.merge(walk_df, on="id", how="left") 89 | 90 | return merged_df 91 | -------------------------------------------------------------------------------- /rightmove/backend/app/data_processing/walk_score_processing.py: -------------------------------------------------------------------------------- 1 | from 
sklearn.neighbors import BallTree 2 | import math 3 | 4 | from math import radians 5 | import pandas as pd 6 | import numpy as np 7 | 8 | GCS_PARQUET_URL = ( 9 | "https://storage.googleapis.com/rightmove-resources-public/UK_pois.parquet" 10 | ) 11 | WALK_SCORES_COLLECTION = "walk_scores" 12 | 13 | 14 | class WalkScoreProcessor: 15 | def __init__(self): 16 | self.earth_radius = 6371000 # Earth radius in metres 17 | self.pois_df = pd.read_parquet(GCS_PARQUET_URL) 18 | self.ball_tree = BallTree( 19 | self.pois_df[["lon_rad", "lat_rad"]].values, metric="haversine" 20 | ) # What is the ball tree doing? 21 | self.amenity_weights = { 22 | "grocery": [3], 23 | "restaurants": [ 24 | 0.75, 25 | 0.45, 26 | 0.25, 27 | 0.25, 28 | 0.225, 29 | 0.225, 30 | 0.225, 31 | 0.225, 32 | 0.2, 33 | 0.2, 34 | ], 35 | "shopping": [0.5, 0.45, 0.4, 0.35, 0.3], 36 | "coffee": [1.25, 0.75], 37 | "banks": [1], 38 | "parks": [1], 39 | "schools": [1], 40 | "books": [1], 41 | "entertainment": [1], 42 | } 43 | 44 | def process_results_df(self, distance_series, pois_df): 45 | results_df = pd.DataFrame(distance_series) 46 | 47 | results_df = results_df.join(pois_df["amenities"], how="left") 48 | 49 | results_df["distance_in_metres"] = results_df["distance"].apply( 50 | lambda x: x * self.earth_radius 51 | ) 52 | 53 | results_df["distance_decayed"] = results_df["distance_in_metres"].apply( 54 | lambda x: float(self.distance_decay(x)) 55 | ) 56 | 57 | return results_df 58 | 59 | def distance_decay(self, distance): 60 | dist = distance / 1000 61 | score = math.e ** ((-5.0 * (dist / 4)) ** 5.0) 62 | return score 63 | 64 | def calculate_amenity_walk_score(self, property_distance_df, amenity, weights): 65 | k = len(weights) 66 | weight_array = np.array(weights) 67 | 68 | dist_array = ( 69 | property_distance_df[property_distance_df["amenities"] == amenity] 70 | .iloc[0:k]["distance_decayed"] 71 | .values 72 | ) 73 | dist_array_padded = np.pad( 74 | dist_array, (0, weight_array.size - dist_array.size), "constant" 75 | ) 76 | 77 | scores_array = dist_array_padded * weight_array 78 | 79 | amenity_score = scores_array.sum() 80 | 81 | return amenity_score 82 | 83 | def calculuate_walk_score(self, longitude, latitude): 84 | radian_longitude = radians(longitude) 85 | radian_latitude = radians(latitude) 86 | 87 | k = 100 # Maximum number of amenities to return 88 | 89 | distances, indices = self.ball_tree.query( 90 | [[radian_longitude, radian_latitude]], k=k, return_distance=True 91 | ) 92 | 93 | dist_series = pd.Series(distances[0], index=indices[0], name="distance") 94 | 95 | results_df = self.process_results_df(dist_series, self.pois_df) 96 | 97 | scores_dict = {} 98 | 99 | for key, values in self.amenity_weights.items(): 100 | amenity_score = self.calculate_amenity_walk_score(results_df, key, values) 101 | 102 | scores_dict[key] = amenity_score 103 | 104 | return scores_dict 105 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for rightmove_scraper project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = "rightmove_scraper" 11 | 12 | SPIDER_MODULES = ["rightmove_scraper.spiders"] 13 | NEWSPIDER_MODULE = "rightmove_scraper.spiders" 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | # USER_AGENT = "rightmove_scraper (+http://www.yourdomain.com)" 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | # CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | # DOWNLOAD_DELAY = 3 29 | # The download delay setting will honor only one of: 30 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | # CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | # COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | # TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | # DEFAULT_REQUEST_HEADERS = { 41 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 42 | # "Accept-Language": "en", 43 | # } 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | SPIDER_MIDDLEWARES = { 48 | "rightmove_scraper.middlewares.RightmoveScraperSpiderMiddleware": 543, 49 | } 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | # DOWNLOADER_MIDDLEWARES = { 54 | # "rightmove_scraper.middlewares.RightmoveScraperDownloaderMiddleware": 543, 55 | # } 56 | 57 | # Enable or disable extensions 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 59 | # EXTENSIONS = { 60 | # "scrapy.extensions.telnet.TelnetConsole": None, 61 | # } 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | ITEM_PIPELINES = { 66 | "rightmove_scraper.pipelines.RightmoveScraperPipeline": 300, 67 | } 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 71 | # AUTOTHROTTLE_ENABLED = True 72 | # The initial download delay 73 | # AUTOTHROTTLE_START_DELAY = 5 74 | # The maximum download delay to be set in case of high latencies 75 | # AUTOTHROTTLE_MAX_DELAY = 60 76 | # The average number of requests Scrapy should be sending in parallel to 77 | # each remote server 78 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 79 | # Enable showing throttling stats for every response received: 80 | # AUTOTHROTTLE_DEBUG = False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | # HTTPCACHE_ENABLED = True 85 | # HTTPCACHE_EXPIRATION_SECS = 0 86 | # HTTPCACHE_DIR = "httpcache" 87 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 88 | # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 89 | 90 | # LOG_ENABLED = True 91 | # LOG_LEVEL = 'INFO' 92 | # LOG_FILE = None 93 | 94 | # Set settings whose default value is 
deprecated to a future-proof value 95 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 96 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 97 | FEED_EXPORT_ENCODING = "utf-8" 98 | -------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/pages/04_MachineLearning.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import streamlit as st 4 | import requests 5 | import pandas as pd 6 | from streamlit_folium import folium_static 7 | import folium 8 | 9 | # Function to send the HTTP POST request 10 | def predict_property_value(features): 11 | url = "http://localhost:8000/predict" # Replace with your actual endpoint URL 12 | response = requests.post(url, json=features) 13 | return response.json() 14 | 15 | def get_walk_score(coordinates): 16 | print(coordinates) 17 | url = "http://localhost:8000/walk_score" # Replace with your actual endpoint URL 18 | response = requests.post(url, json=coordinates) 19 | return response.json() 20 | 21 | 22 | # Streamlit user interface setup 23 | st.title("Property Value Prediction") 24 | 25 | st.write( 26 | "Enter the property details below and choose to either generate a Walk Score based on the location or input it manually to see the impact on the property value prediction.") 27 | 28 | # Input fields for property features 29 | with st.form(key='property_details'): 30 | bedrooms = st.number_input("Number of Bedrooms", min_value=1, value=3) 31 | bathrooms = st.number_input("Number of Bathrooms", min_value=1, value=2) 32 | latitude = st.number_input("Latitude", value=51.53) 33 | longitude = st.number_input("Longitude", value=-0.06) 34 | 35 | # Instructions for Walk Score 36 | st.write("You can either generate a Walk Score based on the coordinates or input a Walk Score manually.") 37 | 38 | if st.form_submit_button("Generate Walk Score"): 39 | coordinates = {"longitude": longitude, "latitude": latitude} 40 | walk_score_generated = round(get_walk_score(coordinates)['walk_score'], 2) 41 | st.session_state.generated_walk_score = walk_score_generated 42 | st.success(f"Generated Walk Score: {walk_score_generated}") 43 | walk_score = st.number_input("Or Input Walk Score", min_value=0, value=50, key="manual_walk_score") 44 | 45 | # Use generated walk score if available, else use manual input 46 | final_walk_score = st.session_state.get('generated_walk_score', walk_score) 47 | 48 | submitted = st.form_submit_button("Confirm Inputs") 49 | if submitted: 50 | st.write("### Inputs for Prediction") 51 | st.write(f"- Number of Bedrooms: {bedrooms}") 52 | st.write(f"- Number of Bathrooms: {bathrooms}") 53 | st.write(f"- Latitude: {latitude}") 54 | st.write(f"- Longitude: {longitude}") 55 | st.write(f"- Walk Score: {final_walk_score}") 56 | st.write("Use the 'Launch Prediction' button below to predict the property value based on these inputs.") 57 | 58 | # Button to launch prediction after reviewing inputs 59 | if st.button("Launch Prediction"): 60 | features = { 61 | "bedrooms": bedrooms, 62 | "bathrooms": bathrooms, 63 | "walk_score": final_walk_score, 64 | "latitude": latitude, 65 | "longitude": longitude 66 | } 67 | prediction = predict_property_value(features) 68 | monthly_value = prediction["prediction"] / 12 69 | formatted_value = f"£{monthly_value:,.2f} per month" # Formats the number with comma as thousands separator and two decimal places 70 | # st.success(f"Predicted Property Value: {formatted_value}") 71 | st.success(f"A property with 
{features['bedrooms']} bedrooms and {features['bathrooms']} bathrooms, located at ({features['latitude']:,.2f}, {features['longitude']:,.2f}), with a Walk Score of {features['walk_score']}, is estimated to be worth {formatted_value}") 72 | st.subheader("Select Property Location on Map") 73 | m = folium.Map(location=[latitude, longitude], zoom_start=11) # Default location, change as needed 74 | folium.Marker(location=[latitude, longitude], tooltip="Move this marker to your property location", draggable=True).add_to(m) 75 | folium_static(m) 76 | 77 | # Button to make prediction # Convert from yearly to monthly 78 | 79 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/data_processing/data_processor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class DataPreprocessor: 5 | def __init__(self, with_text=False, with_binary=False): 6 | self.with_text = with_text 7 | self.with_binary = with_binary 8 | 9 | @staticmethod 10 | def convert_frequencies(x): 11 | frequency = x["frequency"] 12 | price = x["amount"] 13 | 14 | if frequency == "monthly": 15 | return price * 12 16 | elif frequency == "weekly": 17 | return (price / 7) * 365 18 | elif frequency == "daily": 19 | return price * 365 20 | elif frequency == "quarterly": 21 | return price * 4 22 | else: # Yearly 23 | return price 24 | 25 | @staticmethod 26 | def remove_anomalies(df, percentile_threshold=0.99): 27 | percentile_thresholds = df[["price", "bedrooms", "bathrooms"]].quantile( 28 | percentile_threshold 29 | ) 30 | 31 | # Filter the dataset to remove anomalies above the 98th percentile 32 | filtered_df = df[ 33 | (df["price"] <= percentile_thresholds["price"]) 34 | & (df["bedrooms"] <= percentile_thresholds["bedrooms"]) 35 | & (df["bathrooms"] <= percentile_thresholds["bathrooms"]) 36 | ] 37 | return filtered_df 38 | 39 | @staticmethod 40 | def merge_text(x): 41 | summary, feature_list = x[0], x[1] 42 | feature_list_joined = ", ".join(feature_list) if feature_list else "" 43 | return feature_list_joined + " , " + summary 44 | 45 | # def preprocess_properties_with_binary(self, df): 46 | # df['longitude'] = df['location'].apply(lambda x: x['longitude']) 47 | # df['latitude'] = df['location'].apply(lambda x: x['latitude']) 48 | # df = df.drop(columns=['location']) 49 | # df['price'] = df['price'].apply(self.convert_frequencies) 50 | # df['commercial'] = df['commercial'].apply(lambda x: 1 if x else 0) 51 | # df['development'] = df['development'].apply(lambda x: 1 if x else 0) 52 | # df['students'] = df['students'].apply(lambda x: 1 if x else 0) 53 | # df['text'] = df[['summary', 'feature_list']].apply(self.merge_text, axis=1) 54 | # df = self.remove_anomalies(df) 55 | # return df 56 | 57 | @staticmethod 58 | def label_price(price): 59 | if price < 8000: 60 | return "Cheap" 61 | elif price < 20_000: 62 | return "Average" 63 | else: 64 | return "Expensive" 65 | 66 | def preprocess_properties(self, df): 67 | df["longitude"] = df["location"].apply(lambda x: x["longitude"]) 68 | df["latitude"] = df["location"].apply(lambda x: x["latitude"]) 69 | df["price"] = df["price"].apply(self.convert_frequencies) 70 | 71 | if self.with_text: 72 | df["text"] = df[["summary", "feature_list"]].apply(self.merge_text, axis=1) 73 | if self.with_binary: 74 | df["commercial"] = df["commercial"].apply(lambda x: 1 if x else 0) 75 | df["development"] = df["development"].apply(lambda x: 1 if x else 0) 76 | 
df["students"] = df["students"].apply(lambda x: 1 if x else 0) 77 | 78 | df["price_category"] = df["price"].apply(self.label_price) 79 | df["listingUpdateReason"] = df["listingUpdate"].apply( 80 | lambda x: x["listingUpdateReason"] 81 | ) 82 | df["firstVisibleDate"] = pd.to_datetime(df["firstVisibleDate"], utc=True) 83 | df = self.remove_anomalies(df) 84 | df = df.drop(columns=["location", "_id", "listingUpdate"]) 85 | return df 86 | 87 | @staticmethod 88 | def preprocess_walk_score(df): 89 | df = df.drop_duplicates(subset=["id"]) 90 | df["walk_score"] = df["scores"].apply(lambda x: int(x["walk_score"])) 91 | df = df.drop(columns=["_id", "scores"]) 92 | return df 93 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/visualization_data.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import pandas as pd 3 | from pymongo import MongoClient 4 | import os 5 | from google.cloud import storage 6 | import logging 7 | from io import BytesIO 8 | 9 | from airflow import DAG 10 | from airflow.operators.python_operator import PythonOperator 11 | from airflow.operators.dummy_operator import DummyOperator 12 | from rightmove.data_processing.data_processor import DataPreprocessor 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | 16 | client = storage.Client() 17 | bucket = client.get_bucket("rightmove-artifacts-ml") 18 | 19 | MONGO_URI = os.environ.get("MONGO_URI") 20 | def load_data_from_mongo(collection_name="properties", fields=None): 21 | logging.info("Loading data from mongo") 22 | 23 | client = MongoClient(MONGO_URI) # Hosted with Docker 24 | 25 | db = client["rightmove"] 26 | 27 | collection = db[collection_name] 28 | 29 | query = {} 30 | 31 | data = collection.find(query, fields) 32 | 33 | df = pd.DataFrame(list(data)) 34 | 35 | if len(df) == 0: 36 | raise ValueError(f"No data found in collection {collection_name}") 37 | else: 38 | logging.info(f"Data loaded from collection {collection_name}") 39 | 40 | return df 41 | 42 | def generate_foldername(): 43 | now = datetime.now() 44 | return now.strftime("%Y-%m-%d-%H-%M-%S") 45 | 46 | 47 | def load_df_to_gcs_parquet(df, dest_path): 48 | # Create an in-memory bytes buffer 49 | buffer = BytesIO() 50 | 51 | try: 52 | # Save the dataframe to the buffer in parquet format 53 | df.to_parquet(buffer, index=False) 54 | 55 | # Move the buffer's pointer to the beginning of the file 56 | buffer.seek(0) 57 | 58 | # Create a blob in the specified GCS bucket path 59 | blob = bucket.blob(dest_path) 60 | 61 | # Upload the buffer content as a parquet file 62 | blob.upload_from_file(buffer, content_type='application/octet-stream') 63 | 64 | logging.info(f"Data uploaded to {dest_path} in Parquet format") 65 | return True 66 | except Exception as e: 67 | logging.error(f"Failed to upload data to {dest_path}: {e}") 68 | return False 69 | 70 | def preprocess_data(property_df, walkscore_df): 71 | preprocessor = DataPreprocessor(with_text=True, with_binary=False) 72 | 73 | property_df = preprocessor.preprocess_properties(property_df) 74 | walk_df = preprocessor.preprocess_walk_score(walkscore_df) 75 | 76 | df = property_df.merge(walk_df, on="id", how="left") 77 | 78 | logging.info("Data preprocessed") 79 | 80 | return df 81 | 82 | def fetch_preprocess_data(): 83 | property_df = load_data_from_mongo( 84 | collection_name="properties", 85 | fields={ 86 | "id": 1, 87 | "price.amount": 1, 88 | 
"price.frequency": 1, 89 | "firstVisibleDate": 1, 90 | "bedrooms": 1, 91 | "bathrooms": 1, 92 | "listingUpdate": 1, 93 | "location": 1, 94 | "summary": 1, 95 | "feature_list": 1, 96 | }, 97 | ) 98 | walkscore_df = load_data_from_mongo( 99 | collection_name="walk_scores", fields={"id": 1, "scores": 1} 100 | ) 101 | 102 | df = preprocess_data(property_df, walkscore_df) 103 | 104 | dest_path = f"streamlit_data/{generate_foldername()}/data.parquet" 105 | load_df_to_gcs_parquet(df, dest_path) 106 | 107 | logging.info(f"Data saved to {dest_path}") 108 | 109 | return df 110 | def load_data(): 111 | df = fetch_preprocess_data() 112 | logging.info("Data loaded") 113 | 114 | default_args = { 115 | "owner": "airflow_app", 116 | "depends_on_past": False, 117 | "email_on_failure": False, 118 | "email_on_retry": False, 119 | "retries": 1, 120 | "retry_delay": timedelta(minutes=5), 121 | } 122 | 123 | dag = DAG( 124 | "streamlit_data_extraction", 125 | default_args=default_args, 126 | description="DAG for extracting data for Streamlit app", 127 | schedule_interval=timedelta(days=1), 128 | start_date=datetime(2023, 1, 1), 129 | catchup=False, 130 | max_active_runs=1, 131 | ) 132 | 133 | start_task = DummyOperator(task_id="start", dag=dag) 134 | 135 | load_data_task = PythonOperator( 136 | task_id="load_data", python_callable=load_data, dag=dag 137 | ) 138 | 139 | end_task = DummyOperator(task_id="end", dag=dag) 140 | 141 | start_task>> load_data_task >> end_task 142 | -------------------------------------------------------------------------------- /rightmove/backend/app/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from http.client import HTTPException 3 | from typing import Union 4 | 5 | from fastapi import FastAPI 6 | import psycopg2 7 | from psycopg2.extras import RealDictCursor 8 | 9 | from pydantic import BaseModel 10 | from app.data_processing.walk_score_processing import WalkScoreProcessor 11 | # from data_processing.walk_score_processing import WalkScoreProcessor 12 | from sklearn.neighbors import BallTree 13 | import json 14 | 15 | from math import radians 16 | from typing import List 17 | import mlflow.pyfunc 18 | 19 | import pandas as pd 20 | 21 | import math 22 | import numpy as np 23 | 24 | # from dotenv import load_dotenv 25 | # load_dotenv("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/.env") 26 | 27 | 28 | class Property(BaseModel): 29 | bedrooms: float 30 | bathrooms: float 31 | longitude: float 32 | latitude: float 33 | walk_score: float 34 | 35 | 36 | class Coordinates(BaseModel): 37 | longitude: float 38 | latitude: float 39 | 40 | 41 | MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI") 42 | 43 | app = FastAPI() 44 | 45 | model_name = "Random Forest Walk Score" 46 | model_stage = "Staging" 47 | model_uri = f"models:/{model_name}/{model_stage}" 48 | model = mlflow.pyfunc.load_model(model_uri) 49 | 50 | MONGO_URI = os.environ.get("MONGO_URI") 51 | 52 | GCS_PARQUET_URL = ( 53 | "https://storage.googleapis.com/rightmove-resources-public/UK_pois.parquet" 54 | ) 55 | WALK_SCORES_COLLECTION = "walk_scores" 56 | 57 | BATCH_SIZE = 50 58 | 59 | 60 | @app.post("/predict") 61 | async def predict_rent(input_property: Property): 62 | try: 63 | features_df = pd.DataFrame(input_property.dict(), index=[0]) 64 | prediction = model.predict(features_df) 65 | return {"prediction": prediction[0]} 66 | 67 | except Exception as e: 68 | raise HTTPException() 69 | 70 | 71 | @app.post("/batch-predict") 72 | async def 
batch_predict_rent(input_properties: List[Property]): 73 | try: 74 | properties_dicts = [property.dict() for property in input_properties] 75 | features_df = pd.DataFrame(properties_dicts) 76 | predictions = model.predict(features_df) 77 | 78 | return {"predictions": predictions.tolist()} 79 | 80 | except Exception as e: 81 | raise HTTPException(status_code=400, detail=str(e)) 82 | 83 | 84 | @app.post("/walk_score") 85 | async def generate_walk_score(input_coordinates: Coordinates): 86 | try: 87 | walk_score_processor = WalkScoreProcessor() 88 | input_coordinates = input_coordinates.dict() 89 | longitude = input_coordinates["longitude"] 90 | latitude = input_coordinates["latitude"] 91 | scores_dict = walk_score_processor.calculuate_walk_score(longitude, latitude) 92 | walk_score = sum(scores_dict.values()) * 6.67 93 | return {"walk_score": walk_score} 94 | 95 | except Exception as e: 96 | raise HTTPException() 97 | 98 | 99 | SQL_QUERY = """ 100 | SELECT d.dataset_source 101 | FROM datasets d 102 | INNER JOIN ( 103 | SELECT i.source_id AS dataset_id 104 | FROM inputs i 105 | INNER JOIN ( 106 | SELECT run_id 107 | FROM model_versions 108 | ORDER BY version DESC 109 | LIMIT 1 110 | ) mv ON i.destination_id = mv.run_id 111 | WHERE i.source_type = 'DATASET' 112 | LIMIT 1 113 | ) subquery ON d.dataset_uuid = subquery.dataset_id; 114 | """ 115 | def fix_database_uri(uri: str) -> str: 116 | # Check if URI contains '+psycopg2' and remove it 117 | if "+psycopg2" in uri: 118 | uri = uri.replace("+psycopg2", "") 119 | return uri 120 | @app.get("/latest-dataset") 121 | def get_latest_dataset_source(): 122 | try: 123 | PG_URI = fix_database_uri(MLFLOW_TRACKING_URI) 124 | 125 | with psycopg2.connect(PG_URI, cursor_factory=RealDictCursor) as conn: 126 | with conn.cursor() as cur: 127 | cur.execute(SQL_QUERY) 128 | 129 | result = cur.fetchone() 130 | if result: 131 | dataset_source_info = json.loads(result["dataset_source"]) 132 | uri = dataset_source_info.get("uri", "URI not found") 133 | return {"uri": uri} 134 | else: 135 | return {"error": "No dataset source found for the latest model version."} 136 | 137 | except Exception as e: 138 | return {"error": str(e)} 139 | 140 | 141 | 142 | @app.get("/") 143 | def read_root(): 144 | return {"Hello": "World"} 145 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/rightmove_ingest.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import time 3 | import requests 4 | import logging 5 | 6 | from airflow import DAG 7 | from airflow.operators.python_operator import PythonOperator 8 | from airflow.operators.dummy_operator import DummyOperator 9 | 10 | from rightmove.data_processing.rightmove_processing import run 11 | 12 | 13 | SCRAPYD_ENDPOINT = "http://scrapy_app:6800" 14 | SPIDER = "rightmove" 15 | PROJECT = "scraper" 16 | 17 | 18 | def start_spider(): 19 | payload = f"project={PROJECT}&spider={SPIDER}" 20 | headers = {"Content-Type": "application/x-www-form-urlencoded"} 21 | 22 | url = SCRAPYD_ENDPOINT + "/schedule.json" 23 | 24 | response = requests.request("POST", url, headers=headers, data=payload) 25 | 26 | if response.status_code == 200: 27 | logging.info("Request successful") 28 | if response.json()["status"] == "ok": 29 | logging.info("Spider started successfully") 30 | job_id = response.json()["jobid"] 31 | return job_id 32 | else: 33 | logging.info(response.text) 34 | raise 
ValueError("Spider has not been started") 35 | else: 36 | print(response.text) 37 | raise ValueError("Request failed") 38 | 39 | 40 | def cancel_spider(**kwargs): 41 | job_id = kwargs["ti"].xcom_pull(task_ids="start_spider") 42 | 43 | print(f"Cancelling job id: {job_id}") 44 | 45 | payload = f"project={PROJECT}&job={job_id}" 46 | headers = {"Content-Type": "application/x-www-form-urlencoded"} 47 | 48 | url = SCRAPYD_ENDPOINT + "/cancel.json" 49 | 50 | response = requests.request("POST", url, headers=headers, data=payload) 51 | 52 | print(response.text) 53 | if response.status_code == 200: 54 | print("Request successful") 55 | if response.json()["status"] == "ok": 56 | print("Job cancelled successfully") 57 | else: 58 | print(response.text) 59 | else: 60 | print(response.text) 61 | raise ValueError("Request failed spider has not been canceled") 62 | 63 | return "Success" 64 | 65 | 66 | def repeated_requests(**kwargs): 67 | end_time = datetime.now() + timedelta(seconds=900) # 15 minute scraping session 68 | 69 | # url = f"http://scrapyapp:6800/listjobs.json?project={PROJECT}" 70 | 71 | url = SCRAPYD_ENDPOINT + "/listjobs.json?project=" + PROJECT 72 | 73 | payload = {} 74 | headers = {} 75 | 76 | job_id = kwargs["ti"].xcom_pull(task_ids="start_spider") 77 | 78 | while datetime.now() < end_time: 79 | response = requests.request("GET", url, headers=headers, data=payload) 80 | 81 | print(f"Response code: {response.status_code}") 82 | if response.status_code == 200: 83 | print("Request successful") 84 | if response.json()["status"] == "ok": 85 | print("Scrapy status is okay") 86 | 87 | running_jobs = response.json()["running"] 88 | 89 | if job_id in [job["id"] for job in running_jobs]: 90 | print("Job is running") 91 | 92 | elif response.json()["status"] == "error": 93 | print("Scrapy status is error") 94 | print(response.json()["message"]) 95 | raise ValueError("Scrapy status is error") 96 | else: 97 | print(response.text) 98 | 99 | time.sleep(30) # wait for 30 seconds before next request 100 | return "Success" 101 | 102 | 103 | default_args = { 104 | "owner": "airflow_app", 105 | "depends_on_past": False, 106 | "email_on_failure": False, 107 | "email_on_retry": False, 108 | "retries": 1, 109 | "retry_delay": timedelta(minutes=5), 110 | } 111 | 112 | dag = DAG( 113 | "scrape_rightmove", 114 | default_args=default_args, 115 | description="DAG for making scraping rightmove", 116 | schedule_interval=timedelta(days=1), 117 | start_date=datetime(2023, 1, 1), 118 | catchup=False, 119 | max_active_runs=1, 120 | ) 121 | 122 | start_task = DummyOperator(task_id="start", dag=dag) 123 | 124 | start_spider_task = PythonOperator( 125 | task_id="start_spider", python_callable=start_spider, dag=dag 126 | ) 127 | 128 | periodic_requests = PythonOperator( 129 | task_id="periodic_requests", 130 | python_callable=repeated_requests, 131 | provide_context=True, 132 | dag=dag, 133 | ) 134 | 135 | cancel_spider_task = PythonOperator( 136 | task_id="cancel_spider", 137 | python_callable=cancel_spider, 138 | provide_context=True, 139 | dag=dag, 140 | ) 141 | 142 | run_beam_pipeline = PythonOperator( 143 | task_id="run_beam_pipeline", python_callable=run, dag=dag 144 | ) 145 | 146 | 147 | end_task = DummyOperator(task_id="end", dag=dag) 148 | 149 | ( 150 | start_task 151 | >> start_spider_task 152 | >> periodic_requests 153 | >> cancel_spider_task 154 | >> run_beam_pipeline 155 | >> end_task 156 | ) 157 | -------------------------------------------------------------------------------- 
/notebooks/data_ingestion/fetch_outcodes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "7995756c-fe5f-48d9-8512-d797b73e5157", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from selenium import webdriver\n", 11 | "from selenium.webdriver.common.keys import Keys\n", 12 | "from selenium.webdriver.common.by import By\n", 13 | "\n", 14 | "import pandas as pd\n", 15 | "import re" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "f9e3c5c5-bcad-4d4b-bdc9-0a43adbfd444", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Function that returns right ID for a given outcode\n", 26 | "def get_outcode_value(postcode, driver):\n", 27 | " driver.get(\"https://www.rightmove.co.uk/property-to-rent.html\")\n", 28 | " input_box = driver.find_element(By.XPATH, '//*[@id=\"searchLocation\"]')\n", 29 | " input_box.send_keys(postcode)\n", 30 | " search_box = driver.find_element(By.XPATH, '//*[@id=\"search\"]')\n", 31 | " search_box.click()\n", 32 | " \n", 33 | " try:\n", 34 | " submit = driver.find_element(By.ID, \"submit\")\n", 35 | " submit.click()\n", 36 | " url = driver.current_url\n", 37 | " outcode_value = re.findall(\"(?<=locationIdentifier=OUTCODE%5E)(.*)(?=&insId)\", url)[0]\n", 38 | " except:\n", 39 | " header_title = driver.find_element(By.ID, \"headerTitle\")\n", 40 | " outcode_value = None\n", 41 | " \n", 42 | " \n", 43 | " return outcode_value\n", 44 | "\n", 45 | "# Function to fetch currently loaded outcodes in case selenium crashed\n", 46 | "def fetch_current_rightmove_outcodes(cursor):\n", 47 | " cursor.execute(\"SELECT outcode FROM rightmove_outcodes\")\n", 48 | " fetched_outcodes = cursor.fetchall()\n", 49 | " outcode_list = [x[0] for x in fetched_outcodes]\n", 50 | "\n", 51 | " return outcode_list" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "adcf7674-49d6-4c70-991c-6f9463e11782", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "def fetch_outcodes(df, cursor, driver):\n", 62 | " for row in df.itertuples():\n", 63 | " outcode = row.postcode\n", 64 | " index = row.Index\n", 65 | " \n", 66 | " if outcode not in current_outcodes:\n", 67 | " outcode_value = get_outcode_value(outcode, driver)\n", 68 | " \n", 69 | " if outcode_value is not None:\n", 70 | "\n", 71 | " transaction = \"INSERT IGNORE INTO rightmove_outcodes(outcode, rightmove_code) VALUES ('{}', {});\".format(\n", 72 | " outcode, outcode_value)\n", 73 | "\n", 74 | " cursor.execute(transaction)\n", 75 | "\n", 76 | " con.commit()\n", 77 | " else:\n", 78 | " pass\n", 79 | " else:\n", 80 | " pass" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "76829d13-b123-4c27-88b5-c516e5fac8a1", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "def run():\n", 91 | " driver = webdriver.Firefox()\n", 92 | "\n", 93 | " current_outcodes = fetch_current_rightmove_outcodes(driver)\n", 94 | "\n", 95 | " # Load UK outcode csv file into pandas\n", 96 | " df = pd.read_csv(\"../../data/outcodes.csv\", index_col=0)\n", 97 | "\n", 98 | " fetch_outcodes(df, cursor, driver) " 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "281a6c10-cf5d-488a-82d5-f5ef567b2179", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "import os\n", 109 | "gecko_path = os.path.expanduser('~/Downloads/geckodriver')\n", 110 
| "driver = webdriver.Firefox(executable_path=gecko_path)\n", 111 | "\n", 112 | "driver.get(\"https://www.rightmove.co.uk/property-to-rent.html\")\n", 113 | "\n", 114 | "input_box = driver.find_element(By.XPATH, '//*[@id=\"searchLocation\"]')\n", 115 | "input_box.send_keys(postcode)\n", 116 | "search_box = driver.find_element(By.XPATH, '//*[@id=\"search\"]')\n", 117 | "search_box.click()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "a842cd6d-42df-47f4-89a0-a43f33c429d9", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3 (ipykernel)", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 3 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython3", 145 | "version": "3.11.4" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 5 150 | } 151 | -------------------------------------------------------------------------------- /notebooks/data_ingestion/.ipynb_checkpoints/fetch_outcodes-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "7995756c-fe5f-48d9-8512-d797b73e5157", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from selenium import webdriver\n", 11 | "from selenium.webdriver.common.keys import Keys\n", 12 | "from selenium.webdriver.common.by import By\n", 13 | "\n", 14 | "import pandas as pd\n", 15 | "import re" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "f9e3c5c5-bcad-4d4b-bdc9-0a43adbfd444", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Function that returns right ID for a given outcode\n", 26 | "def get_outcode_value(postcode, driver):\n", 27 | " driver.get(\"https://www.rightmove.co.uk/property-to-rent.html\")\n", 28 | " input_box = driver.find_element(By.XPATH, '//*[@id=\"searchLocation\"]')\n", 29 | " input_box.send_keys(postcode)\n", 30 | " search_box = driver.find_element(By.XPATH, '//*[@id=\"search\"]')\n", 31 | " search_box.click()\n", 32 | " \n", 33 | " try:\n", 34 | " submit = driver.find_element(By.ID, \"submit\")\n", 35 | " submit.click()\n", 36 | " url = driver.current_url\n", 37 | " outcode_value = re.findall(\"(?<=locationIdentifier=OUTCODE%5E)(.*)(?=&insId)\", url)[0]\n", 38 | " except:\n", 39 | " header_title = driver.find_element(By.ID, \"headerTitle\")\n", 40 | " outcode_value = None\n", 41 | " \n", 42 | " \n", 43 | " return outcode_value\n", 44 | "\n", 45 | "# Function to fetch currently loaded outcodes in case selenium crashed\n", 46 | "def fetch_current_rightmove_outcodes(cursor):\n", 47 | " cursor.execute(\"SELECT outcode FROM rightmove_outcodes\")\n", 48 | " fetched_outcodes = cursor.fetchall()\n", 49 | " outcode_list = [x[0] for x in fetched_outcodes]\n", 50 | "\n", 51 | " return outcode_list" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "adcf7674-49d6-4c70-991c-6f9463e11782", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "def fetch_outcodes(df, cursor, driver):\n", 62 | " for row in df.itertuples():\n", 63 | " outcode = row.postcode\n", 64 | " index = row.Index\n", 65 | " \n", 66 | " if outcode not in 
current_outcodes:\n", 67 | " outcode_value = get_outcode_value(outcode, driver)\n", 68 | " \n", 69 | " if outcode_value is not None:\n", 70 | "\n", 71 | " transaction = \"INSERT IGNORE INTO rightmove_outcodes(outcode, rightmove_code) VALUES ('{}', {});\".format(\n", 72 | " outcode, outcode_value)\n", 73 | "\n", 74 | " cursor.execute(transaction)\n", 75 | "\n", 76 | " con.commit()\n", 77 | " else:\n", 78 | " pass\n", 79 | " else:\n", 80 | " pass" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "76829d13-b123-4c27-88b5-c516e5fac8a1", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "def run():\n", 91 | " driver = webdriver.Firefox()\n", 92 | "\n", 93 | " current_outcodes = fetch_current_rightmove_outcodes(driver)\n", 94 | "\n", 95 | " # Load UK outcode csv file into pandas\n", 96 | " df = pd.read_csv(\"../../data/outcodes.csv\", index_col=0)\n", 97 | "\n", 98 | " fetch_outcodes(df, cursor, driver) " 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "281a6c10-cf5d-488a-82d5-f5ef567b2179", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "import os\n", 109 | "gecko_path = os.path.expanduser('~/Downloads/geckodriver')\n", 110 | "driver = webdriver.Firefox(executable_path=gecko_path)\n", 111 | "\n", 112 | "driver.get(\"https://www.rightmove.co.uk/property-to-rent.html\")\n", 113 | "\n", 114 | "input_box = driver.find_element(By.XPATH, '//*[@id=\"searchLocation\"]')\n", 115 | "input_box.send_keys(postcode)\n", 116 | "search_box = driver.find_element(By.XPATH, '//*[@id=\"search\"]')\n", 117 | "search_box.click()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "a842cd6d-42df-47f4-89a0-a43f33c429d9", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3 (ipykernel)", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 3 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython3", 145 | "version": "3.11.4" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 5 150 | } 151 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/spiders/rightmove.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import os 3 | import csv 4 | import requests 5 | import io 6 | 7 | import logging 8 | 9 | logging.basicConfig(level=logging.DEBUG) 10 | logger = logging.getLogger(__name__) 11 | 12 | from bs4 import BeautifulSoup 13 | 14 | from pymongo import MongoClient 15 | 16 | # MONGO_URL = "mongodb://mongodb:27017/" 17 | MONGO_URI = os.environ.get("MONGO_URI") 18 | 19 | 20 | class RightmoveSpider(scrapy.Spider): 21 | name = "rightmove" 22 | 23 | def __init__(self, *args, **kwargs): 24 | self.headers = { 25 | "Accept": "application/json, text/plain, */*", 26 | "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8", 27 | "Connection": "keep-alive", 28 | "Referer": "https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&index=24&propertyTypes=&includeLetAgreed=false&mustHave=&dontShow=&furnishTypes=&keywords=", 29 | "Sec-Fetch-Dest": "empty", 30 | "Sec-Fetch-Mode": "cors", 31 | 
"Sec-Fetch-Site": "same-origin", 32 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36", 33 | "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"', 34 | "sec-ch-ua-mobile": "?0", 35 | "sec-ch-ua-platform": '"macOS"', 36 | } 37 | 38 | self.rightmove_ids = self.get_property_ids() 39 | 40 | print(self.rightmove_ids) 41 | 42 | print("Number of IDs: ", len(self.rightmove_ids)) 43 | 44 | logger.info(f"Fetching new MongoDB data from {MONGO_URI}...") 45 | 46 | self.fetched_outcodes = self.get_outcodes() 47 | 48 | def start_requests(self): 49 | for codes in self.fetched_outcodes: 50 | rightmove_code = codes[1] 51 | postcode = codes[0] 52 | for index_jump in range( 53 | 0, 100, 25 54 | ): # Adjusting to 100 so I can have some extra values to test with 55 | url = f"https://www.rightmove.co.uk/api/_search?locationIdentifier=OUTCODE%5E{rightmove_code}&numberOfPropertiesPerPage=24&radius=10.0&sortType=6&index={index_jump}&includeLetAgreed=false&viewType=LIST&channel=RENT&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false" 56 | 57 | yield scrapy.Request( 58 | method="GET", url=url, headers=self.headers, callback=self.parse 59 | ) 60 | 61 | def parse(self, response): 62 | listings = response.json()["properties"] 63 | for listing in listings: 64 | property_id = listing["id"] 65 | 66 | if property_id not in self.rightmove_ids: 67 | property_url = f"https://www.rightmove.co.uk/properties/{property_id}" 68 | 69 | yield scrapy.Request( 70 | method="GET", 71 | url=property_url, 72 | headers=self.headers, 73 | callback=self.parse_property, 74 | meta={"item": listing}, 75 | ) 76 | else: 77 | print("Already loaded in") 78 | 79 | def parse_property(self, response): 80 | soup = BeautifulSoup(response.text, "lxml") 81 | 82 | item = response.meta["item"] 83 | 84 | # Get feature list 85 | try: 86 | uls = soup.find("ul", {"class": "_1uI3IvdF5sIuBtRIvKrreQ"}) 87 | features = uls.find_all("li") 88 | feature_list = [feature.text for feature in features] 89 | except: 90 | feature_list = None 91 | 92 | # Get full summary 93 | summary = soup.find("div", {"class": "OD0O7FWw1TjbTD4sdRi1_"}).div.text 94 | 95 | # Assign content to item 96 | item["feature_list"] = feature_list 97 | item["summary"] = summary 98 | 99 | yield item 100 | 101 | def get_outcodes(self): 102 | # URL of the CSV file in the public GCS bucket 103 | csv_url = "https://storage.googleapis.com/rightmove-resources-public/rightmove_outcodes.csv" 104 | 105 | # Download the CSV file 106 | response = requests.get(csv_url) 107 | if response.status_code == 200: 108 | # Convert binary data to a text stream 109 | csv_text = io.StringIO(response.content.decode("utf-8")) 110 | 111 | # Read CSV data 112 | reader = csv.reader(csv_text) 113 | outcodes = list(reader) 114 | outcodes = outcodes[1:] # Skip header row 115 | outcodes = [(outcode[1], outcode[2]) for outcode in outcodes] 116 | return outcodes 117 | else: 118 | print("Failed to download CSV file") 119 | return [] 120 | 121 | def get_property_ids(self) -> list: 122 | client = MongoClient(MONGO_URI) 123 | # client = MongoClient("mongodb://localhost:27017/") 124 | db = client["rightmove"] 125 | # Access collection 126 | collection = db["properties"] 127 | 128 | # logging.info("Connected to MongoDB") 129 | 130 | rightmove_ids = collection.find({}, {"id": 1}) 131 | 132 | # Convert the result to a list of IDs 133 | ids = [doc["id"] for doc in rightmove_ids] 134 | 135 | client.close() 136 | 137 | return ids 138 | 
-------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/01_LandingPage.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import requests 3 | import pydeck as pdk 4 | from wordcloud import WordCloud 5 | import pandas as pd 6 | from datetime import datetime, timedelta 7 | import geopandas 8 | import matplotlib.pyplot as plt 9 | import json 10 | import seaborn as sns 11 | import plotly.express as px 12 | import os 13 | 14 | import logging 15 | 16 | logging.basicConfig(level=logging.INFO) 17 | 18 | from dotenv import load_dotenv 19 | load_dotenv("/.env") 20 | 21 | MONGO_URI = os.environ.get("MONGO_URI") 22 | 23 | @st.cache_data 24 | def load_data(): 25 | df = pd.read_parquet( 26 | "gs://rightmove-artifacts-ml/streamlit_data/2024-02-27-12-32-07/data.parquet" 27 | ) 28 | 29 | df["monthly_price"] = df["price"] / 12 30 | df = df.dropna() 31 | return df 32 | 33 | 34 | def get_recents(subset, days): 35 | new_df = subset[subset["listingUpdateReason"] == "new"] 36 | 37 | today = pd.Timestamp(datetime.now(), tz="UTC") 38 | 39 | # Calculate the start date of the last week (7 days ago) 40 | date_start = today - timedelta(days=days) 41 | 42 | new_df["firstVisibleDate"] = pd.to_datetime(new_df["firstVisibleDate"], utc=True) 43 | 44 | # Corrected filtering to use new_df instead of df 45 | in_between_rows = new_df[ 46 | (new_df["firstVisibleDate"] > date_start) 47 | & (new_df["firstVisibleDate"] <= today) 48 | ] 49 | 50 | # Get the total number of rows 51 | total_rows = len(in_between_rows) 52 | return total_rows 53 | 54 | 55 | def plot_bedrooms_distribution(df): 56 | max_bedrooms = df["bedrooms"].max() 57 | fig = px.histogram(df, x="bedrooms", title="Distribution of Bedrooms") 58 | fig.update_layout( 59 | xaxis=dict(title="Number of Bedrooms", tickmode="linear", dtick=1), 60 | yaxis_title="Number of Properties", 61 | plot_bgcolor="rgba(0,0,0,0)", 62 | ) 63 | return fig 64 | 65 | 66 | def plot_bathrooms_distribution(df): 67 | # Determine the maximum number of bathrooms to set appropriate bins 68 | max_bathrooms = df["bathrooms"].max() 69 | fig = px.histogram(df, x="bathrooms", title="Distribution of Bathrooms") 70 | fig.update_layout( 71 | xaxis=dict(title="Number of Bathrooms", tickmode="linear", dtick=1), 72 | yaxis_title="Number of Properties", 73 | plot_bgcolor="rgba(0,0,0,0)", 74 | ) 75 | return fig 76 | 77 | 78 | def plot_price_density(df): 79 | fig = px.histogram( 80 | df, x="monthly_price", title="Distribution of Monthly Rental Prices" 81 | ) 82 | fig.update_layout( 83 | xaxis_title="Rental Price", 84 | yaxis_title="Number of Properties", 85 | plot_bgcolor="rgba(0,0,0,0)", 86 | ) 87 | return fig 88 | 89 | 90 | df = load_data() 91 | 92 | min_price, max_price = st.sidebar.slider( 93 | "Select a monthly rental price range:", 94 | min_value=int(df["monthly_price"].min()), # Minimum value for the slider 95 | max_value=int(df["monthly_price"].max()), # Maximum value for the slider 96 | value=( 97 | int(df["monthly_price"].min()), 98 | int(df["monthly_price"].max()), 99 | ), # Initial range (min, max) 100 | ) 101 | 102 | subset = df[(df["monthly_price"] >= min_price) & (df["monthly_price"] <= max_price)] 103 | 104 | # Streamlit UI 105 | col1, col2, col3 = st.columns(3) 106 | 107 | # Calculate properties added since last week 108 | properties_last_week = get_recents(subset, 8) # Last 7 days 109 | # Display metric in the first column, restrict to 2 decimal places 110 | 
col1.metric(label="Properties Added Since Last Week", value=f"{properties_last_week}") 111 | 112 | # Calculate properties added since yesterday 113 | properties_yesterday = get_recents(subset, 2) # Last 1 day 114 | # Display metric in the second column, restrict to 2 decimal places 115 | col2.metric(label="Properties Added Since Yesterday", value=f"{properties_yesterday}") 116 | 117 | # Calculate the total number of properties 118 | total_properties = len(subset) 119 | # Display metric in the third column, restrict to 2 decimal places 120 | col3.metric(label="Total Properties", value=f"{total_properties}") 121 | 122 | st.header("Property Distribution Map") 123 | layer = pdk.Layer( 124 | "HexagonLayer", # `type` positional argument is here 125 | subset[["longitude", "latitude"]], # `data` positional argument is here 126 | get_position=["longitude", "latitude"], 127 | auto_highlight=True, 128 | elevation_scale=50, 129 | pickable=True, 130 | elevation_range=[0, 3000], 131 | extruded=True, 132 | coverage=1, 133 | ) 134 | 135 | # Set the viewport location 136 | view_state = pdk.ViewState( 137 | longitude=-1.415, 138 | latitude=52.2323, 139 | zoom=6, 140 | min_zoom=5, 141 | max_zoom=15, 142 | pitch=40.5, 143 | bearing=-27.36, 144 | ) 145 | 146 | # Combine everything and render a viewport 147 | r = pdk.Deck(layers=[layer], initial_view_state=view_state) 148 | st.info( 149 | "The map displays the distribution of properties based on their location. The higher the concentration of properties, the higher the elevation." 150 | ) 151 | st.pydeck_chart(r) 152 | 153 | st.header("Histogram and Density Plots of Property Features") 154 | 155 | st.info( 156 | "The following plots provide a visual representation of the distribution of property features such as bedrooms, bathrooms, and rental prices." 
157 | ) 158 | 159 | col1, col2 = st.columns(2) 160 | with col1: 161 | st.plotly_chart(plot_bedrooms_distribution(subset), use_container_width=True) 162 | with col2: 163 | st.plotly_chart(plot_bathrooms_distribution(subset), use_container_width=True) 164 | 165 | # Density plot for price 166 | st.plotly_chart(plot_price_density(subset), use_container_width=True) -------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/pages/03_WalkScore.py: -------------------------------------------------------------------------------- 1 | import geopandas 2 | import streamlit as st 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import math 6 | 7 | 8 | @st.cache_data 9 | def load_geo_data(): 10 | # Load your GeoPandas DataFrame here 11 | # For example: gdf = gpd.read_file('your_file_path.shp') 12 | # Returning an example gdf, replace this with your actual data loading 13 | gdf = geopandas.read_file( 14 | "/Users/alexander.girardet/Code/Personal/projects/rightmove_project/notebooks/serving/london_borough_stats.geojson" 15 | ) 16 | return gdf 17 | 18 | 19 | def plot_geo_data(gdf, column): 20 | # Create the figure and axis with a larger size for better visibility 21 | fig, ax = plt.subplots(figsize=(14, 8)) 22 | 23 | # Plot the GeoDataFrame with a more appealing color map and adjust the legend 24 | gdf.plot( 25 | column=column, 26 | cmap="viridis", 27 | legend=True, 28 | ax=ax, 29 | legend_kwds={ 30 | "label": f"{reversed_options[column]}", 31 | "orientation": "horizontal", 32 | }, 33 | ) 34 | 35 | # Adjust the figure layout to accommodate the legend and ensure no clipping 36 | plt.tight_layout() 37 | 38 | return fig 39 | 40 | 41 | def distance_decay(distance): 42 | dist = distance / 1000 # Convert distance to kilometers 43 | score = math.e ** ((-5.0 * (dist / 4)) ** 5.0) 44 | return score 45 | 46 | 47 | def plot_distance_decay(): 48 | # Generate distances from 0 to 2000 meters 49 | distances = np.linspace(0, 2000, 500) 50 | scores = np.array([distance_decay(d) for d in distances]) 51 | 52 | # Plotting the decay of distance 53 | plt.figure(figsize=(10, 6)) 54 | plt.plot(distances, scores, label="Distance Decay", color="blue") 55 | plt.xlabel("Distance (meters)") 56 | plt.ylabel("Score") 57 | plt.title("Distance Decay Effect on Score") 58 | plt.grid(True) 59 | plt.xlim(0, 2000) # Limit x-axis to 2000 meters 60 | plt.legend() 61 | plt.tight_layout() 62 | return plt 63 | 64 | 65 | def calculate_amenity_walk_score(distances, amenity_weights): 66 | total_score = 0 67 | for amenity, distance in distances.items(): 68 | decayed_distance = distance_decay(distance) 69 | weights = amenity_weights.get( 70 | amenity, [1] 71 | ) # Default weight if amenity not found 72 | # Assume the first weight for simplicity, could be adapted for multiple distances per amenity 73 | amenity_score = decayed_distance * weights[0] 74 | total_score += amenity_score 75 | return total_score 76 | 77 | 78 | # def get_walk_score(subset): 79 | # return subset['walk_score'].mean() 80 | # 81 | # # walk_score = get_walk_score(subset) 82 | # # Display metric in the third column, restrict to 2 decimal places 83 | # col3.metric(label="Average Walk Score", value=f"{walk_score:.2f}") 84 | 85 | amenity_weights = { 86 | "grocery": [3], 87 | "restaurants": [3], 88 | "shopping": [2], 89 | "coffee": [2], 90 | "banks": [1], 91 | "parks": [1], 92 | "schools": [1], 93 | "books": [1], 94 | "entertainment": [1], 95 | } 96 | 97 | # Streamlit app setup for interactive walk score explanation 98 | 
st.title("Interactive Walk Score Explanation") 99 | 100 | st.write( 101 | """ 102 | This application demonstrates how the walk score for a property is calculated based on the distances to various amenities. 103 | Walk score is a measure of how friendly an area is to walking with a score from 0 to 100, where higher scores indicate better walkability. 104 | """ 105 | ) 106 | 107 | st.header("Walk Score Visualization") 108 | 109 | st.write( 110 | """ 111 | In visualizing the walk score we consider the average price of properties, the walk score, and the property count. This could provide an indication 112 | of the relationship between the walk score and the average price of properties in a given area. Additionally, the property count could provide an 113 | indication of the demand, and supply for properties in a given area. Logically, the higher the walk score, the higher the density of properties in 114 | a given area, and the higher the average price of properties in a given area. 115 | """ 116 | ) 117 | 118 | options = { 119 | "Price": "avg_price", 120 | "Walk Score": "mean_walk_score", 121 | "Property Count": "property_count", 122 | } 123 | 124 | reversed_options = {value: key for key, value in options.items()} 125 | 126 | # Use the dictionary keys as the display labels and get the selected option value 127 | selected_label = st.selectbox("Choose attribute to visualize:", options.keys()) 128 | 129 | option_value = options[selected_label] 130 | 131 | gdf = load_geo_data() 132 | 133 | # Display the plot in Streamlit 134 | st.pyplot(plot_geo_data(gdf, option_value)) 135 | 136 | st.header("Distance Decay Visualization") 137 | st.write( 138 | "This plot shows the decay of scores with increasing distance for a single amenity. It illustrates how closer amenities contribute more significantly to the walk score." 139 | ) 140 | fig = plot_distance_decay() 141 | st.pyplot(fig) 142 | 143 | st.header("Customize Your Walk Score") 144 | st.write( 145 | "Adjust the sliders below to simulate distances to different amenities and calculate a simplified walk score." 
146 | ) 147 | 148 | # Example of creating sliders for different amenities (simplified version) 149 | 150 | distances = {} 151 | for amenity in amenity_weights.keys(): 152 | distance = st.slider(f"Distance to nearest {amenity} (meters)", 0, 2000, 500, 50) 153 | distances[amenity] = distance 154 | 155 | total_walk_score = calculate_amenity_walk_score(distances, amenity_weights) 156 | 157 | walk_score = total_walk_score * 6.67 158 | 159 | st.metric("Total Walk Score", walk_score) 160 | -------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/pages/02_Price.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import pydeck as pdk 7 | 8 | 9 | @st.cache_data 10 | def load_data(): 11 | df = pd.read_parquet( 12 | "gs://rightmove-artifacts-ml/streamlit_data/2024-02-27-12-32-07/data.parquet" 13 | ) 14 | df["monthly_price"] = df["price"] / 12 15 | return df 16 | 17 | 18 | df = load_data() 19 | # Streamlit page setup 20 | st.title("Bedroom, Bathroom, and Location Relationship with Price") 21 | 22 | import plotly.express as px 23 | 24 | 25 | def get_average_price(df): 26 | avg_price = df["monthly_price"].mean() 27 | return avg_price 28 | 29 | 30 | def get_average_bedrooms(df): 31 | avg_bedrooms = df["bedrooms"].mean() 32 | return avg_bedrooms 33 | 34 | 35 | def get_average_bathrooms(df): 36 | avg_bathrooms = df["bathrooms"].mean() 37 | return avg_bathrooms 38 | 39 | 40 | def plot_price_by_bedrooms(df): 41 | fig = px.box( 42 | df, 43 | x="bedrooms", 44 | y="monthly_price", 45 | title="Rental Price Distribution by Number of bedrooms", 46 | ) 47 | fig.update_layout( 48 | xaxis=dict(title="Number of Bedrooms"), 49 | yaxis=dict(title="Rental Price"), 50 | plot_bgcolor="rgba(0,0,0,0)", 51 | title_x=0.5, 52 | ) 53 | return fig 54 | 55 | 56 | def plot_price_by_bathrooms(df): 57 | fig = px.box( 58 | df, 59 | x="bathrooms", 60 | y="monthly_price", 61 | title="Rental Price Distribution by Number of Bathrooms", 62 | ) 63 | fig.update_layout( 64 | xaxis=dict(title="Number of Bathrooms"), 65 | yaxis=dict(title="Rental Price"), 66 | plot_bgcolor="rgba(0,0,0,0)", 67 | title_x=0.5, 68 | ) 69 | 70 | return fig 71 | 72 | 73 | st.sidebar.title("Filters") 74 | min_price, max_price = st.sidebar.slider( 75 | "Select Rental Price Range", 76 | min_value=int(df["monthly_price"].min()), 77 | max_value=int(df["monthly_price"].max()), 78 | value=(int(df["monthly_price"].min()), int(df["monthly_price"].max())), 79 | ) 80 | 81 | # Filtering the DataFrame based on the selected price range 82 | filtered_df = df[ 83 | (df["monthly_price"] >= min_price) & (df["monthly_price"] <= max_price) 84 | ] 85 | 86 | col1, col2, col3 = st.columns(3) 87 | with col1: 88 | st.metric( 89 | label="Average Monthly Price", value=f"{get_average_price(filtered_df):.2f}" 90 | ) 91 | # Average Bedrooms and Bathroom 92 | with col2: 93 | st.metric( 94 | label="Average Number of Bathrooms", 95 | value=f"{get_average_bathrooms(filtered_df):.2f}", 96 | ) 97 | 98 | with col3: 99 | st.metric( 100 | label="Average Number of Bedrooms", 101 | value=f"{get_average_bedrooms(filtered_df):.2f}", 102 | ) 103 | 104 | st.plotly_chart(plot_price_by_bathrooms(filtered_df), use_container_width=True) 105 | st.plotly_chart(plot_price_by_bedrooms(filtered_df), use_container_width=True) 106 | 107 | max_rental_price = df["monthly_price"].max() 108 | 109 | # 
Assuming you've already grouped your data as needed or if you're using individual points, 110 | # you can directly use the rental_price for elevation. For a true mean aggregation, you'd need 111 | # to aggregate your data by the hexagon/bin locations, which requires additional preprocessing. 112 | 113 | # For color, normalize the rental_price to get a value between 0 and 255 for the color scale 114 | df["color_value"] = (df["monthly_price"] / max_rental_price) * 255 115 | df["color_value"] = df["color_value"].astype( 116 | int 117 | ) # Ensure it's an integer for color coding 118 | 119 | 120 | # Function to create the heatmap 121 | st.info( 122 | "The following map shows the distribution of rental prices in the selected area. The elevation of the hexagons represents the mean rental price of properties within each hexagon." 123 | ) 124 | @st.cache_resource 125 | def create_hexagon_map( 126 | dataframe, 127 | lat_col="latitude", 128 | lon_col="longitude", 129 | value_col="monthly_price", 130 | radius=200, 131 | ): 132 | """Create a hexagon map where the elevation represents the mean rental price of properties within each hexagon. 133 | 134 | Args: 135 | dataframe (pd.DataFrame): The dataframe containing the data. 136 | lat_col (str): Column name for latitude values. 137 | lon_col (str): Column name for longitude values. 138 | value_col (str): Column name for the values to average (mean rental price). 139 | radius (int): Radius of the hexagons in meters. 140 | 141 | Returns: 142 | pydeck.Deck: A pydeck Deck object ready to be displayed. 143 | """ 144 | # Aggregate data by hexagon 145 | layer = pdk.Layer( 146 | "HexagonLayer", 147 | dataframe[[lon_col, lat_col, value_col]], 148 | get_position=[lon_col, lat_col], 149 | auto_highlight=True, 150 | elevation_scale=50, # Adjust based on your data's scale for better visualization 151 | pickable=True, 152 | elevation_range=[0, 3000], # Max elevation in meters 153 | extruded=True, # Make hexagon 3D 154 | coverage=4, 155 | opacity=0.3, 156 | radius=radius, # Radius of hexagon in meters 157 | get_elevation="monthly_price", # Use the 'elevation' column if you've aggregated data 158 | get_fill_color="[255, 255, color_value, 140]", 159 | ) 160 | 161 | # Set the initial view 162 | view_state = pdk.ViewState( 163 | longitude=-1.415, 164 | latitude=52.2323, 165 | zoom=6, 166 | min_zoom=5, 167 | max_zoom=15, 168 | pitch=40.5, 169 | bearing=-27.36, 170 | ) 171 | 172 | # Combine everything and render a viewport 173 | r = pdk.Deck(layers=[layer], initial_view_state=view_state) 174 | 175 | return r 176 | 177 | 178 | # Example usage 179 | hex_map = create_hexagon_map(filtered_df) 180 | st.pydeck_chart(hex_map) 181 | -------------------------------------------------------------------------------- /notebooks/data_storage/mongo_integration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4b764f16-3e43-463f-af40-fbcc51f3f9cb", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pymongo import MongoClient\n", 11 | "\n", 12 | "client = MongoClient(\"mongodb://localhost:27017/\") # Hosted with Docker" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "3c7dc024-1194-4ca9-9179-dedcd5ca476c", 18 | "metadata": {}, 19 | "source": [ 20 | "## Access Database" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "35c41cd0-27e0-48a3-9b35-4c15988f0010", 27 | "metadata": {}, 28 | "outputs": [], 29 
| "source": [ 30 | "db = client[\"rightmove\"]\n", 31 | "\n", 32 | "# Access collection\n", 33 | "collection = db[\"properties\"]" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "fd6b7ec8-958b-4b02-b882-5eac66268c09", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Insert a document\n", 44 | "collection.insert_one({\"name\": \"John Doe\", \"age\": 30})\n", 45 | "\n", 46 | "# Find a document\n", 47 | "user = collection.find_one({\"name\": \"John Doe\"})\n", 48 | "\n", 49 | "# Update a document\n", 50 | "collection.update_one({\"name\": \"John Doe\"}, {\"$set\": {\"age\": 31}})\n", 51 | "\n", 52 | "# Delete a document\n", 53 | "collection.delete_one({\"name\": \"John Doe\"})" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "cf50b563-4cac-4156-9b6e-faaa4af9ca48", 59 | "metadata": {}, 60 | "source": [ 61 | "## Fetch data" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 30, 67 | "id": "77e22ba8-c237-4d04-959d-3683c114e35c", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import json\n", 72 | "\n", 73 | "with open(\"../resources/data/property.json\", \"r\") as file:\n", 74 | " property = json.load(file) \n", 75 | "\n", 76 | "with open(\"../resources/data/property_1.json\", \"r\") as file:\n", 77 | " property_1 = json.load(file) " 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "478f104c-0b48-4211-991f-76fe790f65ba", 83 | "metadata": {}, 84 | "source": [ 85 | "## Load data to MongoDB" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "f97139f9-4383-4e3b-a368-d54b4463509d", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "collection.insert_one(property)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 31, 101 | "id": "6771b78d-0020-42d6-a96b-4301bf6b834f", 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "" 108 | ] 109 | }, 110 | "execution_count": 31, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "collection.insert_one(property_1)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "id": "bcbe4789-0c1c-41d6-bede-22efb03fc8cd", 122 | "metadata": {}, 123 | "source": [ 124 | "## Load Data from MongoDB" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 13, 130 | "id": "9a27999b-7315-4a02-805e-25e05458427f", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "property = collection.find_one({\"id\": 142547498})" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "1609ed92-78c7-48e1-8d02-d1ea9eac2fde", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "property" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "9b51e9c9-8baf-4140-ad2c-18ee30995c5a", 150 | "metadata": {}, 151 | "source": [ 152 | "## Get a list of all rightmove IDs" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "69201660-191d-4daf-8b6b-7944f8c58c4a", 158 | "metadata": {}, 159 | "source": [ 160 | "I created a new unique index in MongoDB to enforce uniqueness and quick access for IDs." 
161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 35, 166 | "id": "367dcb10-508f-49a2-91bb-8bec6359f08c", 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "[142547498, 142659089]\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "rightmove_ids = collection.find({}, {\"id\": 1})\n", 179 | "\n", 180 | "# Convert the result to a list of IDs\n", 181 | "ids = [doc['id'] for doc in rightmove_ids]\n", 182 | "\n", 183 | "print(ids)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "e4521181-d231-443b-8ed8-849763b0d736", 189 | "metadata": {}, 190 | "source": [ 191 | "### Close connection" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 37, 197 | "id": "8acde9bf-e654-4e7c-91c1-5b653d7ecd1a", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "client.close()" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "id": "48a83a75-144e-466f-bbd1-31e9b26b074b", 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3 (ipykernel)", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.11.4" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 5 234 | } 235 | --------------------------------------------------------------------------------
/rightmove/orchestration/airflow_app/dags/rightmove/data_processing/rightmove_processing.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from apache_beam.io.mongodbio import ReadFromMongoDB, WriteToMongoDB 4 | 5 | import apache_beam as beam 6 | from apache_beam.options.pipeline_options import PipelineOptions 7 | import logging 8 | from sklearn.neighbors import BallTree 9 | import pandas as pd 10 | import math 11 | import numpy as np 12 | import datetime 13 | from math import radians 14 | 15 | from pymongo import MongoClient 16 | 17 | # from dotenv import load_dotenv 18 | # load_dotenv("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/.env") 19 | 20 | MONGO_URI = os.environ.get("MONGO_URI") 21 | 22 | GCS_PARQUET_URL = "https://storage.googleapis.com/rightmove-resources-public/UK_pois.parquet" # TODO: Make this private 23 | WALK_SCORES_COLLECTION = "walk_scores" 24 | 25 | BATCH_SIZE = 50 26 | 27 | 28 | class ProcessElement(beam.DoFn): 29 | def fetch_current_ids(self): 30 | client = MongoClient(MONGO_URI) 31 | db = client["rightmove"] 32 | collection = db[WALK_SCORES_COLLECTION] 33 | query = {} 34 | data = collection.find(query, {"id": 1}) 35 | return [x["id"] for x in list(data)] 36 | 37 | def process_results_df(self, distance_series, pois_df): 38 | results_df = pd.DataFrame(distance_series) 39 | 40 | results_df = results_df.join(pois_df["amenities"], how="left") 41 | 42 | results_df["distance_in_metres"] = results_df["distance"].apply( 43 | lambda x: x * self.earth_radius 44 | ) 45 | 46 | results_df["distance_decayed"] = results_df["distance_in_metres"].apply( 47 | lambda x: float(self.distance_decay(x)) 48 | ) 49 | 50 | return results_df 51 | 52 | def distance_decay(sefl, distance): 53 | M = float(1)
54 | dist = distance / 1000 55 | score = math.e ** ((-5.0 * (dist / 4)) ** 5.0) 56 | return score 57 | 58 | def calculate_amenity_walk_score(self, property_distance_df, amenity, weights): 59 | k = len(weights) 60 | weight_array = np.array(weights) 61 | 62 | dist_array = ( 63 | property_distance_df[property_distance_df["amenities"] == amenity] 64 | .iloc[0:k]["distance_decayed"] 65 | .values 66 | ) 67 | dist_array_padded = np.pad( 68 | dist_array, (0, weight_array.size - dist_array.size), "constant" 69 | ) 70 | 71 | scores_array = dist_array_padded * weight_array 72 | 73 | amenity_score = scores_array.sum() 74 | 75 | return amenity_score 76 | 77 | def calculuate_walk_score(self, property, ball_tree, amenity_weights, pois_df): 78 | property_id = property["id"] 79 | latitude = property["location"]["latitude"] 80 | longitude = property["location"]["longitude"] 81 | 82 | radian_longitude = radians(longitude) 83 | radian_latitude = radians(latitude) 84 | 85 | k = 100 # Maximum number of amenities to return 86 | 87 | distances, indices = ball_tree.query( 88 | [[radian_longitude, radian_latitude]], k=k, return_distance=True 89 | ) 90 | 91 | dist_series = pd.Series(distances[0], index=indices[0], name="distance") 92 | 93 | results_df = self.process_results_df(dist_series, pois_df) 94 | 95 | # print(results_df) 96 | 97 | scores_dict = {} 98 | 99 | walk_score = 0 100 | 101 | for key, values in amenity_weights.items(): 102 | amenity_score = self.calculate_amenity_walk_score(results_df, key, values) 103 | 104 | scores_dict[key] = amenity_score 105 | 106 | return scores_dict 107 | 108 | def setup(self): 109 | self.earth_radius = 6371000 # Earth radius in metres 110 | self.pois_df = pd.read_parquet(GCS_PARQUET_URL) 111 | self.ball_tree = BallTree( 112 | self.pois_df[["lon_rad", "lat_rad"]].values, metric="haversine" 113 | ) # What is the ball tree doing? 
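        # What the BallTree above is doing: scikit-learn's BallTree indexes the POI
        # coordinates (already converted to radians) so the nearest amenities to a
        # property can be found with a single k-nearest-neighbour query instead of a
        # scan over every point of interest. With metric="haversine" the query returns
        # great-circle distances as angles, which process_results_df() converts to
        # metres by multiplying by self.earth_radius. (scikit-learn documents the
        # haversine metric as expecting [latitude, longitude] column order; both the
        # tree build and the query here pass [longitude, latitude], so the two are at
        # least mutually consistent.)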
114 | self.amenity_weights = { 115 | "grocery": [3], 116 | "restaurants": [ 117 | 0.75, 118 | 0.45, 119 | 0.25, 120 | 0.25, 121 | 0.225, 122 | 0.225, 123 | 0.225, 124 | 0.225, 125 | 0.2, 126 | 0.2, 127 | ], 128 | "shopping": [0.5, 0.45, 0.4, 0.35, 0.3], 129 | "coffee": [1.25, 0.75], 130 | "banks": [1], 131 | "parks": [1], 132 | "schools": [1], 133 | "books": [1], 134 | "entertainment": [1], 135 | } 136 | self.processed_ids = self.fetch_current_ids() 137 | 138 | def process(self, element): # TODO: ADD ID processing to avoid duplicate processing 139 | logging.info(f"Processing element: {len(element)}") 140 | for ele in element: 141 | if ele["id"] not in self.processed_ids: 142 | property = {"id": ele["id"], "location": ele["location"]} 143 | logging.info(f"Processing property: {property}") 144 | scores_dict = self.calculuate_walk_score( 145 | property, self.ball_tree, self.amenity_weights, self.pois_df 146 | ) 147 | walk_score = sum(scores_dict.values()) * 6.67 148 | scores_dict["walk_score"] = walk_score 149 | 150 | property["scores"] = scores_dict 151 | 152 | property[ 153 | "processing_timestamp" 154 | ] = datetime.datetime.utcnow().timestamp() 155 | 156 | yield property 157 | else: 158 | logging.info(f"Property already processed: {ele['id']}") 159 | continue 160 | 161 | 162 | def run(): 163 | with beam.Pipeline(options=PipelineOptions()) as pipeline: 164 | ( 165 | pipeline 166 | | "Read from Mongo" 167 | >> ReadFromMongoDB( 168 | uri=MONGO_URI, db="rightmove", coll="properties", bucket_auto=True 169 | ) # Only return the id and the location 170 | | "Batch Elements" 171 | >> beam.BatchElements(min_batch_size=BATCH_SIZE, max_batch_size=BATCH_SIZE) 172 | | "Process each element" >> beam.ParDo(ProcessElement()) 173 | | "Write to MongoDB" 174 | >> WriteToMongoDB( 175 | uri=MONGO_URI, 176 | db="rightmove", 177 | coll=WALK_SCORES_COLLECTION, 178 | batch_size=10, 179 | ) 180 | ) 181 | 182 | 183 | if __name__ == "__main__": 184 | logging.getLogger().setLevel(logging.INFO) 185 | run() 186 | -------------------------------------------------------------------------------- /notebooks/data_ingestion/scrapy_connection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5e5492f1-7a93-4509-a5d6-99ac7c56dadb", 6 | "metadata": {}, 7 | "source": [ 8 | "This notebook is to test the integration with scrapyd to scehdule runs." 
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "f53b22b9-f901-4000-bc8b-822f87abb6e2", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import requests" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "bdfd9578-2d5d-4a08-a51c-e919a93774ba", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "spider = \"rightmove\"\n", 29 | "project = \"rightmove_scraper\"" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "e5fb7732-c1ef-45a4-83d4-118818da84ed", 35 | "metadata": {}, 36 | "source": [ 37 | "## Schedule job" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 15, 43 | "id": "5268cded-e321-4c5d-a11d-228cc52615b6", 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "{\"node_name\": \"d9198cfc8c27\", \"status\": \"ok\", \"jobid\": \"c3fc2a8a936b11eebe730242ac1b0006\"}\n", 51 | "\n", 52 | "Request successful\n", 53 | "Job started\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "SCRAPYD_ENDPOINT = \"http://localhost:6800/schedule.json\"\n", 59 | "\n", 60 | "spider = \"rightmove\"\n", 61 | "project = \"rightmove_scraper\"\n", 62 | "payload = f\"project={project}&spider={spider}\"\n", 63 | "headers = {\n", 64 | " 'Content-Type': 'application/x-www-form-urlencoded'\n", 65 | "}\n", 66 | "\n", 67 | "response = requests.request(\"POST\", SCRAPYD_ENDPOINT, headers=headers, data=payload)\n", 68 | "\n", 69 | "print(response.text)\n", 70 | "if response.status_code == 200:\n", 71 | " print(\"Request successful\")\n", 72 | " if response.json()['status'] == 'ok':\n", 73 | " print(\"Job started\")\n", 74 | " job_id = response.json()['jobid']\n", 75 | "else:\n", 76 | " print(response.text)\n", 77 | " raise ValueError(\"Request failed\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 16, 83 | "id": "2ea697a8-f1b1-4c34-8759-8457ed6001f8", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "'c3fc2a8a936b11eebe730242ac1b0006'" 90 | ] 91 | }, 92 | "execution_count": 16, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "job_id" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "cf474c91-356f-4d90-8765-85564cb3266d", 104 | "metadata": {}, 105 | "source": [ 106 | "## Check job" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 17, 112 | "id": "c7152ebb-c49e-4b6f-812c-82c425a2794c", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "{\"node_name\": \"d9198cfc8c27\", \"status\": \"ok\", \"pending\": [], \"running\": [{\"project\": \"rightmove_scraper\", \"spider\": \"rightmove\", \"id\": \"c3fc2a8a936b11eebe730242ac1b0006\", \"pid\": 45, \"start_time\": \"2023-12-05 12:42:39.979369\"}], \"finished\": [{\"project\": \"rightmove_scraper\", \"spider\": \"rightmove\", \"id\": \"cddc3cf8936a11eebe730242ac1b0006\", \"start_time\": \"2023-12-05 12:35:44.985567\", \"end_time\": \"2023-12-05 12:37:23.300061\", \"log_url\": \"/logs/rightmove_scraper/rightmove/cddc3cf8936a11eebe730242ac1b0006.log\", \"items_url\": \"/items/rightmove_scraper/rightmove/cddc3cf8936a11eebe730242ac1b0006.jl\"}, {\"project\": \"rightmove_scraper\", \"spider\": \"rightmove\", \"id\": \"93ae7428936b11eebe730242ac1b0006\", \"start_time\": \"2023-12-05 12:41:15.135047\", \"end_time\": \"2023-12-05 12:42:13.706062\", \"log_url\": 
\"/logs/rightmove_scraper/rightmove/93ae7428936b11eebe730242ac1b0006.log\", \"items_url\": \"/items/rightmove_scraper/rightmove/93ae7428936b11eebe730242ac1b0006.jl\"}]}\n", 120 | "\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "import requests\n", 126 | "\n", 127 | "url = f\"http://localhost:6800/listjobs.json?project={project}\"\n", 128 | "\n", 129 | "payload = {}\n", 130 | "headers = {}\n", 131 | "\n", 132 | "response = requests.request(\"GET\", url, headers=headers, data=payload)\n", 133 | "\n", 134 | "print(response.text)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "93ec794d-9cc7-44bb-905a-4650404eeeb0", 140 | "metadata": {}, 141 | "source": [ 142 | "## Cancel Job" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 18, 148 | "id": "60c90559-2f28-4d7a-aa12-f601ba6ec8bd", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "{\"node_name\": \"d9198cfc8c27\", \"status\": \"ok\", \"prevstate\": \"running\"}\n", 156 | "\n", 157 | "Request successful\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "SCRAPYD_ENDPOINT = \"http://localhost:6800/cancel.json\"\n", 163 | "\n", 164 | "spider = \"rightmove\"\n", 165 | "project = \"rightmove_scraper\"\n", 166 | "job_id = \"c3fc2a8a936b11eebe730242ac1b0006\"\n", 167 | "payload = f\"project={project}&job={job_id}\"\n", 168 | "headers = {\n", 169 | " 'Content-Type': 'application/x-www-form-urlencoded'\n", 170 | "}\n", 171 | "\n", 172 | "response = requests.request(\"POST\", SCRAPYD_ENDPOINT, headers=headers, data=payload)\n", 173 | "\n", 174 | "print(response.text)\n", 175 | "if response.status_code == 200:\n", 176 | " print(\"Request successful\")\n", 177 | "else:\n", 178 | " print(response.text)\n", 179 | " raise ValueError(\"Request failed\")" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "id": "b44e1572-9bc0-4e57-be99-6b25c15201a7", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3 (ipykernel)", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.11.4" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 5 212 | } 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rightmove Rental Prediction System 2 | 3 | In recent months, I've deepened my expertise in creating machine learning (ML) systems through comprehensive study and application of three pivotal areas: Data Engineering, MLOps, and ML Engineering, as structured by the Data Talk Club courses. My project, the Rightmove Rental Prediction System, encapsulates this journey, demonstrating a cohesive application of these skills. 4 | 5 | The essence of this project lies in its comprehensive architecture, designed to predict rental prices with precision. It integrates: 6 | 7 | 1. **Data Engineering** through an asynchronous web scraper and batch ingestion pipelines, enabling efficient data extraction and preprocessing. 8 | 2. 
**ML Engineering** with a focus on model training and feature engineering, including the development of an innovative "Walk Score." 9 | 3. **MLOps** by implementing monitoring practices to ensure the system's reliability and performance over time. 10 | 11 | ### **Project Components** 12 | 13 | 1. **Extraction and Data Processing Pipeline**: Automated to handle large-scale data extraction, cleaning, and preparation. 14 | 2. **ML Training Pipeline**: Designed for iterative experimentation and training, leveraging a RandomForest model among others, to identify the most effective prediction method. 15 | 3. **MLOps Monitoring Pipeline**: Ensures model performance remains optimal through continuous monitoring for data drift and other potential issues. 16 | 4. **Model Serving API**: Utilizes FastAPI for efficient model deployment, allowing real-time predictions. 17 | 5. **Visualization Dashboard**: Built with Streamlit and Grafana, offering insightful data visualizations and monitoring dashboards to track system performance and data quality. 18 | 19 | ### **Infrastructure and Deployment** 20 | 21 | My approach combines DevOps and software engineering principles, employing Terraform for infrastructure management and Docker Compose for containerization, across both AWS and GCP platforms. This dual-cloud strategy not only leverages the strengths of both services but also optimizes costs through their free tier options. 22 | 23 | ### **ML and MLOps Implementation** 24 | 25 | The project showcases my ML and MLOps expertise through the development of a RandomForest model, enhanced by a unique feature, the Walk Score, to improve predictive accuracy. MLFlow serves as the backbone for experiment tracking and model registry, facilitating the model's evolution and serving. 26 | 27 | ### **Data Extraction and Processing** 28 | 29 | ![Data Extraction Pipeline](/static/images/Processing_pipeline_rightmove.png) 30 | 31 | 32 | Choosing Rightmove, a leading UK property listing site, as the data source, I developed a Scrapy spider deployed on a Scrapyd server. This setup enhances control over scraping activities and integrates seamlessly with Airflow for orchestration, ensuring ethical data usage and compliance with best practices. 33 | 34 | Data storage is managed through PostgreSQL and MongoDB, supporting structured and unstructured data, respectively. This configuration not only facilitates efficient data management but also integrates a custom Beam job to compute the Walk Score for enhanced model input. 35 | 36 | ### **ML Training with MLFlow** 37 | 38 | ![ML Training pipeline](/static/images/model_training_pipeline.png) 39 | 40 | For the ML training component, MLFlow played a critical role as a central hub for experiment tracking, model versioning, and serving. This tool allowed for a systematic approach to managing the lifecycle of machine learning models. Here's how it was integrated into the workflow: 41 | 42 | - **Experiment Tracking**: Every training run, along with its parameters, metrics, and outcomes, was logged in MLFlow. This facilitated a comprehensive analysis of each experiment, enabling quick iteration over models to find the best performing ones based on Root Mean Squared Error (RMSE) metrics. 43 | - **Model Registry**: The most promising models, particularly the RandomForest model which outperformed others including XGBoost, were registered in MLFlow's model registry. This registry acted as a repository, making it simple to version, store, and access models for deployment. 
44 | - **Model Serving**: MLFlow also streamlined the deployment process. The serving component fetched the latest and most effective model version directly from the registry, ensuring that the prediction service always utilized the best available model. 45 | 46 | The use of MLFlow not only brought organization and efficiency to the model training process but also ensured transparency and reproducibility, which are essential for collaboration and continuous improvement in ML projects. 47 | 48 | ## **DevOps and Scraper Monitoring** 49 | 50 | The Rightmove Rental Prediction System employs a focused approach to monitor its web scraping operations, leveraging Grafana and PostgreSQL for a streamlined and effective oversight. 51 | 52 | ### **Monitoring Framework** 53 | 54 | **Grafana Dashboard**: Provides real-time visualization of critical metrics such as success rates, error counts, and response times. This dashboard enables quick identification of performance issues or errors in the web scraping process. 55 | 56 | **PostgreSQL**: Acts as the storage backbone for logging detailed metrics from each scraping session. This includes timestamps, counts of extracted records, and error logs, offering a comprehensive view for analysis and troubleshooting. 57 | 58 | ### **Key Objectives** 59 | 60 | - **Efficiency and Error Management**: Monitoring ensures the scraper runs efficiently, with a quick response to any errors or bottlenecks. 61 | - **Compliance and Rate Limiting**: Keeps the scraping activities within ethical and legal boundaries by tracking request rates and adherence to site policies. 62 | 63 | ### **DevOps Integration** 64 | 65 | The setup integrates seamlessly with our DevOps practices, with Grafana alerts configured to trigger automated actions or notifications for immediate attention, ensuring the system's robustness and reliability. 66 | 67 | #### System Monitoring 68 | ![Extraction Monitoring](/static/images/scrapy_monitoring.png) 69 | 70 | System Monitoring of Scrapy Sessions 71 | 72 | ## **MLOps** 73 | 74 | ![MLOps Diagram](/static/images/mlops_pipeline.png) 75 | 76 | Understanding and mitigating concept drift and data drift are critical for maintaining the performance of ML models in production. Here’s how these challenges were approached: 77 | 78 | - **Concept Drift**: This occurs when the statistical properties of the target variable, which the model is trying to predict, change over time. This can degrade the model's performance because the patterns the model learned during training may no longer apply. To detect concept drift, the monitoring pipeline employed statistical tests and comparisons between predictions and actual outcomes over time. When significant drift was detected, a model retraining workflow was triggered, incorporating new data to adapt the model to the current reality. 79 | - **Data Drift**: Data drift refers to changes in the input data's distribution. It's crucial to monitor because even if the target variable's distribution remains the same, changes in input data can lead to poor model performance. The project utilized Evidently to monitor key features' distributions, comparing incoming data against a historical baseline (the golden dataset). Alerts were set up to notify when data drift exceeded predefined thresholds, prompting an evaluation to determine if model retraining or adjustment in data preprocessing steps was necessary. 
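To make the drift checks concrete, the snippet below is a minimal sketch of the kind of Evidently report the monitoring pipeline builds. The two small frames are stand-ins for the golden dataset and the latest scored batch (which the real DAG loads from storage), and the result keys read at the end are the same ones the monitoring code extracts:

```python
import pandas as pd
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# Stand-in frames: "reference" plays the golden training snapshot, "current" the latest batch.
reference_df = pd.DataFrame(
    {"bedrooms": [1, 2, 3, 2, 1], "bathrooms": [1, 1, 2, 2, 1], "walk_score": [55.0, 70.0, 62.0, 80.0, 58.0]}
)
current_df = pd.DataFrame(
    {"bedrooms": [2, 3, 4, 3, 4], "bathrooms": [1, 2, 2, 3, 2], "walk_score": [40.0, 45.0, 50.0, 42.0, 38.0]}
)

report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=reference_df, current_data=current_df)

# The dataset-level drift verdict and the share of drifted columns are what drive the alerts.
summary = report.as_dict()["metrics"][0]["result"]
print(summary["dataset_drift"], summary["share_of_drifted_columns"])
```

When the dataset-level drift flag flips to true, or the share of drifted columns crosses the alert threshold, the retraining flow described below is triggered.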
80 | 81 | #### ML Model Monitoring 82 | ![Model Monitoring](/static/images/model_monitoring.png) 83 | 84 | MLOps monitoring of Data and Concept Drift 85 | 86 | ### **Addressing Change** 87 | 88 | Grafana also enables automated actions: if the model's prediction performance drops below a defined threshold, an automatic retraining run is triggered on new data that reflects the latest patterns, keeping the model up to date without manual intervention. 89 | -------------------------------------------------------------------------------- /notebooks/resources/data/property_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": 142659089, 3 | "bedrooms": 2, 4 | "bathrooms": 2, 5 | "numberOfImages": 9, 6 | "numberOfFloorplans": 0, 7 | "numberOfVirtualTours": 1, 8 | "summary": "Property Reference: 1915996.This 2 bedroom house is in the popular area of Mugiemoss, Bucksburn and is available for let from the start of January 2024The property boasts an enviable location within an established community and with an easy commute to the city centre. Good quality laminate and carpet along with blinds and white goods are all included with the house. Upstairs there are two double bedrooms, one featuring a built in wardrobe. There is also a three piece bathroom suite upstairs with a shower over the bath and there is a separate downstairs cloakroom.The spacious open plan lounge/kitchen features a built in oven and hob, washer-dryer and fridge freezer.Outside there is a fully enclosed garden and also a residents parking area.A deposit of one month's rent is required and eligibility criteria will need to be met. This property is suitable for pets with an additional 50% deposit.\u00a0Scottish charity number: SCO44825, **some photos are for illustration purposes only**Summary & Exclusions:- Rent Amount: \u00a3632.42 per month (\u00a3145.94 per week)- Deposit / Bond: \u00a3632.41- 2 Bedrooms- 2 Bathrooms- Property comes unfurnished- Available to move in from 05 January, 2024- Maximum number of tenants is 3- DSS enquiries welcome- Students welcome to enquire- Pets considered / by arrangement- No Smokers- Family Friendly- Bills not included- Property has parking- Property has garden access- EPC Rating: B If calling, please quote reference: 1915996 Fees:You will not be charged any admin fees. ** Contact today to book a viewing and have the landlord show you round! 
** Request Details form responded to 24/7, with phone bookings available 9am-9pm, 7 days a week.OpenRent is on the Scottish Letting Agent Register, registration number LARN1809026The landlord is on the Scottish Landlord Register, registration number 453083/100/19571", 9 | "displayAddress": "Mugiemoss Road, Aberdeen, AB21", 10 | "countryCode": "GB", 11 | "location": { 12 | "latitude": 57.17924, 13 | "longitude": -2.16878 14 | }, 15 | "propertyImages": { 16 | "images": [ 17 | { 18 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_00_0000_max_476x317.jpeg", 19 | "url": "97k/96668/142659089/96668_191599604122023_IMG_00_0000.jpeg", 20 | "caption": null 21 | }, 22 | { 23 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_01_0000_max_476x317.jpeg", 24 | "url": "97k/96668/142659089/96668_191599604122023_IMG_01_0000.jpeg", 25 | "caption": null 26 | }, 27 | { 28 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_02_0000_max_476x317.jpeg", 29 | "url": "97k/96668/142659089/96668_191599604122023_IMG_02_0000.jpeg", 30 | "caption": null 31 | }, 32 | { 33 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_03_0000_max_476x317.jpeg", 34 | "url": "97k/96668/142659089/96668_191599604122023_IMG_03_0000.jpeg", 35 | "caption": null 36 | }, 37 | { 38 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_04_0000_max_476x317.jpeg", 39 | "url": "97k/96668/142659089/96668_191599604122023_IMG_04_0000.jpeg", 40 | "caption": null 41 | }, 42 | { 43 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_05_0000_max_476x317.jpeg", 44 | "url": "97k/96668/142659089/96668_191599604122023_IMG_05_0000.jpeg", 45 | "caption": null 46 | }, 47 | { 48 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_06_0000_max_476x317.jpeg", 49 | "url": "97k/96668/142659089/96668_191599604122023_IMG_06_0000.jpeg", 50 | "caption": null 51 | }, 52 | { 53 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_07_0000_max_476x317.jpeg", 54 | "url": "97k/96668/142659089/96668_191599604122023_IMG_07_0000.jpeg", 55 | "caption": null 56 | }, 57 | { 58 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_08_0000_max_476x317.jpeg", 59 | "url": "97k/96668/142659089/96668_191599604122023_IMG_08_0000.jpeg", 60 | "caption": null 61 | } 62 | ], 63 | "mainImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_00_0000_max_476x317.jpeg", 64 | "mainMapImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_00_0000_max_296x197.jpeg" 65 | }, 66 | "propertySubType": "Terraced", 67 | "listingUpdate": { 68 | "listingUpdateReason": "new", 69 | "listingUpdateDate": "2023-12-04T11:54:03Z" 70 | }, 71 | "premiumListing": false, 72 | "featuredProperty": false, 73 | "price": { 74 | "amount": 632, 75 | "frequency": "monthly", 76 | "currencyCode": "GBP", 77 | "displayPrices": [ 78 | { 79 | "displayPrice": "\u00a3632 pcm", 80 | "displayPriceQualifier": "" 81 | }, 82 | { 83 | "displayPrice": "\u00a3146 pw", 84 | "displayPriceQualifier": "" 85 | 
} 86 | ] 87 | }, 88 | "customer": { 89 | "branchId": 96668, 90 | "brandPlusLogoURI": "/company/clogo_rmchoice_37106_0000.png", 91 | "contactTelephone": "020 3322 3265", 92 | "branchDisplayName": "OpenRent, London", 93 | "branchName": "London", 94 | "brandTradingName": "OpenRent", 95 | "branchLandingPageUrl": "/estate-agents/agent/OpenRent/London-96668.html", 96 | "development": false, 97 | "showReducedProperties": true, 98 | "commercial": false, 99 | "showOnMap": true, 100 | "enhancedListing": false, 101 | "developmentContent": null, 102 | "buildToRent": false, 103 | "buildToRentBenefits": [], 104 | "brandPlusLogoUrl": "https://media.rightmove.co.uk:443/dir/company/clogo_rmchoice_37106_0000_max_100x50.png" 105 | }, 106 | "distance": null, 107 | "transactionType": "rent", 108 | "productLabel": { 109 | "productLabelText": "", 110 | "spotlightLabel": false 111 | }, 112 | "commercial": false, 113 | "development": false, 114 | "residential": true, 115 | "students": false, 116 | "auction": false, 117 | "feesApply": false, 118 | "feesApplyText": null, 119 | "displaySize": "", 120 | "showOnMap": true, 121 | "propertyUrl": "/properties/142659089#/?channel=RES_LET", 122 | "contactUrl": "/property-to-rent/contactBranch.html?propertyId=142659089", 123 | "staticMapUrl": null, 124 | "channel": "RENT", 125 | "firstVisibleDate": "2023-12-04T11:48:11Z", 126 | "keywords": [], 127 | "keywordMatchType": "no_keyword", 128 | "saved": false, 129 | "hidden": false, 130 | "onlineViewingsAvailable": false, 131 | "lozengeModel": { 132 | "matchingLozenges": [] 133 | }, 134 | "hasBrandPlus": true, 135 | "displayStatus": "", 136 | "enquiredTimestamp": null, 137 | "heading": "", 138 | "isRecent": true, 139 | "enhancedListing": false, 140 | "formattedBranchName": " by OpenRent, London", 141 | "formattedDistance": "", 142 | "propertyTypeFullDescription": "2 bedroom terraced house", 143 | "addedOrReduced": "Added today", 144 | "feature_list": [ 145 | "No Agent Fees", 146 | "Students Can Enquire", 147 | "Property Reference Number: 1915996" 148 | ] 149 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.27.2 2 | affine==2.4.0 3 | aiohttp==3.9.3 4 | aiosignal==1.3.1 5 | alembic==1.13.1 6 | altair==5.2.0 7 | annotated-types==0.6.0 8 | anyio==4.2.0 9 | apache-airflow==2.8.1 10 | apache-airflow-providers-common-io==1.2.0 11 | apache-airflow-providers-common-sql==1.10.1 12 | apache-airflow-providers-ftp==3.7.0 13 | apache-airflow-providers-http==4.9.0 14 | apache-airflow-providers-imap==3.5.0 15 | apache-airflow-providers-sqlite==3.7.0 16 | apache-beam==2.52.0 17 | apispec==6.4.0 18 | appdirs==1.4.4 19 | appnope==0.1.3 20 | argcomplete==3.2.2 21 | argon2-cffi==23.1.0 22 | argon2-cffi-bindings==21.2.0 23 | arrow==1.3.0 24 | asgiref==3.7.2 25 | asttokens==2.4.1 26 | async-lru==2.0.4 27 | attrs==23.1.0 28 | Automat==22.10.0 29 | Babel==2.14.0 30 | backoff==2.2.1 31 | beautifulsoup4==4.12.2 32 | bleach==6.1.0 33 | blinker==1.7.0 34 | cachelib==0.9.0 35 | cachetools==5.3.2 36 | certifi==2023.11.17 37 | cffi==1.16.0 38 | charset-normalizer==3.3.2 39 | click==8.1.7 40 | click-plugins==1.1.1 41 | clickclick==20.10.2 42 | cligj==0.7.2 43 | cloudpickle==2.2.1 44 | colorama==0.4.6 45 | colorlog==4.8.0 46 | comm==0.2.1 47 | ConfigUpdater==3.2 48 | connexion==2.14.2 49 | constantly==23.10.4 50 | contextily==1.5.0 51 | contourpy==1.2.0 52 | crcmod==1.7 53 | cron-descriptor==1.4.3 54 | 
croniter==2.0.1 55 | cryptography==41.0.7 56 | cssselect==1.2.0 57 | cycler==0.12.1 58 | cykhash==2.0.1 59 | Cython==3.0.8 60 | databricks-cli==0.18.0 61 | datasets==2.17.1 62 | debugpy==1.8.0 63 | decorator==5.1.1 64 | defusedxml==0.7.1 65 | Deprecated==1.2.14 66 | dill==0.3.8 67 | distro==1.9.0 68 | dnspython==2.4.2 69 | docker==7.0.0 70 | docopt==0.6.2 71 | docutils==0.20.1 72 | email-validator==2.1.0.post1 73 | entrypoints==0.4 74 | evidently==0.4.14 75 | executing==2.0.1 76 | fastapi==0.109.0 77 | fastapi-restful==0.5.0 78 | fastavro==1.9.0 79 | fasteners==0.19 80 | fastjsonschema==2.19.1 81 | filelock==3.13.1 82 | fiona==1.9.5 83 | Flask==2.2.5 84 | Flask-AppBuilder==4.3.10 85 | Flask-Babel==2.0.0 86 | Flask-Caching==2.1.0 87 | Flask-JWT-Extended==4.6.0 88 | Flask-Limiter==3.5.1 89 | Flask-Login==0.6.3 90 | Flask-Session==0.6.0 91 | Flask-SQLAlchemy==2.5.1 92 | Flask-WTF==1.2.1 93 | fonttools==4.47.2 94 | fqdn==1.5.1 95 | frozenlist==1.4.1 96 | fsspec==2023.10.0 97 | gcsfs==2024.2.0 98 | geographiclib==2.0 99 | geopandas==0.14.3 100 | geopy==2.4.1 101 | gitdb==4.0.11 102 | GitPython==3.1.41 103 | google-api-core==2.17.0 104 | google-auth==2.27.0 105 | google-auth-oauthlib==1.2.0 106 | google-cloud==0.34.0 107 | google-cloud-core==2.4.1 108 | google-cloud-storage==2.14.0 109 | google-crc32c==1.5.0 110 | google-re2==1.1 111 | google-resumable-media==2.7.0 112 | googleapis-common-protos==1.62.0 113 | grpcio==1.59.3 114 | gunicorn==21.2.0 115 | h11==0.14.0 116 | hdfs==2.7.3 117 | httpcore==1.0.2 118 | httplib2==0.22.0 119 | httpx==0.26.0 120 | huggingface-hub==0.20.3 121 | hyperlink==21.0.0 122 | idna==3.6 123 | importlib-metadata==6.11.0 124 | importlib-resources==6.1.1 125 | incremental==22.10.0 126 | inflection==0.5.1 127 | iniconfig==2.0.0 128 | ipykernel==6.29.0 129 | ipython==8.20.0 130 | ipython-genutils==0.2.0 131 | ipywidgets==8.1.1 132 | isoduration==20.11.0 133 | itemadapter==0.8.0 134 | itemloaders==1.1.0 135 | iterative-telemetry==0.0.8 136 | itsdangerous==2.1.2 137 | jedi==0.19.1 138 | Jinja2==3.1.3 139 | jmespath==1.0.1 140 | joblib==1.3.2 141 | Js2Py==0.74 142 | json5==0.9.14 143 | jsonpointer==2.4 144 | jsonschema==4.20.0 145 | jsonschema-specifications==2023.11.2 146 | jupyter==1.0.0 147 | jupyter-console==6.6.3 148 | jupyter-contrib-core==0.4.2 149 | jupyter-contrib-nbextensions==0.7.0 150 | jupyter-events==0.9.0 151 | jupyter-highlight-selected-word==0.2.0 152 | jupyter-lsp==2.2.2 153 | jupyter-nbextensions-configurator==0.6.3 154 | jupyter_client==8.6.0 155 | jupyter_core==5.7.1 156 | jupyter_server==2.12.5 157 | jupyter_server_terminals==0.5.2 158 | jupyterlab==4.0.11 159 | jupyterlab-widgets==3.0.9 160 | jupyterlab_pygments==0.3.0 161 | jupyterlab_server==2.25.2 162 | kiwisolver==1.4.5 163 | lazy-object-proxy==1.10.0 164 | limits==3.7.0 165 | linkify-it-py==2.0.3 166 | lockfile==0.12.2 167 | lxml==4.9.3 168 | Mako==1.3.0 169 | Markdown==3.5.2 170 | markdown-it-py==3.0.0 171 | MarkupSafe==2.1.4 172 | marshmallow==3.20.2 173 | marshmallow-oneofschema==3.1.1 174 | marshmallow-sqlalchemy==0.26.1 175 | matplotlib==3.8.2 176 | matplotlib-inline==0.1.6 177 | mdit-py-plugins==0.4.0 178 | mdurl==0.1.2 179 | mercantile==1.2.1 180 | mistune==3.0.2 181 | mlflow==2.10.2 182 | mpmath==1.3.0 183 | multidict==6.0.5 184 | multiprocess==0.70.16 185 | mypy-extensions==1.0.0 186 | nbclient==0.9.0 187 | nbconvert==7.14.2 188 | nbformat==5.9.2 189 | nest-asyncio==1.6.0 190 | networkx==3.2.1 191 | nltk==3.8.1 192 | notebook==7.0.7 193 | notebook_shim==0.2.3 194 | numpy==1.24.4 195 | 
oauthlib==3.2.2 196 | objsize==0.6.1 197 | opentelemetry-api==1.22.0 198 | opentelemetry-exporter-otlp==1.22.0 199 | opentelemetry-exporter-otlp-proto-common==1.22.0 200 | opentelemetry-exporter-otlp-proto-grpc==1.22.0 201 | opentelemetry-exporter-otlp-proto-http==1.22.0 202 | opentelemetry-proto==1.22.0 203 | opentelemetry-sdk==1.22.0 204 | opentelemetry-semantic-conventions==0.43b0 205 | ordered-set==4.1.0 206 | orjson==3.9.13 207 | overrides==7.7.0 208 | packaging==23.2 209 | pandas==2.2.0 210 | pandocfilters==1.5.1 211 | parsel==1.8.1 212 | parso==0.8.3 213 | pathspec==0.12.1 214 | patsy==0.5.6 215 | pendulum==3.0.0 216 | pexpect==4.9.0 217 | pillow==10.2.0 218 | platformdirs==4.1.0 219 | plotly==5.18.0 220 | pluggy==1.4.0 221 | prison==0.2.1 222 | prometheus-client==0.19.0 223 | prompt-toolkit==3.0.43 224 | Protego==0.3.0 225 | proto-plus==1.22.3 226 | protobuf==4.25.1 227 | psutil==5.9.8 228 | psycopg==3.1.17 229 | psycopg2==2.9.9 230 | psycopg2-binary==2.9.9 231 | ptyprocess==0.7.0 232 | pure-eval==0.2.2 233 | pyarrow==15.0.0 234 | pyarrow-hotfix==0.6 235 | pyasn1==0.5.1 236 | pyasn1-modules==0.3.0 237 | pycparser==2.21 238 | pydantic==2.6.1 239 | pydantic_core==2.16.2 240 | pydeck==0.8.0 241 | PyDispatcher==2.0.7 242 | pydot==1.4.2 243 | Pygments==2.17.2 244 | pyjsparser==2.7.1 245 | PyJWT==2.8.0 246 | pymongo==4.6.1 247 | PyMySQL==1.1.0 248 | pyOpenSSL==23.3.0 249 | pyparsing==3.1.1 250 | pyproj==3.6.1 251 | pyrobuf==0.9.3 252 | pyrosm==0.6.2 253 | pytest==8.0.2 254 | python-daemon==3.0.1 255 | python-dateutil==2.8.2 256 | python-dotenv==1.0.1 257 | python-json-logger==2.0.7 258 | python-nvd3==0.15.0 259 | python-rapidjson==1.14 260 | python-slugify==8.0.4 261 | pytz==2023.3.post1 262 | PyYAML==6.0.1 263 | pyzmq==25.1.2 264 | qtconsole==5.5.1 265 | QtPy==2.4.1 266 | querystring-parser==1.2.4 267 | queuelib==1.6.2 268 | rasterio==1.3.9 269 | referencing==0.31.1 270 | regex==2023.10.3 271 | requests==2.31.0 272 | requests-file==1.5.1 273 | requests-oauthlib==1.3.1 274 | requests-toolbelt==1.0.0 275 | rfc3339-validator==0.1.4 276 | rfc3986-validator==0.1.1 277 | rich==13.7.0 278 | rich-argparse==1.4.0 279 | rpds-py==0.13.2 280 | rsa==4.9 281 | safetensors==0.4.2 282 | scikit-learn==1.3.2 283 | scipy==1.12.0 284 | Scrapy==2.11.0 285 | scrapyd==1.4.3 286 | scrapyd-client==1.2.3 287 | seaborn==0.13.2 288 | Send2Trash==1.8.2 289 | service-identity==23.1.0 290 | setproctitle==1.3.3 291 | shapely==2.0.2 292 | six==1.16.0 293 | smmap==5.0.1 294 | sniffio==1.3.0 295 | snuggs==1.4.7 296 | soupsieve==2.5 297 | SQLAlchemy==1.4.51 298 | SQLAlchemy-JSONField==1.0.2 299 | SQLAlchemy-Utils==0.41.1 300 | sqlparse==0.4.4 301 | stack-data==0.6.3 302 | starlette==0.35.1 303 | statsmodels==0.14.1 304 | streamlit==1.31.1 305 | sympy==1.12 306 | tabulate==0.9.0 307 | tenacity==8.2.3 308 | termcolor==2.4.0 309 | terminado==0.18.0 310 | text-unidecode==1.3 311 | threadpoolctl==3.2.0 312 | time-machine==2.13.0 313 | tinycss2==1.2.1 314 | tldextract==5.1.1 315 | tokenizers==0.15.2 316 | toml==0.10.2 317 | toolz==0.12.1 318 | torch==2.2.1 319 | tornado==6.4 320 | tqdm==4.66.1 321 | traitlets==5.14.1 322 | transformers==4.38.1 323 | Twisted==22.10.0 324 | typer==0.9.0 325 | types-python-dateutil==2.8.19.20240106 326 | typing-inspect==0.9.0 327 | typing_extensions==4.8.0 328 | tzdata==2023.4 329 | tzlocal==5.2 330 | uberegg==0.1.1 331 | uc-micro-py==1.0.3 332 | unicodecsv==0.14.1 333 | universal_pathlib==0.2.0 334 | uri-template==1.3.0 335 | urllib3==2.1.0 336 | uvicorn==0.27.0.post1 337 | validators==0.22.0 338 
| w3lib==2.1.2 339 | watchdog==3.0.0 340 | wcwidth==0.2.13 341 | webcolors==1.13 342 | webencodings==0.5.1 343 | websocket-client==1.7.0 344 | Werkzeug==2.2.3 345 | widgetsnbextension==4.0.9 346 | wordcloud==1.9.3 347 | wrapt==1.16.0 348 | WTForms==3.1.2 349 | xgboost==2.0.3 350 | xxhash==3.4.1 351 | xyzservices==2023.10.1 352 | yarl==1.9.4 353 | zipp==3.17.0 354 | zope.interface==6.1 355 | zstandard==0.22.0 356 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/train_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from pymongo import MongoClient 5 | from google.cloud import storage 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | 9 | from sklearn.metrics import mean_squared_error 10 | from mlflow.data.pandas_dataset import PandasDataset 11 | from airflow import DAG 12 | from airflow.operators.python_operator import PythonOperator 13 | from airflow.operators.dummy_operator import DummyOperator 14 | 15 | from rightmove.data_processing.data_processor import DataPreprocessor 16 | 17 | from sklearn.ensemble import RandomForestRegressor 18 | 19 | from dotenv import load_dotenv 20 | 21 | load_dotenv("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/.env") 22 | 23 | import mlflow 24 | 25 | import logging 26 | 27 | logging.basicConfig(level=logging.INFO) 28 | 29 | MONGO_URI = os.environ.get("MONGO_URI") 30 | 31 | mlflow.set_tracking_uri( 32 | "postgresql+psycopg2://postgres:postgres@realestate-database.czkkjkojmucd.eu-west-2.rds.amazonaws.com:5432/mlflow" 33 | ) 34 | experiment_name = "rightmove-prediction" 35 | mlflow.set_experiment(experiment_name) 36 | 37 | client = storage.Client() 38 | bucket = client.get_bucket("rightmove-artifacts-ml") 39 | 40 | def load_data_from_mongo(collection_name="properties", fields=None): 41 | logging.info("Loading data from mongo") 42 | 43 | client = MongoClient(MONGO_URI) # Hosted with Docker 44 | 45 | db = client["rightmove"] 46 | 47 | collection = db[collection_name] 48 | 49 | query = {} 50 | 51 | data = collection.find(query, fields) 52 | 53 | df = pd.DataFrame(list(data)) 54 | 55 | if len(df) == 0: 56 | raise ValueError(f"No data found in collection {collection_name}") 57 | else: 58 | logging.info(f"Data loaded from collection {collection_name}") 59 | 60 | return df 61 | 62 | def generate_foldername(): 63 | now = datetime.now() 64 | return now.strftime("%Y-%m-%d-%H-%M-%S") 65 | 66 | 67 | def load_df_to_gcs(df, dest_path): 68 | blob = bucket.blob(dest_path) 69 | try: 70 | blob.upload_from_string(df.to_csv(), "text/csv") 71 | logging.info(f"Data uploaded to {dest_path}") 72 | return True 73 | except Exception as e: 74 | print(e) 75 | 76 | 77 | def preprocess_data(property_df, walkscore_df): 78 | preprocessor = DataPreprocessor(with_text=False, with_binary=False) 79 | 80 | property_df = preprocessor.preprocess_properties(property_df) 81 | walk_df = preprocessor.preprocess_walk_score(walkscore_df) 82 | 83 | df = property_df.merge(walk_df, on="id", how="left") 84 | 85 | logging.info("Data preprocessed") 86 | 87 | return df 88 | 89 | 90 | def load_data_from_gcs(source_url): 91 | logging.info(f"Loading {source_url} from GCS") 92 | df = pd.read_csv(source_url, index_col=0) 93 | return df 94 | 95 | 96 | def fetch_preprocess_and_upload_data(): 97 | property_df = load_data_from_mongo( 98 | collection_name="properties", 99 | 
fields={ 100 | "id": 1, 101 | "price.amount": 1, 102 | "price.frequency": 1, 103 | "firstVisibleDate": 1, 104 | "bedrooms": 1, 105 | "bathrooms": 1, 106 | "listingUpdate": 1, 107 | "location": 1, 108 | }, 109 | ) 110 | walkscore_df = load_data_from_mongo( 111 | collection_name="walk_scores", fields={"id": 1, "scores": 1} 112 | ) 113 | 114 | df = preprocess_data(property_df, walkscore_df) 115 | 116 | df = df[["bedrooms", "bathrooms", "price", "longitude", "latitude", "walk_score"]] 117 | 118 | folder_name = generate_foldername() 119 | parent_folder = "data" 120 | 121 | df["price_bin"] = pd.qcut(df["price"], q=10, duplicates="drop") 122 | 123 | # Create train test, validation split 124 | train_val, test_df = train_test_split( 125 | df, test_size=0.1, stratify=df["price_bin"], random_state=42 126 | ) 127 | train_df, val_df = train_test_split( 128 | train_val, test_size=0.2, stratify=train_val["price_bin"], random_state=42 129 | ) 130 | 131 | # Upload to GCS train, test, and validation data 132 | load_df_to_gcs(train_df, f"{parent_folder}/{folder_name}/train.csv") 133 | load_df_to_gcs(val_df, f"{parent_folder}/{folder_name}/val.csv") 134 | load_df_to_gcs(test_df, f"{parent_folder}/{folder_name}/test.csv") 135 | 136 | logging.info("Data uploaded to GCS") 137 | 138 | return folder_name 139 | 140 | 141 | def train_model(**kwargs): 142 | if "ti" in kwargs: 143 | ti = kwargs["ti"] 144 | folder_name = ti.xcom_pull(task_ids="load_data") 145 | else: 146 | folder_name = kwargs["folder_name"] 147 | 148 | logging.info(f"Training model with data from {folder_name}") 149 | 150 | train_dataset_source_url = ( 151 | f"gs://rightmove-artifacts-ml/data/{folder_name}/train.csv" 152 | ) 153 | val_dataset_source_url = f"gs://rightmove-artifacts-ml/data/{folder_name}/val.csv" 154 | test_dataset_source_url = f"gs://rightmove-artifacts-ml/data/{folder_name}/test.csv" 155 | 156 | train_df = load_data_from_gcs(train_dataset_source_url) 157 | val_df = load_data_from_gcs(val_dataset_source_url) 158 | test_df = load_data_from_gcs(test_dataset_source_url) 159 | 160 | train_df = train_df.dropna() 161 | val_df = val_df.dropna() 162 | 163 | features = ["bedrooms", "bathrooms", "longitude", "latitude", "walk_score"] 164 | target = "price" 165 | 166 | X_train = train_df[features] 167 | y_train = train_df[target] 168 | 169 | X_val = val_df[features] 170 | y_val = val_df[target] 171 | 172 | train_dataset: PandasDataset = mlflow.data.from_pandas( 173 | train_df, source=train_dataset_source_url 174 | ) 175 | val_dataset: PandasDataset = mlflow.data.from_pandas( 176 | val_df, source=val_dataset_source_url 177 | ) 178 | test_dataset: PandasDataset = mlflow.data.from_pandas( 179 | test_df, source=val_dataset_source_url 180 | ) 181 | 182 | with mlflow.start_run() as run: 183 | mlflow.set_tag("developer", "Alex") 184 | 185 | mlflow.log_param("Model type", "Random Forest") 186 | model = RandomForestRegressor() 187 | 188 | # Log the datasets 189 | mlflow.log_input(train_dataset, context="training") 190 | mlflow.log_input(val_dataset, context="validation") 191 | mlflow.log_input(test_dataset, context="test") 192 | 193 | logging.info("Fitting model") 194 | model.fit(X_train, y_train) 195 | 196 | y_pred = model.predict(X_val) 197 | 198 | rmse = mean_squared_error(y_val, y_pred, squared=False) 199 | 200 | r2 = model.score(X_val, y_val) 201 | 202 | mlflow.log_metric("rmse", rmse) 203 | mlflow.sklearn.log_model(model, "random-forest") 204 | 205 | logging.info("Model trained and logged to MLflow") 206 | 207 | return run.info.run_id 208 | 209 | 
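# --- Illustrative sketch, not part of this DAG ---
# Once register_model() below promotes a version of "Random Forest Walk Score" to
# Staging, a consumer such as the model-serving API can resolve that stage back to a
# concrete model through a stage-qualified registry URI. This only shows the registry
# round-trip; how the serving code in this repo actually loads the model may differ.
def load_staging_model():
    import mlflow.pyfunc

    # "models:/<registered-name>/<stage>" resolves to the latest version in that stage
    return mlflow.pyfunc.load_model("models:/Random Forest Walk Score/Staging")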
210 | def register_model(**kwargs): 211 | if "ti" in kwargs: 212 | ti = kwargs["ti"] 213 | run_id = ti.xcom_pull(task_ids="train_model") 214 | else: 215 | run_id = kwargs["run_id"] 216 | 217 | model_name = "Random Forest Walk Score" 218 | artifact_path = "random-forest" 219 | 220 | model_uri = f"runs:/{run_id}/{artifact_path}" 221 | 222 | model_details = mlflow.register_model(model_uri=model_uri, name=model_name) 223 | logging.info( 224 | f"Model registered with name: {model_name} and version: {model_details.version}" 225 | ) 226 | 227 | client = mlflow.tracking.MlflowClient() 228 | client.transition_model_version_stage( 229 | name=model_name, 230 | version=model_details.version, 231 | stage="Staging", 232 | archive_existing_versions=False, 233 | ) 234 | logging.info(f"Model version {model_details.version} transitioned to Staging") 235 | 236 | 237 | default_args = { 238 | "owner": "airflow_app", 239 | "depends_on_past": False, 240 | "email_on_failure": False, 241 | "email_on_retry": False, 242 | "retries": 1, 243 | "retry_delay": timedelta(minutes=5), 244 | } 245 | 246 | dag = DAG( 247 | "train_model", 248 | default_args=default_args, 249 | description="DAG for making scraping rightmove", 250 | schedule_interval=timedelta(days=1), 251 | start_date=datetime(2023, 1, 1), 252 | catchup=False, 253 | max_active_runs=1, 254 | ) 255 | with dag: 256 | load_and_preprocess_data_task = PythonOperator( 257 | task_id="load_data", python_callable=fetch_preprocess_and_upload_data 258 | ) 259 | 260 | train_model_task = PythonOperator( 261 | task_id="train_model", python_callable=train_model, provide_context=True 262 | ) 263 | 264 | register_model_task = PythonOperator( 265 | task_id="register_model", python_callable=register_model, provide_context=True 266 | ) 267 | 268 | load_and_preprocess_data_task >> train_model_task >> register_model_task 269 | 270 | if __name__ == "__main__": 271 | folder_name = fetch_preprocess_and_upload_data() 272 | print(folder_name) 273 | run_id = train_model(folder_name=folder_name) 274 | print(run_id) 275 | register_model(run_id=run_id) 276 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | import os 6 | 7 | from scrapy import signals 8 | from scrapy import signals 9 | import datetime 10 | from psycopg2.extras import execute_values 11 | from scrapy.signalmanager import dispatcher 12 | import psycopg2 13 | 14 | # useful for handling different item types with a single interface 15 | from itemadapter import is_item, ItemAdapter 16 | 17 | import logging 18 | 19 | logging.basicConfig(level=logging.DEBUG) 20 | logger = logging.getLogger(__name__) 21 | 22 | POSTGRES_URI = os.environ.get("MONITORING_URI_PG") 23 | 24 | 25 | class RightmoveScraperSpiderMiddleware: 26 | # Not all methods need to be defined. If a method is not defined, 27 | # scrapy acts as if the spider middleware does not modify the 28 | # passed objects. 29 | 30 | @classmethod 31 | def from_crawler(cls, crawler): 32 | # This method is used by Scrapy to create your spiders. 
33 | s = cls() 34 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 35 | crawler.signals.connect(s.spider_closed, signal=signals.spider_closed) 36 | return s 37 | 38 | def process_spider_input(self, response, spider): 39 | # Called for each response that goes through the spider 40 | # middleware and into the spider. 41 | 42 | # Should return None or raise an exception. 43 | return None 44 | 45 | def process_spider_output(self, response, result, spider): 46 | # Called with the results returned from the Spider, after 47 | # it has processed the response. 48 | 49 | # Must return an iterable of Request, or item objects. 50 | for i in result: 51 | yield i 52 | 53 | def process_spider_exception(self, response, exception, spider): 54 | # Called when a spider or process_spider_input() method 55 | # (from other spider middleware) raises an exception. 56 | 57 | # Should return either None or an iterable of Request or item objects. 58 | pass 59 | 60 | def process_start_requests(self, start_requests, spider): 61 | # Called with the start requests of the spider, and works 62 | # similarly to the process_spider_output() method, except 63 | # that it doesn’t have a response associated. 64 | 65 | # Must return only requests (not items). 66 | for r in start_requests: 67 | yield r 68 | 69 | def spider_opened(self, spider): 70 | spider.logger.info("Spider opened: %s" % spider.name) 71 | 72 | def spider_closed(self, spider): 73 | # Retrieve stats 74 | stats = spider.crawler.stats.get_stats() 75 | 76 | # Call the method to save stats to PostgreSQL 77 | self.save_stats_to_postgres(stats) 78 | 79 | def save_stats_to_postgres(self, stats): 80 | # Setup database connection 81 | logger.info(f"Logging stats to Postgres: {stats}") 82 | 83 | start_time = stats.get("start_time") 84 | finish_time = stats.get("finish_time") 85 | elapsed_time_seconds = stats.get("elapsed_time_seconds") 86 | item_scraped_count = stats.get("item_scraped_count", 0) 87 | finish_reason = stats.get("finish_reason") 88 | log_count_debug = stats.get("log_count/DEBUG", 0) 89 | log_count_info = stats.get("log_count/INFO", 0) 90 | log_count_error = stats.get("log_count/ERROR", 0) 91 | mem_usage_startup = stats.get("memusage/startup") 92 | mem_usage_max = stats.get("memusage/max") 93 | scheduler_enqueued_memory = stats.get("scheduler/enqueued/memory") 94 | downloader_request_count = stats.get("downloader/request_count") 95 | downloader_reponse_count = stats.get("downloader/response_count") 96 | response_received_count = stats.get("response_received_count") 97 | downloader_request_method_count_get = stats.get( 98 | "downloader/request_method_count/GET" 99 | ) 100 | downloader_request_bytes = stats.get("downloader/request_bytes") 101 | 102 | logger.info("Saving stats to PostgreSQL") 103 | logger.info(f"start_time: {start_time}") 104 | logger.info(f"finish_time: {finish_time}") 105 | logger.info(f"elapsed_time_seconds: {elapsed_time_seconds}") 106 | logger.info(f"item_scraped_count: {item_scraped_count}") 107 | logger.info(f"finish_reason: {finish_reason}") 108 | logger.info(f"log_count_debug: {log_count_debug}") 109 | logger.info(f"log_count_info: {log_count_info}") 110 | logger.info(f"log_count_error: {log_count_error}") 111 | logger.info(f"mem_usage_startup: {mem_usage_startup}") 112 | logger.info(f"mem_usage_max: {mem_usage_max}") 113 | logger.info(f"scheduler_enqueued_memory: {scheduler_enqueued_memory}") 114 | logger.info(f"downloader_request_count: {downloader_request_count}") 115 | logger.info(f"downloader_reponse_count: 
{downloader_reponse_count}") 116 | logger.info(f"response_received_count: {response_received_count}") 117 | logger.info( 118 | f"downloader_request_method_count_get: {downloader_request_method_count_get}" 119 | ) 120 | logger.info(f"downloader_request_bytes: {downloader_request_bytes}") 121 | 122 | insert_sql = """ 123 | INSERT INTO scrapy_rightmove_rental_stats ( 124 | start_time, finish_time, elapsed_time_seconds, item_scraped_count, finish_reason, 125 | log_count_debug, log_count_info, log_count_error, mem_usage_startup, mem_usage_max, scheduler_enqueued_memory, 126 | downloader_request_count, downloader_response_count, response_received_count, 127 | downloader_request_method_count_get, downloader_request_bytes 128 | ) VALUES %s; 129 | """ 130 | 131 | # Data tuple to insert 132 | data = ( 133 | stats.get("start_time"), 134 | stats.get("finish_time"), 135 | stats.get("elapsed_time_seconds"), 136 | stats.get("item_scraped_count", 0), 137 | stats.get("finish_reason"), 138 | stats.get("log_count/DEBUG", 0), 139 | stats.get("log_count/INFO", 0), 140 | stats.get("log_count/ERROR", 0), 141 | stats.get("memusage/startup"), 142 | stats.get("memusage/max"), 143 | stats.get("scheduler/enqueued/memory"), 144 | stats.get("downloader/request_count"), 145 | stats.get("downloader/response_count"), 146 | stats.get("response_received_count"), 147 | stats.get("downloader/request_method_count/GET"), 148 | stats.get("downloader/request_bytes"), 149 | ) 150 | cur = None 151 | conn = None 152 | try: 153 | # Connect to your database 154 | conn = psycopg2.connect(POSTGRES_URI) 155 | cur = conn.cursor() 156 | 157 | # Execute the insert statement 158 | execute_values(cur, insert_sql, [data]) 159 | 160 | # Commit the transaction 161 | conn.commit() 162 | 163 | logger.info("Stats successfully saved to PostgreSQL") 164 | except Exception as e: 165 | logger.error(f"An error occurred: {e}") 166 | finally: 167 | if cur is not None: 168 | cur.close() 169 | 170 | if conn is not None: 171 | conn.close() 172 | 173 | 174 | class RightmoveScraperDownloaderMiddleware: 175 | # Not all methods need to be defined. If a method is not defined, 176 | # scrapy acts as if the downloader middleware does not modify the 177 | # passed objects. 178 | 179 | @classmethod 180 | def from_crawler(cls, crawler): 181 | # This method is used by Scrapy to create your spiders. 182 | s = cls() 183 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 184 | return s 185 | 186 | def process_request(self, request, spider): 187 | # Called for each request that goes through the downloader 188 | # middleware. 189 | 190 | # Must either: 191 | # - return None: continue processing this request 192 | # - or return a Response object 193 | # - or return a Request object 194 | # - or raise IgnoreRequest: process_exception() methods of 195 | # installed downloader middleware will be called 196 | return None 197 | 198 | def process_response(self, request, response, spider): 199 | # Called with the response returned from the downloader. 200 | 201 | # Must either; 202 | # - return a Response object 203 | # - return a Request object 204 | # - or raise IgnoreRequest 205 | return response 206 | 207 | def process_exception(self, request, exception, spider): 208 | # Called when a download handler or a process_request() 209 | # (from other downloader middleware) raises an exception. 
210 | 211 | # Must either: 212 | # - return None: continue processing this exception 213 | # - return a Response object: stops process_exception() chain 214 | # - return a Request object: stops process_exception() chain 215 | pass 216 | 217 | def spider_opened(self, spider): 218 | spider.logger.info("Spider opened: %s" % spider.name) 219 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | x-airflow-common: 4 | &airflow-common 5 | build: 6 | context: ./rightmove/orchestration/airflow_app/ 7 | env_file: 8 | - .env 9 | environment: 10 | &airflow-common-env 11 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 12 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 13 | # AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: ${POSTGRES_URI:-}/airflow 14 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 15 | # AIRFLOW__CORE__SQL_ALCHEMY_CONN: ${POSTGRES_URI:-}/airflow 16 | AIRFLOW__CORE__FERNET_KEY: '' 17 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 18 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 19 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 20 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 21 | AIRFLOW_UID: ${AIRFLOW_UID:-50000} 22 | GOOGLE_APPLICATION_CREDENTIALS: /opt/airflow/credentials/airflow-service-account.json 23 | GCP_GCS_BUCKET: ${GCP_GCS_BUCKET:-} 24 | 25 | volumes: 26 | - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags 27 | - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs 28 | - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins 29 | - /var/run/docker.sock:/var/run/docker.sock 30 | - /Users/alexander.girardet/Code/Personal/projects/rightmove_project/credentials:/opt/airflow/credentials 31 | user: "${AIRFLOW_UID:-50000}:0" 32 | depends_on: 33 | &airflow-common-depends-on 34 | postgres: 35 | condition: service_healthy 36 | networks: 37 | - backend 38 | 39 | services: 40 | 41 | scrapy_app: 42 | build: 43 | context: ./rightmove/data_ingestion/rightmove_scraper/ 44 | ports: 45 | - "6800:6800" 46 | networks: 47 | - backend 48 | env_file: 49 | - .env 50 | 51 | # streamlit_app: 52 | # build: 53 | # context: ./rightmove/dashboard/ 54 | # depends_on: 55 | # - mongodb 56 | # ports: 57 | # - "8501:8501" 58 | # environment: 59 | # GOOGLE_APPLICATION_CREDENTIALS: /tmp/keys/credentials/airflow-service-account.json 60 | # env_file: 61 | # - .env 62 | # volumes: 63 | # - /Users/alexander.girardet/Code/Personal/projects/rightmove_project/credentials:/tmp/keys/credentials 64 | # networks: 65 | # - frontend 66 | # - backend 67 | # 68 | fastapi_app: 69 | build: 70 | context: ./rightmove/backend/ 71 | ports: 72 | - "8000:8000" 73 | environment: 74 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/keys/credentials/airflow-service-account.json 75 | env_file: 76 | - .env 77 | volumes: 78 | - /Users/alexander.girardet/Code/Personal/projects/rightmove_project/credentials:/tmp/keys/credentials 79 | networks: 80 | - frontend 81 | - backend 82 | 83 | mlflow-server: 84 | build: 85 | context: ./rightmove/mlflow/ 86 | ports: 87 | - "5001:5001" 88 | environment: 89 | MLFLOW_BACKEND_STORE_URI: ${POSTGRES_URI:-}/mlflow 90 | MLFLOW_ARTIFACTS_DESTINATION: ${GCS_ARTIFACT_BUCKET:-} 91 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/keys/credentials/airflow-service-account.json 92 | volumes: 93 | - 
/Users/alexander.girardet/Code/Personal/projects/rightmove_project/credentials:/tmp/keys/credentials 94 | networks: 95 | - backend 96 | 97 | postgres: 98 | image: postgres:13 99 | environment: 100 | POSTGRES_USER: airflow 101 | POSTGRES_PASSWORD: airflow 102 | POSTGRES_DB: airflow 103 | volumes: 104 | - postgres-db-volume:/var/lib/postgresql/data 105 | healthcheck: 106 | test: [ "CMD", "pg_isready", "-U", "airflow" ] 107 | interval: 5s 108 | retries: 5 109 | ports: 110 | - "5432:5432" 111 | restart: always 112 | networks: 113 | - backend 114 | 115 | grafana: 116 | image: grafana/grafana 117 | user: "472" 118 | ports: 119 | - "3000:3000" 120 | volumes: 121 | - ./rightmove/monitoring/config/grafana_datasources.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro 122 | - ./rightmove/monitoring/config/grafana_dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:ro 123 | - ./rightmove/monitoring/config/dashboards:/var/lib/grafana/dashboards:ro 124 | networks: 125 | - backend 126 | env_file: 127 | - .env 128 | restart: always 129 | 130 | airflow-webserver: 131 | <<: *airflow-common 132 | command: webserver 133 | ports: 134 | - "8080:8080" 135 | healthcheck: 136 | test: 137 | [ 138 | "CMD", 139 | "curl", 140 | "--fail", 141 | "http://localhost:8080/health" 142 | ] 143 | interval: 10s 144 | timeout: 10s 145 | retries: 5 146 | restart: always 147 | depends_on: 148 | <<: *airflow-common-depends-on 149 | airflow-init: 150 | condition: service_completed_successfully 151 | 152 | airflow-scheduler: 153 | <<: *airflow-common 154 | command: scheduler 155 | healthcheck: 156 | test: 157 | [ 158 | "CMD-SHELL", 159 | 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"' 160 | ] 161 | interval: 10s 162 | timeout: 10s 163 | retries: 5 164 | restart: always 165 | depends_on: 166 | <<: *airflow-common-depends-on 167 | airflow-init: 168 | condition: service_completed_successfully 169 | 170 | airflow-triggerer: 171 | <<: *airflow-common 172 | command: triggerer 173 | healthcheck: 174 | test: 175 | [ 176 | "CMD-SHELL", 177 | 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"' 178 | ] 179 | interval: 10s 180 | timeout: 10s 181 | retries: 5 182 | restart: always 183 | depends_on: 184 | <<: *airflow-common-depends-on 185 | airflow-init: 186 | condition: service_completed_successfully 187 | 188 | airflow-init: 189 | <<: *airflow-common 190 | entrypoint: /bin/bash 191 | # yamllint disable rule:line-length 192 | command: 193 | - -c 194 | - | 195 | function ver() { 196 | printf "%04d%04d%04d%04d" $${1//./ } 197 | } 198 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 199 | airflow_version_comparable=$$(ver $${airflow_version}) 200 | min_airflow_version=2.2.0 201 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 202 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 203 | echo 204 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 205 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 206 | echo 207 | exit 1 208 | fi 209 | if [[ -z "${AIRFLOW_UID}" ]]; then 210 | echo 211 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 212 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 213 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 
214 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 215 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 216 | echo 217 | fi 218 | one_meg=1048576 219 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 220 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 221 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 222 | warning_resources="false" 223 | if (( mem_available < 4000 )) ; then 224 | echo 225 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 226 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 227 | echo 228 | warning_resources="true" 229 | fi 230 | if (( cpus_available < 2 )); then 231 | echo 232 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 233 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 234 | echo 235 | warning_resources="true" 236 | fi 237 | if (( disk_available < one_meg * 10 )); then 238 | echo 239 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 240 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 241 | echo 242 | warning_resources="true" 243 | fi 244 | if [[ $${warning_resources} == "true" ]]; then 245 | echo 246 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 247 | echo "Please follow the instructions to increase amount of resources available:" 248 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 249 | echo 250 | fi 251 | mkdir -p /sources/logs /sources/dags /sources/plugins 252 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 253 | exec /entrypoint airflow version 254 | # yamllint enable rule:line-length 255 | environment: 256 | <<: *airflow-common-env 257 | _AIRFLOW_DB_UPGRADE: 'true' 258 | _AIRFLOW_WWW_USER_CREATE: 'true' 259 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 260 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 261 | _PIP_ADDITIONAL_REQUIREMENTS: '' 262 | user: "0:0" 263 | volumes: 264 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 265 | 266 | airflow-cli: 267 | <<: *airflow-common 268 | profiles: 269 | - debug 270 | environment: 271 | <<: *airflow-common-env 272 | CONNECTION_CHECK_MAX_COUNT: "0" 273 | command: 274 | - bash 275 | - -c 276 | - airflow 277 | networks: 278 | backend: 279 | frontend: 280 | 281 | volumes: 282 | mongodb_data: 283 | postgres-db-volume: 284 | my_db_volume: -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/data_processing/metric_extraction.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from datetime import datetime 3 | 4 | from evidently.report import Report 5 | from evidently.metric_preset import ( 6 | DataDriftPreset, 7 | TargetDriftPreset, 8 | RegressionPreset, 9 | DataQualityPreset, 10 | ) 11 | from evidently import ColumnMapping 12 | from evidently.metrics import * 13 | import logging 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | 17 | import os 18 | 19 | from dotenv import load_dotenv 20 | 21 | load_dotenv("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/.env") 22 | 23 | PG_URI = os.environ.get("MONITORING_URI_PG") 24 | 25 | 26 | 
class MetricExtraction: 27 | def __init__(self): 28 | self.conn = None 29 | self.cur = None 30 | 31 | def connect_to_postgres(self): 32 | self.conn = psycopg2.connect(dsn=PG_URI) 33 | self.cur = self.conn.cursor() 34 | 35 | def close_connection(self): 36 | self.conn.close() 37 | 38 | def extract_data_quality(self, quality_report): 39 | # Initialize a dictionary to store the results 40 | summary_dict = { 41 | "walk_score": {}, 42 | "price": {}, 43 | "bedrooms": {}, 44 | "bathrooms": {}, 45 | } 46 | 47 | nans_by_columns = {} 48 | 49 | for metric in quality_report["metrics"]: 50 | if metric["metric"] == "DatasetSummaryMetric": 51 | nans_by_columns = metric["result"]["current"]["nans_by_columns"] 52 | continue 53 | 54 | if "column_name" in metric["result"]: 55 | column_name = metric["result"]["column_name"] 56 | 57 | if column_name in summary_dict: 58 | summary_dict[column_name]["reference_mean"] = metric["result"][ 59 | "reference_characteristics" 60 | ]["mean"] 61 | summary_dict[column_name]["current_mean"] = metric["result"][ 62 | "current_characteristics" 63 | ]["mean"] 64 | summary_dict[column_name]["current_count"] = metric["result"][ 65 | "current_characteristics" 66 | ]["count"] 67 | summary_dict[column_name]["current_nulls"] = nans_by_columns.get( 68 | column_name, 0 69 | ) 70 | 71 | return summary_dict 72 | 73 | def extract_drift(self, drift_report): 74 | share_of_drifted_columns = drift_report["metrics"][0]["result"][ 75 | "share_of_drifted_columns" 76 | ] 77 | dataset_drift_binary = drift_report["metrics"][0]["result"]["dataset_drift"] 78 | target_drift_score = drift_report["metrics"][1]["result"]["drift_by_columns"][ 79 | "target" 80 | ]["drift_score"] 81 | target_drift_detected = drift_report["metrics"][1]["result"][ 82 | "drift_by_columns" 83 | ]["target"]["drift_detected"] 84 | 85 | summary_dict = { 86 | "share_of_drifted_columns": share_of_drifted_columns, 87 | "dataset_drift_binary": dataset_drift_binary, 88 | "target_drift_score": target_drift_score, 89 | "target_drift_detected": target_drift_detected, 90 | } 91 | 92 | return summary_dict 93 | 94 | def extract_performance(self, performance_report): 95 | reference_r2 = performance_report["metrics"][0]["result"]["reference"][ 96 | "r2_score" 97 | ] 98 | reference_rmse = performance_report["metrics"][0]["result"]["reference"]["rmse"] 99 | reference_mean_error = performance_report["metrics"][0]["result"]["reference"][ 100 | "mean_error" 101 | ] 102 | reference_mean_abs_error = performance_report["metrics"][0]["result"][ 103 | "reference" 104 | ]["mean_abs_error"] 105 | 106 | current_r2 = performance_report["metrics"][0]["result"]["current"]["r2_score"] 107 | current_rmse = performance_report["metrics"][0]["result"]["current"]["rmse"] 108 | current_mean_error = performance_report["metrics"][0]["result"]["current"][ 109 | "mean_error" 110 | ] 111 | current_mean_abs_error = performance_report["metrics"][0]["result"]["current"][ 112 | "mean_abs_error" 113 | ] 114 | 115 | summary_dict = { 116 | "reference": { 117 | "r2": reference_r2, 118 | "rmse": reference_rmse, 119 | "mean_error": reference_mean_error, 120 | "mean_abs_error": reference_mean_abs_error, 121 | }, 122 | "current": { 123 | "r2": current_r2, 124 | "rmse": current_rmse, 125 | "mean_error": current_mean_error, 126 | "mean_abs_error": current_mean_abs_error, 127 | }, 128 | } 129 | 130 | return summary_dict 131 | 132 | def extract_prediction(self, prediction_report): 133 | prediction_drift_score = prediction_report["metrics"][0]["result"][ 134 | "drift_score" 135 | ] 136 | 
prediction_drift_detected = prediction_report["metrics"][0]["result"][ 137 | "drift_detected" 138 | ] 139 | 140 | summary_dict = { 141 | "prediction_drift_score": prediction_drift_score, 142 | "prediction_drift_detected": prediction_drift_detected, 143 | } 144 | 145 | return summary_dict 146 | 147 | def load_metrics_to_postgres( 148 | self, data_dict, metric_category, loading_timestamp=None 149 | ): 150 | insert_query = """ 151 | INSERT INTO model_metrics (metric_category, metric_name, metric_value, metric_status, created_at) 152 | VALUES (%s, %s, %s, %s, %s) 153 | """ 154 | 155 | if self.cur is None: 156 | self.connect_to_postgres() 157 | 158 | if loading_timestamp: 159 | current_timestamp = loading_timestamp 160 | else: 161 | current_timestamp = datetime.now() 162 | 163 | for key, value in data_dict.items(): 164 | if isinstance( 165 | value, dict 166 | ): # For nested dictionaries like in 'extract_means' 167 | for sub_key, sub_value in value.items(): 168 | # Determine if sub_value is a boolean and assign appropriately 169 | if isinstance(sub_value, bool): 170 | self.cur.execute( 171 | insert_query, 172 | ( 173 | metric_category, 174 | f"{key}_{sub_key}", 175 | None, 176 | sub_value, 177 | current_timestamp, 178 | ), 179 | ) 180 | else: 181 | self.cur.execute( 182 | insert_query, 183 | ( 184 | metric_category, 185 | f"{key}_{sub_key}", 186 | sub_value, 187 | None, 188 | current_timestamp, 189 | ), 190 | ) 191 | else: 192 | # Check if the value is boolean and assign to metric_status instead of metric_value 193 | if isinstance(value, bool): 194 | self.cur.execute( 195 | insert_query, 196 | (metric_category, key, None, value, current_timestamp), 197 | ) 198 | else: 199 | # Assuming all non-dict and non-boolean values should be treated as numeric 200 | self.cur.execute( 201 | insert_query, 202 | (metric_category, key, value, None, current_timestamp), 203 | ) 204 | 205 | self.conn.commit() 206 | 207 | def get_target_drift_metrics(self, current_data, reference_data): 208 | target_drift_report = Report(metrics=[ColumnDriftMetric("target")]) 209 | 210 | target_drift_report.run( 211 | reference_data=reference_data, current_data=current_data 212 | ) 213 | 214 | predict_drift_report_dict = target_drift_report.as_dict() 215 | 216 | prediction_data = self.extract_prediction(predict_drift_report_dict) 217 | return prediction_data 218 | 219 | def get_performance_metrics(self, current_data, reference_data): 220 | reg_performance_report = Report( 221 | metrics=[ 222 | RegressionQualityMetric(), 223 | ] 224 | ) 225 | 226 | reg_performance_report.run( 227 | reference_data=reference_data, current_data=current_data 228 | ) 229 | 230 | reg_performance_dict = reg_performance_report.as_dict() 231 | 232 | performance_data = self.extract_performance(reg_performance_dict) 233 | return performance_data 234 | 235 | def get_data_drift_metrics(self, current_data, reference_data): 236 | data_drift_report = Report( 237 | metrics=[ 238 | DataDriftPreset(), 239 | ] 240 | ) 241 | 242 | data_drift_report.run(reference_data=reference_data, current_data=current_data) 243 | 244 | data_drift_report_dict = data_drift_report.as_dict() 245 | 246 | drift_data = self.extract_drift(data_drift_report_dict) 247 | return drift_data 248 | 249 | def get_data_quality_metrics(self, current_data, reference_data): 250 | column_mapping = ColumnMapping() 251 | 252 | current_data = current_data[["bedrooms", "bathrooms", "walk_score", "target"]] 253 | reference_data = reference_data[ 254 | ["bedrooms", "bathrooms", "walk_score", "target"] 255 | ] 
256 | 257 | numerical_features = ["bedrooms", "bathrooms", "walk_score"] 258 | 259 | column_mapping.numerical_features = numerical_features 260 | column_mapping.target = "target" 261 | 262 | data_quality_report = Report(metrics=[DataQualityPreset()]) 263 | 264 | data_quality_report.run( 265 | current_data=current_data, reference_data=reference_data 266 | ) 267 | 268 | data_quality_report_dict = data_quality_report.as_dict() 269 | 270 | quality_data = self.extract_data_quality(data_quality_report_dict) 271 | return quality_data 272 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/ml_monitoring.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python_operator import PythonOperator 3 | from airflow.operators.dummy_operator import DummyOperator 4 | 5 | from rightmove.data_processing.data_processor import DataPreprocessor 6 | from rightmove.data_processing.metric_extraction import MetricExtraction 7 | 8 | import re 9 | 10 | 11 | from pymongo import MongoClient 12 | import pandas as pd 13 | from datetime import datetime, timedelta 14 | from dotenv import load_dotenv 15 | from google.cloud import storage 16 | import os 17 | import random 18 | import requests 19 | import logging 20 | 21 | import mlflow 22 | 23 | client = storage.Client() 24 | bucket = client.get_bucket("rightmove-artifacts-ml") 25 | 26 | MONITORING_URI_PG = os.environ.get("MONITORING_URI_PG") 27 | 28 | mlflow.set_tracking_uri(MONITORING_URI_PG) 29 | 30 | experiment_name = "rightmove-prediction" 31 | mlflow.set_experiment(experiment_name) 32 | 33 | ML_SERVING_URL = "http://fastapi_app:8000/batch-predict" 34 | 35 | logging.basicConfig(level=logging.INFO) 36 | 37 | # load_dotenv("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/.env") 38 | MONGO_URI = os.environ.get("MONGO_URI") 39 | 40 | default_args = { 41 | "owner": "airflow_app", 42 | "depends_on_past": False, 43 | "email_on_failure": False, 44 | "email_on_retry": False, 45 | "retries": 1, 46 | "retry_delay": timedelta(minutes=5), 47 | } 48 | 49 | def modify_uri_to_test(uri: str) -> str: 50 | parts = uri.split('/') 51 | filename = parts[-1] 52 | new_filename = re.sub(r'(train|val|test)\.csv', 'test.csv', filename) 53 | parts[-1] = new_filename 54 | new_uri = '/'.join(parts) 55 | return new_uri 56 | 57 | 58 | 59 | def fetch_reference_df(): 60 | response = requests.get("http://fastapi_app:8000/latest-dataset") 61 | latest_uri = response.json().get("uri") 62 | 63 | test_uri = modify_uri_to_test(latest_uri) 64 | 65 | 66 | reference_data = pd.read_csv( 67 | test_uri, index_col=0 68 | ) 69 | return reference_data 70 | 71 | 72 | def preprocess_data(property_df, walkscore_df): 73 | preprocessor = DataPreprocessor(with_text=False, with_binary=False) 74 | 75 | property_df = preprocessor.preprocess_properties(property_df) 76 | walk_df = preprocessor.preprocess_walk_score(walkscore_df) 77 | 78 | df = property_df.merge(walk_df, on="id", how="left") 79 | 80 | logging.info("Data preprocessed") 81 | 82 | return df 83 | 84 | 85 | def load_data_from_mongo(collection_name, fields, timestamp_field): 86 | client = MongoClient(MONGO_URI) 87 | 88 | db = client["rightmove"] 89 | 90 | collection = db[collection_name] 91 | 92 | cutoff_time = datetime.now() - timedelta(hours=12) 93 | cutoff_unix = cutoff_time.timestamp() 94 | 95 | query = {timestamp_field: {"$gt": cutoff_unix}} 96 | 97 | data = 
collection.find(query, fields) 98 | 99 | return pd.DataFrame(list(data)) 100 | 101 | 102 | def fetch_latest_batch(): 103 | property_fields = { 104 | "id": 1, 105 | "bedrooms": 1, 106 | "bathrooms": 1, 107 | "location": 1, 108 | "price": 1, 109 | "listingUpdate": 1, 110 | "firstVisibleDate": 1, 111 | } 112 | property_df = load_data_from_mongo( 113 | "properties", property_fields, "extraction_timestamp" 114 | ) 115 | 116 | walk_score_fields = {"id": 1, "scores": 1} 117 | walk_score_df = load_data_from_mongo( 118 | "walk_scores", walk_score_fields, "processing_timestamp" 119 | ) 120 | 121 | df = preprocess_data(property_df, walk_score_df) 122 | 123 | df = df[["bedrooms", "bathrooms", "price", "longitude", "latitude", "walk_score"]] 124 | 125 | return df 126 | 127 | 128 | def load_predictions_from_gcs(folder_name): 129 | current_data = pd.read_csv( 130 | f"gs://rightmove-artifacts-ml/predictions/{folder_name}/current.csv", 131 | index_col=0, 132 | ) 133 | reference_data = pd.read_csv( 134 | f"gs://rightmove-artifacts-ml/predictions/{folder_name}/reference.csv", 135 | index_col=0, 136 | ) 137 | 138 | return current_data, reference_data 139 | 140 | 141 | def monitor_datasets(**kwargs): 142 | synthetic_data = False 143 | 144 | if "ti" in kwargs: 145 | ti = kwargs["ti"] 146 | folder_name = ti.xcom_pull(task_ids="load_predictions_to_gcs") 147 | else: 148 | folder_name = kwargs.get("folder_name") 149 | synthetic_data = kwargs.get("synthetic_data") 150 | 151 | current_data, reference_data = load_predictions_from_gcs(folder_name) 152 | 153 | current_data = current_data[['bedrooms', 'bathrooms', 'longitude', 'latitude', 'walk_score', 'prediction', 'target']] 154 | reference_data = reference_data[['bedrooms', 'bathrooms', 'longitude', 'latitude', 'walk_score', 'prediction', 'target']] 155 | 156 | metric_extractor = MetricExtraction() 157 | 158 | metric_extractor.connect_to_postgres() 159 | 160 | performance_data = metric_extractor.get_performance_metrics( 161 | current_data, reference_data 162 | ) 163 | 164 | prediction_data = metric_extractor.get_target_drift_metrics( 165 | current_data, reference_data 166 | ) 167 | 168 | drift_data = metric_extractor.get_data_drift_metrics(current_data, reference_data) 169 | 170 | quality_data = metric_extractor.get_data_quality_metrics( 171 | current_data, reference_data 172 | ) 173 | 174 | if synthetic_data: 175 | fake_timestamp = datetime.now() - timedelta(days=random.randint(0, 30)) 176 | else: 177 | fake_timestamp = None 178 | 179 | metric_extractor.load_metrics_to_postgres( 180 | prediction_data, "prediction_drift", loading_timestamp=fake_timestamp 181 | ) 182 | metric_extractor.load_metrics_to_postgres( 183 | performance_data, "performance", loading_timestamp=fake_timestamp 184 | ) 185 | metric_extractor.load_metrics_to_postgres( 186 | drift_data, "drift", loading_timestamp=fake_timestamp 187 | ) 188 | metric_extractor.load_metrics_to_postgres( 189 | quality_data, "quality", loading_timestamp=fake_timestamp 190 | ) 191 | logging.info("Metrics loaded to Postgres") 192 | 193 | metric_extractor.close_connection() 194 | logging.info("Connection to Postgres closed") 195 | 196 | 197 | def predict_properties(properties_features): 198 | try: 199 | response = requests.post(ML_SERVING_URL, json=properties_features) 200 | if response.status_code != 200: 201 | raise ValueError("Request failed") 202 | else: 203 | predictions = response.json().get("predictions") 204 | return predictions 205 | except Exception as e: 206 | raise e 207 | 208 | 209 | def 
generate_predictions(current_data=None, reference_data=None): 210 | if current_data is None: 211 | raise ValueError("No current data set") 212 | 213 | # new_logged_model = 'runs:/5c5b195cf1b74219993b436489545b7a/random-forest' # Replace with latest model from API 214 | # new_logged_model = mlflow.pyfunc.load_model(new_logged_model) 215 | 216 | current_features = current_data[ 217 | ["bedrooms", "bathrooms", "longitude", "latitude", "walk_score"] 218 | ].to_dict("records") 219 | reference_features = reference_data[ 220 | ["bedrooms", "bathrooms", "longitude", "latitude", "walk_score"] 221 | ].to_dict("records") 222 | 223 | current_predictions = predict_properties(current_features) 224 | reference_predictions = predict_properties(reference_features) 225 | 226 | # reference_data['predictions'] = new_logged_model.predict(reference_data.drop(columns=['price'])) 227 | # 228 | # current_data['predictions'] = new_logged_model.predict(current_data.drop(columns=['price'])) 229 | 230 | current_data["predictions"] = current_predictions 231 | 232 | reference_data["predictions"] = reference_predictions 233 | 234 | logging.info("Predictions generated") 235 | 236 | return current_data, reference_data 237 | 238 | 239 | def load_df_to_gcs(df, dest_path): 240 | blob = bucket.blob(dest_path) 241 | try: 242 | blob.upload_from_string(df.to_csv(), "text/csv") 243 | logging.info(f"Data uploaded to {dest_path}") 244 | return True 245 | except Exception as e: 246 | print(e) 247 | 248 | 249 | def load_data_from_gcs(source_url): 250 | logging.info(f"Loading {source_url} from GCS") 251 | df = pd.read_csv(source_url, index_col=0) 252 | return df 253 | 254 | 255 | def generate_foldername(): 256 | now = datetime.now() 257 | return now.strftime("%Y-%m-%d-%H-%M-%S") 258 | 259 | 260 | def load_predictions_to_gcs(): 261 | logging.info("Fetching data") 262 | current_data = fetch_latest_batch() 263 | reference_data = fetch_reference_df() 264 | 265 | logging.info("Generating predictions") 266 | current_data, reference_data = generate_predictions( 267 | current_data=current_data, reference_data=reference_data 268 | ) 269 | 270 | current_data = current_data.rename( 271 | columns={"predictions": "prediction", "price": "target"} 272 | ) 273 | reference_data = reference_data.rename( 274 | columns={"predictions": "prediction", "price": "target"} 275 | ) 276 | 277 | folder_name = generate_foldername() 278 | parent_folder = "predictions" 279 | 280 | load_df_to_gcs(current_data, f"{parent_folder}/{folder_name}/current.csv") 281 | load_df_to_gcs(reference_data, f"{parent_folder}/{folder_name}/reference.csv") 282 | 283 | logging.info("Data loaded to GCS") 284 | 285 | return folder_name 286 | 287 | 288 | dag = DAG( 289 | "monitor_ml_performance_rightmove", 290 | default_args=default_args, 291 | description="DAG for monitoring ML performance for rightmove", 292 | schedule_interval=timedelta(days=1), 293 | start_date=datetime(2023, 1, 1), 294 | catchup=False, 295 | max_active_runs=1, 296 | ) 297 | 298 | start_task = DummyOperator(task_id="start", dag=dag) 299 | 300 | load_predictions_to_gcs_task = PythonOperator( 301 | task_id="load_predictions_to_gcs", python_callable=load_predictions_to_gcs, dag=dag 302 | ) 303 | 304 | monitor_datasets_task = PythonOperator( 305 | task_id="monitor_datasets", 306 | python_callable=monitor_datasets, 307 | provide_context=True, 308 | dag=dag, 309 | ) 310 | 311 | end_task = DummyOperator(task_id="end", dag=dag) 312 | 313 | start_task >> load_predictions_to_gcs_task >> monitor_datasets_task >> end_task 314 | 
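# Usage sketch: monitor_datasets() can be driven in two ways. Inside the DAG it
# pulls the GCS folder name from XCom (task_id "load_predictions_to_gcs"); for
# ad-hoc or backfill runs it also accepts its inputs directly as keyword
# arguments, for example:
#
#     folder_name = load_predictions_to_gcs()
#     monitor_datasets(folder_name=folder_name, synthetic_data=True)
#
# With synthetic_data=True each metric row is stamped with a randomised
# timestamp from the last 30 days, which allows the model_metrics table to be
# backfilled with historical-looking data.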
315 | if __name__ == "__main__": 316 | # folder_name = load_predictions_to_gcs() 317 | # monitor_datasets(folder_name=folder_name) 318 | 319 | response = requests.get("http://localhost:8000/latest-dataset") 320 | latest_uri = response.json().get("uri") 321 | 322 | test_uri = modify_uri_to_test(latest_uri) 323 | 324 | print(test_uri) 325 | # 326 | # import logging 327 | # import pandas as pd 328 | 329 | # def split_df_into_chunks(df, chunk_size=500): 330 | # """Yield successive chunks of rows from df.""" 331 | # for i in range(0, df.shape[0], chunk_size): 332 | # yield df.iloc[i:i + chunk_size] 333 | # 334 | # 335 | # def load_chunk_to_gcs(chunk, parent_folder, folder_name, base_filename, chunk_index): 336 | # """Load a single chunk of DataFrame to GCS, with a unique filename.""" 337 | # filename = f"{base_filename}_part{chunk_index}.csv" 338 | # path = f"{parent_folder}/{folder_name}/{filename}" 339 | # # This function should be defined to handle the actual loading process to GCS 340 | # load_df_to_gcs(chunk, path) 341 | # 342 | # 343 | # logging.info("Fetching data") 344 | # current_data = fetch_latest_batch() 345 | # reference_data = fetch_reference_df() 346 | # 347 | # logging.info("Generating predictions") 348 | # current_data, reference_data = generate_predictions(current_data=current_data, reference_data=reference_data) 349 | # 350 | # current_data = current_data.rename(columns={"predictions": "prediction", "price": "target"}) 351 | # reference_data = reference_data.rename(columns={"predictions": "prediction", "price": "target"}) 352 | # 353 | # folder_name = generate_foldername() 354 | # parent_folder = "predictions" 355 | # 356 | # # Split and load current_data 357 | # for index, chunk in enumerate(split_df_into_chunks(current_data)): 358 | # monitor_datasets(chunk, reference_data) 359 | # # load_chunk_to_gcs(chunk, parent_folder, folder_name, "current", index) 360 | 361 | logging.info("Data loaded to GCS") 362 | -------------------------------------------------------------------------------- /notebooks/resources/data/property.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": 142547498, 3 | "bedrooms": 4, 4 | "bathrooms": 5, 5 | "numberOfImages": 28, 6 | "numberOfFloorplans": 0, 7 | "numberOfVirtualTours": 0, 8 | "summary": "Stonehouse Lettings are delighted to offer the opportunity to Lease this impressive detached family home which is available on a part furnished basis. All rooms offer comfortable and modern living throughout. The welcoming central hallway allows access to most rooms within the property. The main feature is the two separate staircases leading to the first floor along with the high vaulted ceilings. The lounge is exceptionally spacious and overlooks the front and the rear of the property. The drawing room is generously proportioned and also overlooks the front. The dining kitchen has been fitted with a range of luxury base and wall units and comes complete with integrated appliances. It should be noted the family room is on semi open plan. Utility room and Cloakroom WC are also located on the ground floor. On the first floor there are 4 generously proportioned double bedrooms which all benefit from separate en suite facilities. The master bedroom is has been tastefully decorated and fitted with a range of wardrobes. A particular feature is the external balcony. Externally the garden grounds are enclosed and mainly laid to lawn. 
A large driveway leads to the detached double garage which is equipped with power and light. Gas central heating and double glazed windows. Early viewing is highly recommended.", 9 | "displayAddress": "Kepplestone Gardens, West End, Aberdeen, AB15", 10 | "countryCode": "GB", 11 | "location": { 12 | "latitude": 57.137373, 13 | "longitude": -2.14488 14 | }, 15 | "propertyImages": { 16 | "images": [ 17 | { 18 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_476x317.jpeg", 19 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_00_0000.jpeg", 20 | "caption": "Picture No. 36" 21 | }, 22 | { 23 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_01_0000_max_476x317.jpeg", 24 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_01_0000.jpeg", 25 | "caption": "Picture No. 07" 26 | }, 27 | { 28 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_02_0000_max_476x317.jpeg", 29 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_02_0000.jpeg", 30 | "caption": "Picture No. 12" 31 | }, 32 | { 33 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_03_0000_max_476x317.jpeg", 34 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_03_0000.jpeg", 35 | "caption": "Picture No. 13" 36 | }, 37 | { 38 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_04_0000_max_476x317.jpeg", 39 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_04_0000.jpeg", 40 | "caption": "Picture No. 10" 41 | }, 42 | { 43 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_05_0000_max_476x317.jpeg", 44 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_05_0000.jpeg", 45 | "caption": "Picture No. 11" 46 | }, 47 | { 48 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_06_0000_max_476x317.jpeg", 49 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_06_0000.jpeg", 50 | "caption": "Picture No. 08" 51 | }, 52 | { 53 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_07_0000_max_476x317.jpeg", 54 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_07_0000.jpeg", 55 | "caption": "Picture No. 09" 56 | }, 57 | { 58 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_08_0000_max_476x317.jpeg", 59 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_08_0000.jpeg", 60 | "caption": "Picture No. 24" 61 | }, 62 | { 63 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_09_0000_max_476x317.jpeg", 64 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_09_0000.jpeg", 65 | "caption": "Picture No. 25" 66 | }, 67 | { 68 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_10_0000_max_476x317.jpeg", 69 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_10_0000.jpeg", 70 | "caption": "Picture No. 26" 71 | }, 72 | { 73 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_11_0000_max_476x317.jpeg", 74 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_11_0000.jpeg", 75 | "caption": "Picture No. 
27" 76 | }, 77 | { 78 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_12_0000_max_476x317.jpeg", 79 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_12_0000.jpeg", 80 | "caption": "Picture No. 28" 81 | }, 82 | { 83 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_13_0000_max_476x317.jpeg", 84 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_13_0000.jpeg", 85 | "caption": "Picture No. 29" 86 | }, 87 | { 88 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_14_0000_max_476x317.jpeg", 89 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_14_0000.jpeg", 90 | "caption": "Picture No. 16" 91 | }, 92 | { 93 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_15_0000_max_476x317.jpeg", 94 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_15_0000.jpeg", 95 | "caption": "Picture No. 17" 96 | }, 97 | { 98 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_16_0000_max_476x317.jpeg", 99 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_16_0000.jpeg", 100 | "caption": "Picture No. 18" 101 | }, 102 | { 103 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_17_0000_max_476x317.jpeg", 104 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_17_0000.jpeg", 105 | "caption": "Picture No. 19" 106 | }, 107 | { 108 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_18_0000_max_476x317.jpeg", 109 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_18_0000.jpeg", 110 | "caption": "Picture No. 20" 111 | }, 112 | { 113 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_19_0000_max_476x317.jpeg", 114 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_19_0000.jpeg", 115 | "caption": "Picture No. 22" 116 | }, 117 | { 118 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_20_0000_max_476x317.jpeg", 119 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_20_0000.jpeg", 120 | "caption": "Picture No. 23" 121 | }, 122 | { 123 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_21_0000_max_476x317.jpeg", 124 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_21_0000.jpeg", 125 | "caption": "Picture No. 30" 126 | }, 127 | { 128 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_22_0000_max_476x317.jpeg", 129 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_22_0000.jpeg", 130 | "caption": "Picture No. 31" 131 | }, 132 | { 133 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_23_0000_max_476x317.jpeg", 134 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_23_0000.jpeg", 135 | "caption": "Picture No. 32" 136 | }, 137 | { 138 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_24_0000_max_476x317.jpeg", 139 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_24_0000.jpeg", 140 | "caption": "Picture No. 
15" 141 | }, 142 | { 143 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_25_0000_max_476x317.jpeg", 144 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_25_0000.jpeg", 145 | "caption": "Picture No. 14" 146 | }, 147 | { 148 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_26_0000_max_476x317.jpeg", 149 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_26_0000.jpeg", 150 | "caption": "Picture No. 34" 151 | }, 152 | { 153 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_27_0000_max_476x317.jpeg", 154 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_27_0000.jpeg", 155 | "caption": "Picture No. 35" 156 | } 157 | ], 158 | "mainImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_476x317.jpeg", 159 | "mainMapImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_296x197.jpeg" 160 | }, 161 | "propertySubType": "Detached", 162 | "listingUpdate": { 163 | "listingUpdateReason": "new", 164 | "listingUpdateDate": "2023-11-29T18:57:03Z" 165 | }, 166 | "premiumListing": false, 167 | "featuredProperty": true, 168 | "price": { 169 | "amount": 2915, 170 | "frequency": "monthly", 171 | "currencyCode": "GBP", 172 | "displayPrices": [ 173 | { 174 | "displayPrice": "\u00a32,915 pcm", 175 | "displayPriceQualifier": "" 176 | }, 177 | { 178 | "displayPrice": "\u00a3673 pw", 179 | "displayPriceQualifier": "" 180 | } 181 | ] 182 | }, 183 | "customer": { 184 | "branchId": 89488, 185 | "brandPlusLogoURI": "/brand/brand_rmchoice_logo_89714_0002.jpeg", 186 | "contactTelephone": "020 3840 3898", 187 | "branchDisplayName": "DJ Alexander, Aberdeen", 188 | "branchName": "Aberdeen", 189 | "brandTradingName": "DJ Alexander", 190 | "branchLandingPageUrl": "/estate-agents/agent/DJ-Alexander/Aberdeen-89488.html", 191 | "development": false, 192 | "showReducedProperties": true, 193 | "commercial": false, 194 | "showOnMap": true, 195 | "enhancedListing": false, 196 | "developmentContent": null, 197 | "buildToRent": false, 198 | "buildToRentBenefits": [], 199 | "brandPlusLogoUrl": "https://media.rightmove.co.uk:443/dir/brand/brand_rmchoice_logo_89714_0002_max_100x50.jpeg" 200 | }, 201 | "distance": null, 202 | "transactionType": "rent", 203 | "productLabel": { 204 | "productLabelText": "", 205 | "spotlightLabel": false 206 | }, 207 | "commercial": false, 208 | "development": false, 209 | "residential": true, 210 | "students": false, 211 | "auction": false, 212 | "feesApply": false, 213 | "feesApplyText": null, 214 | "displaySize": "", 215 | "showOnMap": true, 216 | "propertyUrl": "/properties/142547498#/?channel=RES_LET", 217 | "contactUrl": "/property-to-rent/contactBranch.html?propertyId=142547498", 218 | "staticMapUrl": null, 219 | "channel": "RENT", 220 | "firstVisibleDate": "2023-11-29T18:51:24Z", 221 | "keywords": [], 222 | "keywordMatchType": "no_keyword", 223 | "saved": false, 224 | "hidden": false, 225 | "onlineViewingsAvailable": false, 226 | "lozengeModel": { 227 | "matchingLozenges": [] 228 | }, 229 | "hasBrandPlus": true, 230 | "displayStatus": "", 231 | "enquiredTimestamp": null, 232 | "heading": "Featured Property", 233 | "isRecent": false, 234 | "enhancedListing": false, 235 | "formattedBranchName": " by DJ Alexander, Aberdeen", 236 | "formattedDistance": "", 237 | "propertyTypeFullDescription": "4 bedroom 
detached house", 238 | "addedOrReduced": "Added on 29/11/2023", 239 | "feature_list": [ 240 | "* Unfurnished", 241 | "* FOUR bedrooms", 242 | "* West End Location", 243 | "* Double Garage", 244 | "* Garden", 245 | "* Gas Central Heating", 246 | "* landlord reg: 255737/100/1558", 247 | "* Council Tax H" 248 | ] 249 | } -------------------------------------------------------------------------------- /notebooks/resources/data/.ipynb_checkpoints/property-checkpoint.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": 142547498, 3 | "bedrooms": 4, 4 | "bathrooms": 5, 5 | "numberOfImages": 28, 6 | "numberOfFloorplans": 0, 7 | "numberOfVirtualTours": 0, 8 | "summary": "Stonehouse Lettings are delighted to offer the opportunity to Lease this impressive detached family home which is available on a part furnished basis. All rooms offer comfortable and modern living throughout. The welcoming central hallway allows access to most rooms within the property. The main feature is the two separate staircases leading to the first floor along with the high vaulted ceilings. The lounge is exceptionally spacious and overlooks the front and the rear of the property. The drawing room is generously proportioned and also overlooks the front. The dining kitchen has been fitted with a range of luxury base and wall units and comes complete with integrated appliances. It should be noted the family room is on semi open plan. Utility room and Cloakroom WC are also located on the ground floor. On the first floor there are 4 generously proportioned double bedrooms which all benefit from separate en suite facilities. The master bedroom is has been tastefully decorated and fitted with a range of wardrobes. A particular feature is the external balcony. Externally the garden grounds are enclosed and mainly laid to lawn. A large driveway leads to the detached double garage which is equipped with power and light. Gas central heating and double glazed windows. Early viewing is highly recommended.", 9 | "displayAddress": "Kepplestone Gardens, West End, Aberdeen, AB15", 10 | "countryCode": "GB", 11 | "location": { 12 | "latitude": 57.137373, 13 | "longitude": -2.14488 14 | }, 15 | "propertyImages": { 16 | "images": [ 17 | { 18 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_476x317.jpeg", 19 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_00_0000.jpeg", 20 | "caption": "Picture No. 36" 21 | }, 22 | { 23 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_01_0000_max_476x317.jpeg", 24 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_01_0000.jpeg", 25 | "caption": "Picture No. 07" 26 | }, 27 | { 28 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_02_0000_max_476x317.jpeg", 29 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_02_0000.jpeg", 30 | "caption": "Picture No. 12" 31 | }, 32 | { 33 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_03_0000_max_476x317.jpeg", 34 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_03_0000.jpeg", 35 | "caption": "Picture No. 13" 36 | }, 37 | { 38 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_04_0000_max_476x317.jpeg", 39 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_04_0000.jpeg", 40 | "caption": "Picture No. 
10" 41 | }, 42 | { 43 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_05_0000_max_476x317.jpeg", 44 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_05_0000.jpeg", 45 | "caption": "Picture No. 11" 46 | }, 47 | { 48 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_06_0000_max_476x317.jpeg", 49 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_06_0000.jpeg", 50 | "caption": "Picture No. 08" 51 | }, 52 | { 53 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_07_0000_max_476x317.jpeg", 54 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_07_0000.jpeg", 55 | "caption": "Picture No. 09" 56 | }, 57 | { 58 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_08_0000_max_476x317.jpeg", 59 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_08_0000.jpeg", 60 | "caption": "Picture No. 24" 61 | }, 62 | { 63 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_09_0000_max_476x317.jpeg", 64 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_09_0000.jpeg", 65 | "caption": "Picture No. 25" 66 | }, 67 | { 68 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_10_0000_max_476x317.jpeg", 69 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_10_0000.jpeg", 70 | "caption": "Picture No. 26" 71 | }, 72 | { 73 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_11_0000_max_476x317.jpeg", 74 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_11_0000.jpeg", 75 | "caption": "Picture No. 27" 76 | }, 77 | { 78 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_12_0000_max_476x317.jpeg", 79 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_12_0000.jpeg", 80 | "caption": "Picture No. 28" 81 | }, 82 | { 83 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_13_0000_max_476x317.jpeg", 84 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_13_0000.jpeg", 85 | "caption": "Picture No. 29" 86 | }, 87 | { 88 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_14_0000_max_476x317.jpeg", 89 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_14_0000.jpeg", 90 | "caption": "Picture No. 16" 91 | }, 92 | { 93 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_15_0000_max_476x317.jpeg", 94 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_15_0000.jpeg", 95 | "caption": "Picture No. 17" 96 | }, 97 | { 98 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_16_0000_max_476x317.jpeg", 99 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_16_0000.jpeg", 100 | "caption": "Picture No. 18" 101 | }, 102 | { 103 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_17_0000_max_476x317.jpeg", 104 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_17_0000.jpeg", 105 | "caption": "Picture No. 
19" 106 | }, 107 | { 108 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_18_0000_max_476x317.jpeg", 109 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_18_0000.jpeg", 110 | "caption": "Picture No. 20" 111 | }, 112 | { 113 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_19_0000_max_476x317.jpeg", 114 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_19_0000.jpeg", 115 | "caption": "Picture No. 22" 116 | }, 117 | { 118 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_20_0000_max_476x317.jpeg", 119 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_20_0000.jpeg", 120 | "caption": "Picture No. 23" 121 | }, 122 | { 123 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_21_0000_max_476x317.jpeg", 124 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_21_0000.jpeg", 125 | "caption": "Picture No. 30" 126 | }, 127 | { 128 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_22_0000_max_476x317.jpeg", 129 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_22_0000.jpeg", 130 | "caption": "Picture No. 31" 131 | }, 132 | { 133 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_23_0000_max_476x317.jpeg", 134 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_23_0000.jpeg", 135 | "caption": "Picture No. 32" 136 | }, 137 | { 138 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_24_0000_max_476x317.jpeg", 139 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_24_0000.jpeg", 140 | "caption": "Picture No. 15" 141 | }, 142 | { 143 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_25_0000_max_476x317.jpeg", 144 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_25_0000.jpeg", 145 | "caption": "Picture No. 14" 146 | }, 147 | { 148 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_26_0000_max_476x317.jpeg", 149 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_26_0000.jpeg", 150 | "caption": "Picture No. 34" 151 | }, 152 | { 153 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_27_0000_max_476x317.jpeg", 154 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_27_0000.jpeg", 155 | "caption": "Picture No. 
35" 156 | } 157 | ], 158 | "mainImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_476x317.jpeg", 159 | "mainMapImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_296x197.jpeg" 160 | }, 161 | "propertySubType": "Detached", 162 | "listingUpdate": { 163 | "listingUpdateReason": "new", 164 | "listingUpdateDate": "2023-11-29T18:57:03Z" 165 | }, 166 | "premiumListing": false, 167 | "featuredProperty": true, 168 | "price": { 169 | "amount": 2915, 170 | "frequency": "monthly", 171 | "currencyCode": "GBP", 172 | "displayPrices": [ 173 | { 174 | "displayPrice": "\u00a32,915 pcm", 175 | "displayPriceQualifier": "" 176 | }, 177 | { 178 | "displayPrice": "\u00a3673 pw", 179 | "displayPriceQualifier": "" 180 | } 181 | ] 182 | }, 183 | "customer": { 184 | "branchId": 89488, 185 | "brandPlusLogoURI": "/brand/brand_rmchoice_logo_89714_0002.jpeg", 186 | "contactTelephone": "020 3840 3898", 187 | "branchDisplayName": "DJ Alexander, Aberdeen", 188 | "branchName": "Aberdeen", 189 | "brandTradingName": "DJ Alexander", 190 | "branchLandingPageUrl": "/estate-agents/agent/DJ-Alexander/Aberdeen-89488.html", 191 | "development": false, 192 | "showReducedProperties": true, 193 | "commercial": false, 194 | "showOnMap": true, 195 | "enhancedListing": false, 196 | "developmentContent": null, 197 | "buildToRent": false, 198 | "buildToRentBenefits": [], 199 | "brandPlusLogoUrl": "https://media.rightmove.co.uk:443/dir/brand/brand_rmchoice_logo_89714_0002_max_100x50.jpeg" 200 | }, 201 | "distance": null, 202 | "transactionType": "rent", 203 | "productLabel": { 204 | "productLabelText": "", 205 | "spotlightLabel": false 206 | }, 207 | "commercial": false, 208 | "development": false, 209 | "residential": true, 210 | "students": false, 211 | "auction": false, 212 | "feesApply": false, 213 | "feesApplyText": null, 214 | "displaySize": "", 215 | "showOnMap": true, 216 | "propertyUrl": "/properties/142547498#/?channel=RES_LET", 217 | "contactUrl": "/property-to-rent/contactBranch.html?propertyId=142547498", 218 | "staticMapUrl": null, 219 | "channel": "RENT", 220 | "firstVisibleDate": "2023-11-29T18:51:24Z", 221 | "keywords": [], 222 | "keywordMatchType": "no_keyword", 223 | "saved": false, 224 | "hidden": false, 225 | "onlineViewingsAvailable": false, 226 | "lozengeModel": { 227 | "matchingLozenges": [] 228 | }, 229 | "hasBrandPlus": true, 230 | "displayStatus": "", 231 | "enquiredTimestamp": null, 232 | "heading": "Featured Property", 233 | "isRecent": false, 234 | "enhancedListing": false, 235 | "formattedBranchName": " by DJ Alexander, Aberdeen", 236 | "formattedDistance": "", 237 | "propertyTypeFullDescription": "4 bedroom detached house", 238 | "addedOrReduced": "Added on 29/11/2023", 239 | "feature_list": [ 240 | "* Unfurnished", 241 | "* FOUR bedrooms", 242 | "* West End Location", 243 | "* Double Garage", 244 | "* Garden", 245 | "* Gas Central Heating", 246 | "* landlord reg: 255737/100/1558", 247 | "* Council Tax H" 248 | ] 249 | } --------------------------------------------------------------------------------