├── rightmove
│   ├── backend
│   │   ├── __init__.py
│   │   ├── app
│   │   │   ├── __init__.py
│   │   │   ├── models
│   │   │   │   ├── __init__.py
│   │   │   │   ├── coordinates.py
│   │   │   │   ├── property.py
│   │   │   │   └── pricing_category.py
│   │   │   ├── data_processing
│   │   │   │   ├── __init__.py
│   │   │   │   ├── DataPreprocessor.py
│   │   │   │   └── walk_score_processing.py
│   │   │   └── main.py
│   │   ├── requirements.txt
│   │   ├── Dockerfile
│   │   ├── fastapi.yaml
│   │   └── tests
│   │       └── integration_test.py
│   ├── dashboard
│   │   ├── streamlit
│   │   │   ├── __init__.py
│   │   │   ├── data_processing
│   │   │   │   ├── __init__.py
│   │   │   │   └── processing.py
│   │   │   ├── pages
│   │   │   │   ├── 05_WordCloud.py
│   │   │   │   ├── 04_MachineLearning.py
│   │   │   │   ├── 03_WalkScore.py
│   │   │   │   └── 02_Price.py
│   │   │   └── 01_LandingPage.py
│   │   ├── requirements.txt
│   │   └── Dockerfile
│   ├── data_ingestion
│   │   ├── rightmove_scraper
│   │   │   ├── rightmove_scraper
│   │   │   │   ├── __init__.py
│   │   │   │   ├── spiders
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── rightmove.py
│   │   │   │   ├── items.py
│   │   │   │   ├── pipelines.py
│   │   │   │   ├── settings.py
│   │   │   │   └── middlewares.py
│   │   │   ├── requirements.txt
│   │   │   ├── scrapyd.conf
│   │   │   ├── scrapy.cfg
│   │   │   ├── setup.py
│   │   │   └── Dockerfile
│   │   └── scrapy.yaml
│   ├── orchestration
│   │   └── airflow_app
│   │       ├── dags
│   │       │   └── rightmove
│   │       │       ├── data_processing
│   │       │       │   ├── __init__.py
│   │       │       │   ├── data_processor.py
│   │       │       │   ├── rightmove_processing.py
│   │       │       │   └── metric_extraction.py
│   │       │       ├── visualization_data.py
│   │       │       ├── rightmove_ingest.py
│   │       │       ├── train_model.py
│   │       │       └── ml_monitoring.py
│   │       ├── requirements.txt
│   │       ├── setup.py
│   │       └── Dockerfile
│   ├── mlflow
│   │   ├── requirements.txt
│   │   └── Dockerfile
│   └── monitoring
│       └── config
│           ├── grafana_datasources.yaml
│           └── grafana_dashboards.yaml
├── config.yaml
├── static
│   └── images
│       ├── mlops_pipeline.png
│       ├── model_monitoring.png
│       ├── scrapy_monitoring.png
│       ├── Rightmove extraction.png
│       ├── model_training_pipeline.png
│       └── Processing_pipeline_rightmove.png
├── infrastructure
│   ├── aws
│   │   ├── variables.tf
│   │   ├── main.tf
│   │   └── database.tf
│   └── gcp
│       ├── bucket.tf
│       └── main.tf
├── setup.py
├── .gitignore
├── notebooks
│   ├── data_processing
│   │   └── process_boundaries.py
│   ├── data_ingestion
│   │   ├── fetch_outcodes.ipynb
│   │   ├── .ipynb_checkpoints
│   │   │   └── fetch_outcodes-checkpoint.ipynb
│   │   └── scrapy_connection.ipynb
│   ├── data_storage
│   │   ├── mongo_integration.ipynb
│   │   └── .ipynb_checkpoints
│   │       └── mongo_integration-checkpoint.ipynb
│   └── resources
│       └── data
│           ├── property_1.json
│           ├── property.json
│           └── .ipynb_checkpoints
│               └── property-checkpoint.json
├── README.md
├── requirements.txt
└── docker-compose.yaml

/rightmove/backend/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/rightmove/backend/app/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/rightmove/backend/app/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/rightmove/dashboard/streamlit/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/rightmove/backend/app/data_processing/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/rightmove/dashboard/streamlit/data_processing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/data_processing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rightmove/mlflow/requirements.txt: -------------------------------------------------------------------------------- 1 | mlflow 2 | psycopg2 3 | google-cloud-storage 4 | boto3 -------------------------------------------------------------------------------- /rightmove/dashboard/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | streamlit 3 | numpy 4 | requests 5 | google-cloud-storage 6 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: config 5 | data: 6 | mongo-url: "mongodb://mongodb:27017/" 7 | -------------------------------------------------------------------------------- /static/images/mlops_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/mlops_pipeline.png -------------------------------------------------------------------------------- /static/images/model_monitoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/model_monitoring.png -------------------------------------------------------------------------------- /static/images/scrapy_monitoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/scrapy_monitoring.png -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | pymongo 3 | scrapyd 4 | scrapyd-client 5 | beautifulsoup4 6 | psycopg2-binary -------------------------------------------------------------------------------- /static/images/Rightmove extraction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/Rightmove extraction.png -------------------------------------------------------------------------------- /static/images/model_training_pipeline.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/model_training_pipeline.png -------------------------------------------------------------------------------- /infrastructure/aws/variables.tf: -------------------------------------------------------------------------------- 1 | variable "db_password" { 2 | description = "The database admin password" 3 | type = string 4 | sensitive = true 5 | } 6 | -------------------------------------------------------------------------------- /static/images/Processing_pipeline_rightmove.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alexandergirardet/london_rightmove/HEAD/static/images/Processing_pipeline_rightmove.png -------------------------------------------------------------------------------- /infrastructure/gcp/bucket.tf: -------------------------------------------------------------------------------- 1 | resource "google_storage_bucket" "rightmove-artifacts-ml" { 2 | name = "rightmove-artifacts-ml" 3 | location = "europe-west2" 4 | } 5 | -------------------------------------------------------------------------------- /rightmove/backend/app/models/coordinates.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ValidationError, validator 2 | 3 | 4 | class Coordinates(BaseModel): 5 | longitude: float 6 | latitude: float 7 | -------------------------------------------------------------------------------- /rightmove/backend/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | pydantic 3 | uvicorn 4 | scipy 5 | pandas 6 | numpy 7 | pymongo 8 | mlflow==2.10.2 9 | pytest 10 | psycopg2 11 | google-cloud-storage 12 | scikit-learn==1.3.2 -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/scrapyd.conf: -------------------------------------------------------------------------------- 1 | [scrapyd] 2 | bind_address= 0.0.0.0 3 | http_port = 6800 4 | eggs_dir = /scrapyd/eggs 5 | logs_dir = /scrapyd/logs 6 | items_dir = /scrapyd/items 7 | dbs_dir = /scrapyd/dbs -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='rightmove_scraper', 5 | version='0.1', 6 | package_dir={'': 'src'}, # Tells setuptools that packages are under src 7 | packages=find_packages(where='src'), 8 | ) -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/requirements.txt: -------------------------------------------------------------------------------- 1 | awscli 2 | requests 3 | beautifulsoup4 4 | pendulum 5 | apache-airflow 6 | pymongo 7 | scikit-learn 8 | psycopg2-binary 9 | pandas 10 | numpy 11 | mlflow 12 | apache-beam 13 | evidently 14 | scipy 15 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="data_processing", 5 | version="0.1.0", 6 | packages=find_packages(), 7 | description="Data processing module for Airflow pipelines", 8 | ) 9 | 
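--------------------------------------------------------------------------------
[Usage sketch, not a file from the repository] The Coordinates model in
/rightmove/backend/app/models/coordinates.py is the request body for the backend's
/walk_score endpoint. A minimal illustration of how pydantic validates it, assuming
it is run from rightmove/backend/ so the app package is importable:

    from pydantic import ValidationError
    from app.models.coordinates import Coordinates

    # Numeric strings are coerced to float by pydantic.
    coords = Coordinates(longitude="-0.1276", latitude=51.5072)
    print(coords.dict())  # {'longitude': -0.1276, 'latitude': 51.5072}

    # Non-numeric input is rejected with a ValidationError.
    try:
        Coordinates(longitude="not-a-number", latitude=51.5072)
    except ValidationError as exc:
        print(exc)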
-------------------------------------------------------------------------------- /rightmove/backend/app/models/property.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel, ValidationError, validator 2 | 3 | 4 | class Property(BaseModel): 5 | bedrooms: float 6 | bathrooms: float 7 | longitude: float 8 | latitude: float 9 | walk_score: float 10 | -------------------------------------------------------------------------------- /infrastructure/aws/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 4.16" 6 | } 7 | } 8 | 9 | required_version = ">= 1.2.0" 10 | } 11 | 12 | provider "aws" { 13 | region = "eu-west-2" 14 | } 15 | -------------------------------------------------------------------------------- /rightmove/backend/Dockerfile: -------------------------------------------------------------------------------- 1 | # 2 | FROM python:3.9 3 | 4 | # 5 | WORKDIR /code 6 | 7 | # 8 | COPY requirements.txt /code/requirements.txt 9 | 10 | # 11 | RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt 12 | 13 | # 14 | COPY app /code/app 15 | 16 | # 17 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class RightmoveScraperItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = rightmove_scraper.settings 8 | 9 | [deploy:development] 10 | url = http://localhost:6800/ 11 | project = rightmove_scraper 12 | 13 | 14 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapyd-deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name="rightmove_scraper", 7 | version="1.0", 8 | packages=find_packages(), 9 | entry_points={"scrapy": ["settings = rightmove_scraper.settings"]}, 10 | package_data={"rightmove_scraper": ["resources/data/*.csv"]}, 11 | ) 12 | -------------------------------------------------------------------------------- /infrastructure/aws/database.tf: -------------------------------------------------------------------------------- 1 | resource "aws_db_instance" "realestate-database" { 2 | # These fields are examples; modify them according to your existing resource's configuration 3 | allocated_storage = 20 4 | engine = "postgres" 5 | engine_version = "12.3" 6 | instance_class = "db.t3.micro" 7 | username = "postgres" 8 | password = var.db_password 9 | } 10 | 
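--------------------------------------------------------------------------------
[Usage sketch, not a file from the repository] /rightmove/backend/app/models/property.py
defines the request body for the backend's /predict endpoint, and /rightmove/backend/Dockerfile
serves the API with uvicorn on port 8000. A minimal request sketch, assuming the image is
running locally with port 8000 published and a model registered in MLflow as app/main.py
expects:

    import requests

    payload = {
        "bedrooms": 2,
        "bathrooms": 1,
        "longitude": -0.06,
        "latitude": 51.53,
        "walk_score": 85,
    }

    resp = requests.post("http://localhost:8000/predict", json=payload, timeout=30)
    resp.raise_for_status()
    print(resp.json())  # e.g. {"prediction": <estimated yearly rent>}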
--------------------------------------------------------------------------------
/rightmove/dashboard/Dockerfile:
--------------------------------------------------------------------------------
# app/Dockerfile

FROM python:3.9-slim

WORKDIR /code

COPY requirements.txt /code/requirements.txt

RUN pip3 install -r /code/requirements.txt

COPY streamlit /code/app

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

# The Streamlit entry script is the landing page at the root of the streamlit/ app
ENTRYPOINT ["streamlit", "run", "app/01_LandingPage.py", "--server.port=8501", "--server.address=0.0.0.0"]
--------------------------------------------------------------------------------
/rightmove/backend/app/models/pricing_category.py:
--------------------------------------------------------------------------------
from pydantic import BaseModel, validator


class PricingCategory(BaseModel):
    category: str

    # Optional: validator to provide a more specific error message.
    # Raise ValueError here; pydantic wraps it into a ValidationError for the caller.
    @validator("category")
    def check_category(cls, v):
        if v not in ["Cheap", "Average", "Expensive"]:
            raise ValueError('Pricing must be "Cheap", "Average", or "Expensive"')
        return v
--------------------------------------------------------------------------------
/rightmove/monitoring/config/grafana_datasources.yaml:
--------------------------------------------------------------------------------
# config file version
apiVersion: 1

# list of datasources to insert/update
# available in the database
datasources:
  - name: grafana-postgresql-datasource
    type: postgres
    access: proxy
    url: realestate-database.czkkjkojmucd.eu-west-2.rds.amazonaws.com:5432
    database: monitoring
    user: postgres
    secureJsonData:
      password: 'postgres'
    jsonData:
      sslmode: 'require'
--------------------------------------------------------------------------------
/infrastructure/gcp/main.tf:
--------------------------------------------------------------------------------
terraform {
  required_providers {
    google = {
      source  = "hashicorp/google"
      version = "4.51.0"
    }
  }
}

provider "google" {
  credentials = file("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/credentials/airflow-service-account.json")

  project = "personal-projects-411616"
  region  = "europe-west2"
  zone    = "europe-west2-a"
}

#resource "google_compute_network" "vpc_network" {
#  name = "terraform-network"
#}
--------------------------------------------------------------------------------
/rightmove/mlflow/Dockerfile:
--------------------------------------------------------------------------------
# Start from a base image with Python installed
FROM python:3.9

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file into the container
COPY requirements.txt /app

# Install mlflow and dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Expose the default MLflow server port
EXPOSE 5001

# Start the MLflow server when the container starts
CMD mlflow server --backend-store-uri $MLFLOW_BACKEND_STORE_URI --default-artifact-root $MLFLOW_ARTIFACTS_DESTINATION --host 0.0.0.0 --port 5001
--------------------------------------------------------------------------------
/rightmove/data_ingestion/rightmove_scraper/Dockerfile:
-------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.8-slim 3 | 4 | # Copy the requirements file into the container at /usr/src/app 5 | COPY requirements.txt ./ 6 | 7 | # Install any needed packages specified in requirements.txt 8 | # (Assuming requirements.txt includes scrapy and scrapyd) 9 | RUN pip install --no-cache-dir -r requirements.txt 10 | 11 | RUN mkdir /etc/scrapyd 12 | RUN mkdir -p /scrapyd/logs 13 | 14 | # Make port 6800 available to the world outside this container 15 | # (scrapyd default port) 16 | EXPOSE 6800 17 | 18 | COPY . . 19 | 20 | # Run scrapyd when the container launches 21 | CMD ["scrapyd"] -------------------------------------------------------------------------------- /rightmove/monitoring/config/grafana_dashboards.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'My Dashboards' # A friendly name for this provisioning configuration 5 | orgId: 1 # The ID of the Org in Grafana where you want to provision the dashboards 6 | folder: '' # The name of the folder where you want these dashboards to appear. Leave empty for the General folder. 7 | type: file # The type of the provider. In this case, 'file' for file-based provisioning. 8 | disableDeletion: false # Whether Grafana should delete dashboards not in the JSON files. 9 | updateIntervalSeconds: 10 # How often Grafana will scan for changed dashboard files. 10 | options: 11 | path: /var/lib/grafana/dashboards 12 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/scrapy.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: scrapy-deployment 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: scrapy 10 | template: 11 | metadata: 12 | labels: 13 | app: scrapy 14 | spec: 15 | containers: 16 | - name: scrapy 17 | image: 18 | ports: 19 | - containerPort: 6800 20 | env: 21 | - name: MONGO_URL 22 | value: "mongodb://:27017/" 23 | 24 | --- 25 | 26 | apiVersion: v1 27 | kind: Service 28 | metadata: 29 | name: scrapy-service 30 | spec: 31 | type: NodePort 32 | selector: 33 | app: scrapy 34 | ports: 35 | - protocol: TCP 36 | port: 6800 37 | targetPort: 6800 -------------------------------------------------------------------------------- /rightmove/backend/fastapi.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: fastapi-deployment 5 | labels: 6 | app: fastapi 7 | spec: 8 | replicas: 1 9 | selector: 10 | matchLabels: 11 | app: fastapi 12 | template: 13 | metadata: 14 | labels: 15 | app: fastapi 16 | spec: 17 | containers: 18 | - name: webapp 19 | image: alexgirardet123/fastapi:latest 20 | ports: 21 | - containerPort: 80 22 | env: 23 | - name: MONGO_DB_URL 24 | valueFrom: 25 | configMapKeyRef: 26 | name: mongo-config 27 | key: mongo-url 28 | 29 | --- 30 | 31 | apiVersion: v1 32 | kind: Service 33 | metadata: 34 | name: fastapi-service # End point to access fastapi 35 | spec: 36 | type: NodePort # External Service type 37 | selector: # Selects the pods to forward the requests to. Forwards to pods by their label. 38 | app: fastapi 39 | ports: 40 | - protocol: TCP 41 | port: 80 42 | targetPort: 80 # The port of the pods that belong to the service. 
The target port should be the same as container port 43 | nodePort: 30100 # Port to access the service from outside the cluster -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.com 4 | *.class 5 | *.dll 6 | *.exe 7 | *.o 8 | *.so 9 | 10 | # Pyenv 11 | rightmove_env 12 | 13 | # Packages # 14 | ############ 15 | # it's better to unpack these files and commit the raw source because packages can contain binary data 16 | *.7z 17 | *.dmg 18 | *.gz 19 | *.iso 20 | *.jar 21 | *.rar 22 | *.tar 23 | *.zip 24 | 25 | # Logs and databases # 26 | ###################### 27 | *.log 28 | *.sql 29 | *.sqlite 30 | 31 | # OS generated files # 32 | ###################### 33 | .DS_Store 34 | .DS_Store? 35 | ._* 36 | .Spotlight-V100 37 | .Trashes 38 | ehthumbs.db 39 | Thumbs.db 40 | 41 | # Editor directories and files # 42 | ################################ 43 | .idea 44 | *.swp 45 | *.swo 46 | *.sublime-workspace 47 | *.sublime-project 48 | .vscode/ 49 | *.code-workspace 50 | 51 | # build outputs # 52 | ################# 53 | bin/ 54 | obj/ 55 | out/ 56 | build/ 57 | dist/ 58 | *.dmg 59 | *.exe 60 | *.msi 61 | *.deb 62 | *.rpm 63 | *.tgz 64 | *.pkg 65 | 66 | # Dependencies # 67 | ################ 68 | # Node.js dependencies 69 | node_modules/ 70 | 71 | # Python # 72 | ########## 73 | # Byte-compiled / optimized / DLL files 74 | __pycache__/ 75 | *.py[cod] 76 | *$py.class 77 | *.egg-info 78 | *.benchmarks 79 | *.pytest_cache 80 | *.env 81 | -------------------------------------------------------------------------------- /rightmove/backend/tests/integration_test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | 4 | # Define the URL of your batch prediction endpoint 5 | url = "http://127.0.0.1:8000/batch-predict" 6 | 7 | # Create a DataFrame with your test properties. This should match the structure expected by your API. 8 | # For example, if your Property model expects 'size' and 'location', your DataFrame should reflect that. 
9 | df = pd.read_csv( 10 | "gs://rightmove-artifacts-ml/data/2024-02-17-14-18-14/test.csv", index_col=0 11 | ) 12 | 13 | df = df[["bedrooms", "bathrooms", "longitude", "latitude", "walk_score"]] 14 | 15 | print(df.head()) 16 | 17 | # Convert the DataFrame to a list of dictionaries 18 | properties_list = df.to_dict("records") 19 | 20 | 21 | # Make a POST request to the batch prediction endpoint 22 | response = requests.post(url, json=properties_list) 23 | 24 | # Check the status code to ensure the request was successful 25 | assert response.status_code == 200 26 | 27 | # Convert the response to JSON and retrieve the predictions 28 | predictions = response.json().get("predictions") 29 | 30 | # Perform any additional checks you need on the predictions 31 | # For example, check the number of predictions matches the number of input properties 32 | assert len(predictions) == len(properties_list) 33 | 34 | print("Predictions:", predictions) 35 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:latest 2 | 3 | ENV AIRFLOW_HOME=/opt/airflow 4 | 5 | USER root 6 | RUN apt-get update -qq && apt-get install vim -qqq 7 | # git gcc g++ -qqq 8 | 9 | COPY requirements.txt . 10 | 11 | USER $AIRFLOW_UID 12 | RUN pip install --no-cache-dir -r requirements.txt 13 | 14 | USER root 15 | 16 | # Ref: https://airflow.apache.org/docs/docker-stack/recipes.html 17 | 18 | SHELL ["/bin/bash", "-o", "pipefail", "-e", "-u", "-x", "-c"] 19 | 20 | ARG CLOUD_SDK_VERSION=322.0.0 21 | ENV GCLOUD_HOME=/home/google-cloud-sdk 22 | 23 | ENV PATH="${GCLOUD_HOME}/bin/:${PATH}" 24 | 25 | RUN apt-get update && apt-get install -y libpq-dev && rm -rf /var/lib/apt/lists/* 26 | 27 | RUN DOWNLOAD_URL="https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz" \ 28 | && TMP_DIR="$(mktemp -d)" \ 29 | && curl -fL "${DOWNLOAD_URL}" --output "${TMP_DIR}/google-cloud-sdk.tar.gz" \ 30 | && mkdir -p "${GCLOUD_HOME}" \ 31 | && tar xzf "${TMP_DIR}/google-cloud-sdk.tar.gz" -C "${GCLOUD_HOME}" --strip-components=1 \ 32 | && "${GCLOUD_HOME}/install.sh" \ 33 | --bash-completion=false \ 34 | --path-update=false \ 35 | --usage-reporting=false \ 36 | --quiet \ 37 | && rm -rf "${TMP_DIR}" \ 38 | && gcloud --version 39 | 40 | WORKDIR $AIRFLOW_HOME 41 | USER $AIRFLOW_UID -------------------------------------------------------------------------------- /notebooks/data_processing/process_boundaries.py: -------------------------------------------------------------------------------- 1 | from pyrosm import OSM, get_data 2 | import pandas as pd 3 | import requests 4 | import os 5 | pd.options.mode.chained_assignment = None 6 | import tempfile 7 | 8 | 9 | def process_geodata(): 10 | # file_path = f"../resources/boundary_date/data/{file_name}" 11 | file_name = "greater-london-latest.osm.pbf" 12 | file_path = "/Users/alexander.girardet/Code/Personal/projects/rightmove_project/data/greater-london-latest.osm.pbf" 13 | print(f"processing: {file_name}") 14 | 15 | osm = OSM(file_path) 16 | 17 | boundary_name = file_name.split(".osm")[0] 18 | 19 | boundary_df = osm.get_boundaries() 20 | boundary_df = boundary_df.rename(columns={"id": "boundary_id"}) 21 | 22 | output_filename = f'geodata/{boundary_name}.geojson' # Specifying the path in writable storage 23 | boundary_df.to_file(output_filename, driver='GeoJSON') 24 | print(f"Loaded 
{output_filename}") 25 | 26 | # already_processed = os.listdir("geodata") 27 | # processed_names = [name.split(".geojson")[0] for name in already_processed] 28 | # files = os.listdir("../resources/boundary_date/data") 29 | 30 | # for file_name in files: 31 | # name = file_name.split(".osm.pbf")[0] 32 | # if name not in processed_names: 33 | # if name != "scotland-latest": 34 | # try: 35 | # process_geodata(file_name) 36 | # except: 37 | # print(f"Failed to process: {name}") 38 | process_geodata() -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | from pymongo import MongoClient 10 | import os 11 | import datetime 12 | 13 | # MONGO_URL = "mongodb://mongodb:27017/" 14 | MONGO_URI = os.environ.get("MONGO_URI") 15 | 16 | 17 | class RightmoveScraperPipeline: 18 | def __init__(self): 19 | self.batch = [] 20 | 21 | self.client = MongoClient(MONGO_URI) 22 | db = self.client["rightmove"] 23 | self.collection = db["properties"] 24 | 25 | def process_item(self, item, spider): 26 | """ 27 | Sending items to MongoDB in batches to reduce I/O operations 28 | """ 29 | 30 | item["extraction_timestamp"] = datetime.datetime.utcnow().timestamp() 31 | 32 | self.batch.append(item) 33 | 34 | if len(self.batch) >= 50: # Batch size of file 35 | self.collection.insert_many(self.batch) 36 | self.batch = [] 37 | 38 | return item 39 | 40 | def close_spider(self, spider): 41 | print("SPIDER CLOSING...") 42 | 43 | if len(self.batch) > 0: 44 | self.collection.insert_many(self.batch) # Send remaining items 45 | 46 | self.client.close() 47 | -------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/pages/05_WordCloud.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import requests 3 | import pydeck as pdk 4 | from wordcloud import WordCloud 5 | import pandas as pd 6 | import geopandas 7 | import matplotlib.pyplot as plt 8 | import json 9 | 10 | 11 | @st.cache_data 12 | def load_data(): 13 | df = pd.read_parquet( 14 | "gs://rightmove-artifacts-ml/streamlit_data/2024-02-27-12-32-07/data.parquet" 15 | ) # 16 | return df 17 | 18 | 19 | df = load_data() 20 | 21 | 22 | def generate_wordcloud(text): 23 | wordcloud = WordCloud(width=800, height=400, background_color="white").generate( 24 | text 25 | ) 26 | fig, ax = plt.subplots(figsize=(10, 5)) 27 | ax.imshow(wordcloud) 28 | ax.axis("off") 29 | return fig 30 | 31 | 32 | def fetch_corpus(category, df): 33 | category_df = df[df["price_category"] == category] 34 | combined_text = " ".join(category_df["text"].tolist()) 35 | return combined_text 36 | 37 | 38 | st.title("Wordcloud Generator") 39 | 40 | # Category selection 41 | category = st.selectbox("Select a category:", ("Expensive", "Cheap", "Average")) 42 | 43 | corpus = fetch_corpus(category, df) 44 | 45 | # Implement word filter mechanism to accept multiple words 46 | filter_words = st.text_input( 47 | "Enter words to filter out (separated by commas) and regenerate wordcloud:" 48 | ) 49 | 50 | if filter_words: 51 | # Split 
the filter_words by commas, strip spaces, and convert to lowercase for case-insensitive comparison 52 | filter_words_list = [word.strip().lower() for word in filter_words.split(",")] 53 | # Filter out the words 54 | filtered_corpus = " ".join( 55 | [word for word in corpus.split() if word.lower() not in filter_words_list] 56 | ) 57 | else: 58 | # If no filter words are provided, use the original corpus 59 | filtered_corpus = corpus 60 | 61 | # Display the wordcloud 62 | st.write("Generated Wordcloud:") 63 | fig = generate_wordcloud(filtered_corpus) # Generate wordcloud with filtered corpus 64 | st.pyplot(fig) # Display the figure 65 | -------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/data_processing/processing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | class DataPreprocessor: 5 | def __init__(self, with_text=False, with_binary=False): 6 | self.with_text = with_text 7 | self.with_binary = with_binary 8 | 9 | @staticmethod 10 | def convert_frequencies(x): 11 | frequency = x["frequency"] 12 | price = x["amount"] 13 | 14 | if frequency == "monthly": 15 | return price * 12 16 | elif frequency == "weekly": 17 | return (price / 7) * 365 18 | elif frequency == "daily": 19 | return price * 365 20 | elif frequency == "quarterly": 21 | return price * 4 22 | else: # Yearly 23 | return price 24 | 25 | @staticmethod 26 | def remove_anomalies(df, percentile_threshold=0.99): 27 | percentile_thresholds = df[["price", "bedrooms", "bathrooms"]].quantile( 28 | percentile_threshold 29 | ) 30 | 31 | filtered_df = df[ 32 | (df["price"] <= percentile_thresholds["price"]) 33 | & (df["bedrooms"] <= percentile_thresholds["bedrooms"]) 34 | & (df["bathrooms"] <= percentile_thresholds["bathrooms"]) 35 | ] 36 | return filtered_df 37 | 38 | @staticmethod 39 | def merge_text(x): 40 | summary, feature_list = x[0], x[1] 41 | feature_list_joined = ", ".join(feature_list) if feature_list else "" 42 | return feature_list_joined + " , " + summary 43 | 44 | @staticmethod 45 | def label_price(price): 46 | if price < 8000: 47 | return "Cheap" 48 | elif price < 20_000: 49 | return "Average" 50 | else: 51 | return "Expensive" 52 | 53 | def preprocess_properties(self, df): 54 | df["longitude"] = df["location"].apply(lambda x: x["longitude"]) 55 | df["latitude"] = df["location"].apply(lambda x: x["latitude"]) 56 | df["price"] = df["price"].apply(self.convert_frequencies) 57 | 58 | if self.with_text: 59 | df["text"] = df[["summary", "feature_list"]].apply(self.merge_text, axis=1) 60 | if self.with_binary: 61 | df["commercial"] = df["commercial"].apply(lambda x: 1 if x else 0) 62 | df["development"] = df["development"].apply(lambda x: 1 if x else 0) 63 | df["students"] = df["students"].apply(lambda x: 1 if x else 0) 64 | 65 | df["price_category"] = df["price"].apply(self.label_price) 66 | df["listingUpdateReason"] = df["listingUpdate"].apply( 67 | lambda x: x["listingUpdateReason"] 68 | ) 69 | df["firstVisibleDate"] = pd.to_datetime(df["firstVisibleDate"], utc=True) 70 | df = self.remove_anomalies(df) 71 | df = df.drop(columns=["location", "_id", "listingUpdate"]) 72 | return df 73 | 74 | @staticmethod 75 | def preprocess_walk_score(df): 76 | df = df.drop_duplicates(subset=["id"]) 77 | df["walk_score"] = df["scores"].apply(lambda x: int(x["walk_score"])) 78 | df = df.drop(columns=["_id", "scores"]) 79 | return df 
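--------------------------------------------------------------------------------
[Usage sketch, not a file from the repository] The DataPreprocessor above expects raw
property documents as they come out of MongoDB, with nested price, location and
listingUpdate fields. A minimal illustration with a single made-up record, assuming it
is run from rightmove/dashboard/streamlit/ so the data_processing package is importable:

    import pandas as pd
    from data_processing.processing import DataPreprocessor

    raw = pd.DataFrame([{
        "_id": "mongo-object-id",                      # placeholder; dropped during preprocessing
        "id": 123456,
        "price": {"amount": 2500, "frequency": "monthly"},
        "location": {"longitude": -0.06, "latitude": 51.53},
        "bedrooms": 2,
        "bathrooms": 1,
        "summary": "Bright two bed flat close to the station.",
        "feature_list": ["Balcony", "Dishwasher"],
        "listingUpdate": {"listingUpdateReason": "new"},
        "firstVisibleDate": "2024-02-01T10:00:00Z",
    }])

    pre = DataPreprocessor(with_text=True, with_binary=False)
    clean = pre.preprocess_properties(raw)

    # The monthly price is annualised (2500 * 12 = 30000), which label_price
    # classifies as "Expensive".
    print(clean[["price", "price_category", "longitude", "latitude", "text"]])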
-------------------------------------------------------------------------------- /rightmove/backend/app/data_processing/DataPreprocessor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class DataPreprocessor: 5 | def __init__(self): 6 | pass 7 | 8 | @staticmethod 9 | def convert_frequencies(x): 10 | frequency = x["frequency"] 11 | price = x["amount"] 12 | 13 | if frequency == "monthly": 14 | return price * 12 15 | elif frequency == "weekly": 16 | return (price / 7) * 365 17 | elif frequency == "daily": 18 | return price * 365 19 | elif frequency == "quarterly": 20 | return price * 4 21 | else: # Yearly 22 | return price 23 | 24 | @staticmethod 25 | def remove_anomalies(df, percentile_threshold=0.99): 26 | percentile_thresholds = df[["price", "bedrooms", "bathrooms"]].quantile( 27 | percentile_threshold 28 | ) 29 | 30 | # Filter the dataset to remove anomalies above the 98th percentile 31 | filtered_df = df[ 32 | (df["price"] <= percentile_thresholds["price"]) 33 | & (df["bedrooms"] <= percentile_thresholds["bedrooms"]) 34 | & (df["bathrooms"] <= percentile_thresholds["bathrooms"]) 35 | ] 36 | return filtered_df 37 | 38 | @staticmethod 39 | def merge_text(x): 40 | summary, feature_list = x[0], x[1] 41 | feature_list_joined = ", ".join(feature_list) if feature_list else "" 42 | return feature_list_joined + " , " + summary 43 | 44 | def preprocess_properties_with_binary(self, df): 45 | df["longitude"] = df["location"].apply(lambda x: x["longitude"]) 46 | df["latitude"] = df["location"].apply(lambda x: x["latitude"]) 47 | df = df.drop(columns=["location"]) 48 | df["price"] = df["price"].apply(self.convert_frequencies) 49 | df["commercial"] = df["commercial"].apply(lambda x: 1 if x else 0) 50 | df["development"] = df["development"].apply(lambda x: 1 if x else 0) 51 | df["students"] = df["students"].apply(lambda x: 1 if x else 0) 52 | df["text"] = df[["summary", "feature_list"]].apply(self.merge_text, axis=1) 53 | df = self.remove_anomalies(df) 54 | return df 55 | 56 | @staticmethod 57 | def label_price(price): 58 | if price < 8000: 59 | return "Cheap" 60 | elif price < 20_000: 61 | return "Average" 62 | else: 63 | return "Expensive" 64 | 65 | def preprocess_properties(self, df): 66 | df["longitude"] = df["location"].apply(lambda x: x["longitude"]) 67 | df["latitude"] = df["location"].apply(lambda x: x["latitude"]) 68 | df = df.drop(columns=["location", "_id"]) 69 | df["price"] = df["price"].apply(self.convert_frequencies) 70 | df["text"] = df[["summary", "feature_list"]].apply(self.merge_text, axis=1) 71 | df["price_category"] = df["price"].apply(self.label_price) 72 | df["listingUpdateReason"] = df["listingUpdate"].apply( 73 | lambda x: x["listingUpdateReason"] 74 | ) 75 | df["firstVisibleDate"] = pd.to_datetime(df["firstVisibleDate"], utc=True) 76 | df = self.remove_anomalies(df) 77 | return df 78 | 79 | @staticmethod 80 | def preprocess_walk_score(df): 81 | df = df.drop_duplicates(subset=["id"]) 82 | df["walk_score"] = df["scores"].apply(lambda x: int(x["walk_score"])) 83 | df = df.drop(columns=["_id", "scores"]) 84 | return df 85 | 86 | @staticmethod 87 | def merge_dataframes(df, walk_df): 88 | merged_df = df.merge(walk_df, on="id", how="left") 89 | 90 | return merged_df 91 | -------------------------------------------------------------------------------- /rightmove/backend/app/data_processing/walk_score_processing.py: -------------------------------------------------------------------------------- 1 | from 
sklearn.neighbors import BallTree 2 | import math 3 | 4 | from math import radians 5 | import pandas as pd 6 | import numpy as np 7 | 8 | GCS_PARQUET_URL = ( 9 | "https://storage.googleapis.com/rightmove-resources-public/UK_pois.parquet" 10 | ) 11 | WALK_SCORES_COLLECTION = "walk_scores" 12 | 13 | 14 | class WalkScoreProcessor: 15 | def __init__(self): 16 | self.earth_radius = 6371000 # Earth radius in metres 17 | self.pois_df = pd.read_parquet(GCS_PARQUET_URL) 18 | self.ball_tree = BallTree( 19 | self.pois_df[["lon_rad", "lat_rad"]].values, metric="haversine" 20 | ) # What is the ball tree doing? 21 | self.amenity_weights = { 22 | "grocery": [3], 23 | "restaurants": [ 24 | 0.75, 25 | 0.45, 26 | 0.25, 27 | 0.25, 28 | 0.225, 29 | 0.225, 30 | 0.225, 31 | 0.225, 32 | 0.2, 33 | 0.2, 34 | ], 35 | "shopping": [0.5, 0.45, 0.4, 0.35, 0.3], 36 | "coffee": [1.25, 0.75], 37 | "banks": [1], 38 | "parks": [1], 39 | "schools": [1], 40 | "books": [1], 41 | "entertainment": [1], 42 | } 43 | 44 | def process_results_df(self, distance_series, pois_df): 45 | results_df = pd.DataFrame(distance_series) 46 | 47 | results_df = results_df.join(pois_df["amenities"], how="left") 48 | 49 | results_df["distance_in_metres"] = results_df["distance"].apply( 50 | lambda x: x * self.earth_radius 51 | ) 52 | 53 | results_df["distance_decayed"] = results_df["distance_in_metres"].apply( 54 | lambda x: float(self.distance_decay(x)) 55 | ) 56 | 57 | return results_df 58 | 59 | def distance_decay(self, distance): 60 | dist = distance / 1000 61 | score = math.e ** ((-5.0 * (dist / 4)) ** 5.0) 62 | return score 63 | 64 | def calculate_amenity_walk_score(self, property_distance_df, amenity, weights): 65 | k = len(weights) 66 | weight_array = np.array(weights) 67 | 68 | dist_array = ( 69 | property_distance_df[property_distance_df["amenities"] == amenity] 70 | .iloc[0:k]["distance_decayed"] 71 | .values 72 | ) 73 | dist_array_padded = np.pad( 74 | dist_array, (0, weight_array.size - dist_array.size), "constant" 75 | ) 76 | 77 | scores_array = dist_array_padded * weight_array 78 | 79 | amenity_score = scores_array.sum() 80 | 81 | return amenity_score 82 | 83 | def calculuate_walk_score(self, longitude, latitude): 84 | radian_longitude = radians(longitude) 85 | radian_latitude = radians(latitude) 86 | 87 | k = 100 # Maximum number of amenities to return 88 | 89 | distances, indices = self.ball_tree.query( 90 | [[radian_longitude, radian_latitude]], k=k, return_distance=True 91 | ) 92 | 93 | dist_series = pd.Series(distances[0], index=indices[0], name="distance") 94 | 95 | results_df = self.process_results_df(dist_series, self.pois_df) 96 | 97 | scores_dict = {} 98 | 99 | for key, values in self.amenity_weights.items(): 100 | amenity_score = self.calculate_amenity_walk_score(results_df, key, values) 101 | 102 | scores_dict[key] = amenity_score 103 | 104 | return scores_dict 105 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for rightmove_scraper project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = "rightmove_scraper" 11 | 12 | SPIDER_MODULES = ["rightmove_scraper.spiders"] 13 | NEWSPIDER_MODULE = "rightmove_scraper.spiders" 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | # USER_AGENT = "rightmove_scraper (+http://www.yourdomain.com)" 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | # CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | # DOWNLOAD_DELAY = 3 29 | # The download delay setting will honor only one of: 30 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | # CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | # COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | # TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | # DEFAULT_REQUEST_HEADERS = { 41 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 42 | # "Accept-Language": "en", 43 | # } 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | SPIDER_MIDDLEWARES = { 48 | "rightmove_scraper.middlewares.RightmoveScraperSpiderMiddleware": 543, 49 | } 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | # DOWNLOADER_MIDDLEWARES = { 54 | # "rightmove_scraper.middlewares.RightmoveScraperDownloaderMiddleware": 543, 55 | # } 56 | 57 | # Enable or disable extensions 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 59 | # EXTENSIONS = { 60 | # "scrapy.extensions.telnet.TelnetConsole": None, 61 | # } 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | ITEM_PIPELINES = { 66 | "rightmove_scraper.pipelines.RightmoveScraperPipeline": 300, 67 | } 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 71 | # AUTOTHROTTLE_ENABLED = True 72 | # The initial download delay 73 | # AUTOTHROTTLE_START_DELAY = 5 74 | # The maximum download delay to be set in case of high latencies 75 | # AUTOTHROTTLE_MAX_DELAY = 60 76 | # The average number of requests Scrapy should be sending in parallel to 77 | # each remote server 78 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 79 | # Enable showing throttling stats for every response received: 80 | # AUTOTHROTTLE_DEBUG = False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | # HTTPCACHE_ENABLED = True 85 | # HTTPCACHE_EXPIRATION_SECS = 0 86 | # HTTPCACHE_DIR = "httpcache" 87 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 88 | # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 89 | 90 | # LOG_ENABLED = True 91 | # LOG_LEVEL = 'INFO' 92 | # LOG_FILE = None 93 | 94 | # Set settings whose default value is 
deprecated to a future-proof value 95 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 96 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 97 | FEED_EXPORT_ENCODING = "utf-8" 98 | -------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/pages/04_MachineLearning.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import streamlit as st 4 | import requests 5 | import pandas as pd 6 | from streamlit_folium import folium_static 7 | import folium 8 | 9 | # Function to send the HTTP POST request 10 | def predict_property_value(features): 11 | url = "http://localhost:8000/predict" # Replace with your actual endpoint URL 12 | response = requests.post(url, json=features) 13 | return response.json() 14 | 15 | def get_walk_score(coordinates): 16 | print(coordinates) 17 | url = "http://localhost:8000/walk_score" # Replace with your actual endpoint URL 18 | response = requests.post(url, json=coordinates) 19 | return response.json() 20 | 21 | 22 | # Streamlit user interface setup 23 | st.title("Property Value Prediction") 24 | 25 | st.write( 26 | "Enter the property details below and choose to either generate a Walk Score based on the location or input it manually to see the impact on the property value prediction.") 27 | 28 | # Input fields for property features 29 | with st.form(key='property_details'): 30 | bedrooms = st.number_input("Number of Bedrooms", min_value=1, value=3) 31 | bathrooms = st.number_input("Number of Bathrooms", min_value=1, value=2) 32 | latitude = st.number_input("Latitude", value=51.53) 33 | longitude = st.number_input("Longitude", value=-0.06) 34 | 35 | # Instructions for Walk Score 36 | st.write("You can either generate a Walk Score based on the coordinates or input a Walk Score manually.") 37 | 38 | if st.form_submit_button("Generate Walk Score"): 39 | coordinates = {"longitude": longitude, "latitude": latitude} 40 | walk_score_generated = round(get_walk_score(coordinates)['walk_score'], 2) 41 | st.session_state.generated_walk_score = walk_score_generated 42 | st.success(f"Generated Walk Score: {walk_score_generated}") 43 | walk_score = st.number_input("Or Input Walk Score", min_value=0, value=50, key="manual_walk_score") 44 | 45 | # Use generated walk score if available, else use manual input 46 | final_walk_score = st.session_state.get('generated_walk_score', walk_score) 47 | 48 | submitted = st.form_submit_button("Confirm Inputs") 49 | if submitted: 50 | st.write("### Inputs for Prediction") 51 | st.write(f"- Number of Bedrooms: {bedrooms}") 52 | st.write(f"- Number of Bathrooms: {bathrooms}") 53 | st.write(f"- Latitude: {latitude}") 54 | st.write(f"- Longitude: {longitude}") 55 | st.write(f"- Walk Score: {final_walk_score}") 56 | st.write("Use the 'Launch Prediction' button below to predict the property value based on these inputs.") 57 | 58 | # Button to launch prediction after reviewing inputs 59 | if st.button("Launch Prediction"): 60 | features = { 61 | "bedrooms": bedrooms, 62 | "bathrooms": bathrooms, 63 | "walk_score": final_walk_score, 64 | "latitude": latitude, 65 | "longitude": longitude 66 | } 67 | prediction = predict_property_value(features) 68 | monthly_value = prediction["prediction"] / 12 69 | formatted_value = f"£{monthly_value:,.2f} per month" # Formats the number with comma as thousands separator and two decimal places 70 | # st.success(f"Predicted Property Value: {formatted_value}") 71 | st.success(f"A property with 
{features['bedrooms']} bedrooms and {features['bathrooms']} bathrooms, located at ({features['latitude']:,.2f}, {features['longitude']:,.2f}), with a Walk Score of {features['walk_score']}, is estimated to be worth {formatted_value}") 72 | st.subheader("Select Property Location on Map") 73 | m = folium.Map(location=[latitude, longitude], zoom_start=11) # Default location, change as needed 74 | folium.Marker(location=[latitude, longitude], tooltip="Move this marker to your property location", draggable=True).add_to(m) 75 | folium_static(m) 76 | 77 | # Button to make prediction # Convert from yearly to monthly 78 | 79 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/data_processing/data_processor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class DataPreprocessor: 5 | def __init__(self, with_text=False, with_binary=False): 6 | self.with_text = with_text 7 | self.with_binary = with_binary 8 | 9 | @staticmethod 10 | def convert_frequencies(x): 11 | frequency = x["frequency"] 12 | price = x["amount"] 13 | 14 | if frequency == "monthly": 15 | return price * 12 16 | elif frequency == "weekly": 17 | return (price / 7) * 365 18 | elif frequency == "daily": 19 | return price * 365 20 | elif frequency == "quarterly": 21 | return price * 4 22 | else: # Yearly 23 | return price 24 | 25 | @staticmethod 26 | def remove_anomalies(df, percentile_threshold=0.99): 27 | percentile_thresholds = df[["price", "bedrooms", "bathrooms"]].quantile( 28 | percentile_threshold 29 | ) 30 | 31 | # Filter the dataset to remove anomalies above the 98th percentile 32 | filtered_df = df[ 33 | (df["price"] <= percentile_thresholds["price"]) 34 | & (df["bedrooms"] <= percentile_thresholds["bedrooms"]) 35 | & (df["bathrooms"] <= percentile_thresholds["bathrooms"]) 36 | ] 37 | return filtered_df 38 | 39 | @staticmethod 40 | def merge_text(x): 41 | summary, feature_list = x[0], x[1] 42 | feature_list_joined = ", ".join(feature_list) if feature_list else "" 43 | return feature_list_joined + " , " + summary 44 | 45 | # def preprocess_properties_with_binary(self, df): 46 | # df['longitude'] = df['location'].apply(lambda x: x['longitude']) 47 | # df['latitude'] = df['location'].apply(lambda x: x['latitude']) 48 | # df = df.drop(columns=['location']) 49 | # df['price'] = df['price'].apply(self.convert_frequencies) 50 | # df['commercial'] = df['commercial'].apply(lambda x: 1 if x else 0) 51 | # df['development'] = df['development'].apply(lambda x: 1 if x else 0) 52 | # df['students'] = df['students'].apply(lambda x: 1 if x else 0) 53 | # df['text'] = df[['summary', 'feature_list']].apply(self.merge_text, axis=1) 54 | # df = self.remove_anomalies(df) 55 | # return df 56 | 57 | @staticmethod 58 | def label_price(price): 59 | if price < 8000: 60 | return "Cheap" 61 | elif price < 20_000: 62 | return "Average" 63 | else: 64 | return "Expensive" 65 | 66 | def preprocess_properties(self, df): 67 | df["longitude"] = df["location"].apply(lambda x: x["longitude"]) 68 | df["latitude"] = df["location"].apply(lambda x: x["latitude"]) 69 | df["price"] = df["price"].apply(self.convert_frequencies) 70 | 71 | if self.with_text: 72 | df["text"] = df[["summary", "feature_list"]].apply(self.merge_text, axis=1) 73 | if self.with_binary: 74 | df["commercial"] = df["commercial"].apply(lambda x: 1 if x else 0) 75 | df["development"] = df["development"].apply(lambda x: 1 if x else 0) 76 | 
df["students"] = df["students"].apply(lambda x: 1 if x else 0) 77 | 78 | df["price_category"] = df["price"].apply(self.label_price) 79 | df["listingUpdateReason"] = df["listingUpdate"].apply( 80 | lambda x: x["listingUpdateReason"] 81 | ) 82 | df["firstVisibleDate"] = pd.to_datetime(df["firstVisibleDate"], utc=True) 83 | df = self.remove_anomalies(df) 84 | df = df.drop(columns=["location", "_id", "listingUpdate"]) 85 | return df 86 | 87 | @staticmethod 88 | def preprocess_walk_score(df): 89 | df = df.drop_duplicates(subset=["id"]) 90 | df["walk_score"] = df["scores"].apply(lambda x: int(x["walk_score"])) 91 | df = df.drop(columns=["_id", "scores"]) 92 | return df 93 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/visualization_data.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import pandas as pd 3 | from pymongo import MongoClient 4 | import os 5 | from google.cloud import storage 6 | import logging 7 | from io import BytesIO 8 | 9 | from airflow import DAG 10 | from airflow.operators.python_operator import PythonOperator 11 | from airflow.operators.dummy_operator import DummyOperator 12 | from rightmove.data_processing.data_processor import DataPreprocessor 13 | 14 | logging.basicConfig(level=logging.INFO) 15 | 16 | client = storage.Client() 17 | bucket = client.get_bucket("rightmove-artifacts-ml") 18 | 19 | MONGO_URI = os.environ.get("MONGO_URI") 20 | def load_data_from_mongo(collection_name="properties", fields=None): 21 | logging.info("Loading data from mongo") 22 | 23 | client = MongoClient(MONGO_URI) # Hosted with Docker 24 | 25 | db = client["rightmove"] 26 | 27 | collection = db[collection_name] 28 | 29 | query = {} 30 | 31 | data = collection.find(query, fields) 32 | 33 | df = pd.DataFrame(list(data)) 34 | 35 | if len(df) == 0: 36 | raise ValueError(f"No data found in collection {collection_name}") 37 | else: 38 | logging.info(f"Data loaded from collection {collection_name}") 39 | 40 | return df 41 | 42 | def generate_foldername(): 43 | now = datetime.now() 44 | return now.strftime("%Y-%m-%d-%H-%M-%S") 45 | 46 | 47 | def load_df_to_gcs_parquet(df, dest_path): 48 | # Create an in-memory bytes buffer 49 | buffer = BytesIO() 50 | 51 | try: 52 | # Save the dataframe to the buffer in parquet format 53 | df.to_parquet(buffer, index=False) 54 | 55 | # Move the buffer's pointer to the beginning of the file 56 | buffer.seek(0) 57 | 58 | # Create a blob in the specified GCS bucket path 59 | blob = bucket.blob(dest_path) 60 | 61 | # Upload the buffer content as a parquet file 62 | blob.upload_from_file(buffer, content_type='application/octet-stream') 63 | 64 | logging.info(f"Data uploaded to {dest_path} in Parquet format") 65 | return True 66 | except Exception as e: 67 | logging.error(f"Failed to upload data to {dest_path}: {e}") 68 | return False 69 | 70 | def preprocess_data(property_df, walkscore_df): 71 | preprocessor = DataPreprocessor(with_text=True, with_binary=False) 72 | 73 | property_df = preprocessor.preprocess_properties(property_df) 74 | walk_df = preprocessor.preprocess_walk_score(walkscore_df) 75 | 76 | df = property_df.merge(walk_df, on="id", how="left") 77 | 78 | logging.info("Data preprocessed") 79 | 80 | return df 81 | 82 | def fetch_preprocess_data(): 83 | property_df = load_data_from_mongo( 84 | collection_name="properties", 85 | fields={ 86 | "id": 1, 87 | "price.amount": 1, 88 | 
"price.frequency": 1, 89 | "firstVisibleDate": 1, 90 | "bedrooms": 1, 91 | "bathrooms": 1, 92 | "listingUpdate": 1, 93 | "location": 1, 94 | "summary": 1, 95 | "feature_list": 1, 96 | }, 97 | ) 98 | walkscore_df = load_data_from_mongo( 99 | collection_name="walk_scores", fields={"id": 1, "scores": 1} 100 | ) 101 | 102 | df = preprocess_data(property_df, walkscore_df) 103 | 104 | dest_path = f"streamlit_data/{generate_foldername()}/data.parquet" 105 | load_df_to_gcs_parquet(df, dest_path) 106 | 107 | logging.info(f"Data saved to {dest_path}") 108 | 109 | return df 110 | def load_data(): 111 | df = fetch_preprocess_data() 112 | logging.info("Data loaded") 113 | 114 | default_args = { 115 | "owner": "airflow_app", 116 | "depends_on_past": False, 117 | "email_on_failure": False, 118 | "email_on_retry": False, 119 | "retries": 1, 120 | "retry_delay": timedelta(minutes=5), 121 | } 122 | 123 | dag = DAG( 124 | "streamlit_data_extraction", 125 | default_args=default_args, 126 | description="DAG for extracting data for Streamlit app", 127 | schedule_interval=timedelta(days=1), 128 | start_date=datetime(2023, 1, 1), 129 | catchup=False, 130 | max_active_runs=1, 131 | ) 132 | 133 | start_task = DummyOperator(task_id="start", dag=dag) 134 | 135 | load_data_task = PythonOperator( 136 | task_id="load_data", python_callable=load_data, dag=dag 137 | ) 138 | 139 | end_task = DummyOperator(task_id="end", dag=dag) 140 | 141 | start_task>> load_data_task >> end_task 142 | -------------------------------------------------------------------------------- /rightmove/backend/app/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | from http.client import HTTPException 3 | from typing import Union 4 | 5 | from fastapi import FastAPI 6 | import psycopg2 7 | from psycopg2.extras import RealDictCursor 8 | 9 | from pydantic import BaseModel 10 | from app.data_processing.walk_score_processing import WalkScoreProcessor 11 | # from data_processing.walk_score_processing import WalkScoreProcessor 12 | from sklearn.neighbors import BallTree 13 | import json 14 | 15 | from math import radians 16 | from typing import List 17 | import mlflow.pyfunc 18 | 19 | import pandas as pd 20 | 21 | import math 22 | import numpy as np 23 | 24 | # from dotenv import load_dotenv 25 | # load_dotenv("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/.env") 26 | 27 | 28 | class Property(BaseModel): 29 | bedrooms: float 30 | bathrooms: float 31 | longitude: float 32 | latitude: float 33 | walk_score: float 34 | 35 | 36 | class Coordinates(BaseModel): 37 | longitude: float 38 | latitude: float 39 | 40 | 41 | MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI") 42 | 43 | app = FastAPI() 44 | 45 | model_name = "Random Forest Walk Score" 46 | model_stage = "Staging" 47 | model_uri = f"models:/{model_name}/{model_stage}" 48 | model = mlflow.pyfunc.load_model(model_uri) 49 | 50 | MONGO_URI = os.environ.get("MONGO_URI") 51 | 52 | GCS_PARQUET_URL = ( 53 | "https://storage.googleapis.com/rightmove-resources-public/UK_pois.parquet" 54 | ) 55 | WALK_SCORES_COLLECTION = "walk_scores" 56 | 57 | BATCH_SIZE = 50 58 | 59 | 60 | @app.post("/predict") 61 | async def predict_rent(input_property: Property): 62 | try: 63 | features_df = pd.DataFrame(input_property.dict(), index=[0]) 64 | prediction = model.predict(features_df) 65 | return {"prediction": prediction[0]} 66 | 67 | except Exception as e: 68 | raise HTTPException() 69 | 70 | 71 | @app.post("/batch-predict") 72 | async def 
batch_predict_rent(input_properties: List[Property]): 73 | try: 74 | properties_dicts = [property.dict() for property in input_properties] 75 | features_df = pd.DataFrame(properties_dicts) 76 | predictions = model.predict(features_df) 77 | 78 | return {"predictions": predictions.tolist()} 79 | 80 | except Exception as e: 81 | raise HTTPException(status_code=400, detail=str(e)) 82 | 83 | 84 | @app.post("/walk_score") 85 | async def generate_walk_score(input_coordinates: Coordinates): 86 | try: 87 | walk_score_processor = WalkScoreProcessor() 88 | input_coordinates = input_coordinates.dict() 89 | longitude = input_coordinates["longitude"] 90 | latitude = input_coordinates["latitude"] 91 | scores_dict = walk_score_processor.calculuate_walk_score(longitude, latitude) 92 | walk_score = sum(scores_dict.values()) * 6.67 93 | return {"walk_score": walk_score} 94 | 95 | except Exception as e: 96 | raise HTTPException() 97 | 98 | 99 | SQL_QUERY = """ 100 | SELECT d.dataset_source 101 | FROM datasets d 102 | INNER JOIN ( 103 | SELECT i.source_id AS dataset_id 104 | FROM inputs i 105 | INNER JOIN ( 106 | SELECT run_id 107 | FROM model_versions 108 | ORDER BY version DESC 109 | LIMIT 1 110 | ) mv ON i.destination_id = mv.run_id 111 | WHERE i.source_type = 'DATASET' 112 | LIMIT 1 113 | ) subquery ON d.dataset_uuid = subquery.dataset_id; 114 | """ 115 | def fix_database_uri(uri: str) -> str: 116 | # Check if URI contains '+psycopg2' and remove it 117 | if "+psycopg2" in uri: 118 | uri = uri.replace("+psycopg2", "") 119 | return uri 120 | @app.get("/latest-dataset") 121 | def get_latest_dataset_source(): 122 | try: 123 | PG_URI = fix_database_uri(MLFLOW_TRACKING_URI) 124 | 125 | with psycopg2.connect(PG_URI, cursor_factory=RealDictCursor) as conn: 126 | with conn.cursor() as cur: 127 | cur.execute(SQL_QUERY) 128 | 129 | result = cur.fetchone() 130 | if result: 131 | dataset_source_info = json.loads(result["dataset_source"]) 132 | uri = dataset_source_info.get("uri", "URI not found") 133 | return {"uri": uri} 134 | else: 135 | return {"error": "No dataset source found for the latest model version."} 136 | 137 | except Exception as e: 138 | return {"error": str(e)} 139 | 140 | 141 | 142 | @app.get("/") 143 | def read_root(): 144 | return {"Hello": "World"} 145 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/rightmove_ingest.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import time 3 | import requests 4 | import logging 5 | 6 | from airflow import DAG 7 | from airflow.operators.python_operator import PythonOperator 8 | from airflow.operators.dummy_operator import DummyOperator 9 | 10 | from rightmove.data_processing.rightmove_processing import run 11 | 12 | 13 | SCRAPYD_ENDPOINT = "http://scrapy_app:6800" 14 | SPIDER = "rightmove" 15 | PROJECT = "scraper" 16 | 17 | 18 | def start_spider(): 19 | payload = f"project={PROJECT}&spider={SPIDER}" 20 | headers = {"Content-Type": "application/x-www-form-urlencoded"} 21 | 22 | url = SCRAPYD_ENDPOINT + "/schedule.json" 23 | 24 | response = requests.request("POST", url, headers=headers, data=payload) 25 | 26 | if response.status_code == 200: 27 | logging.info("Request successful") 28 | if response.json()["status"] == "ok": 29 | logging.info("Spider started successfully") 30 | job_id = response.json()["jobid"] 31 | return job_id 32 | else: 33 | logging.info(response.text) 34 | raise 
ValueError("Spider has not been started") 35 | else: 36 | print(response.text) 37 | raise ValueError("Request failed") 38 | 39 | 40 | def cancel_spider(**kwargs): 41 | job_id = kwargs["ti"].xcom_pull(task_ids="start_spider") 42 | 43 | print(f"Cancelling job id: {job_id}") 44 | 45 | payload = f"project={PROJECT}&job={job_id}" 46 | headers = {"Content-Type": "application/x-www-form-urlencoded"} 47 | 48 | url = SCRAPYD_ENDPOINT + "/cancel.json" 49 | 50 | response = requests.request("POST", url, headers=headers, data=payload) 51 | 52 | print(response.text) 53 | if response.status_code == 200: 54 | print("Request successful") 55 | if response.json()["status"] == "ok": 56 | print("Job cancelled successfully") 57 | else: 58 | print(response.text) 59 | else: 60 | print(response.text) 61 | raise ValueError("Request failed spider has not been canceled") 62 | 63 | return "Success" 64 | 65 | 66 | def repeated_requests(**kwargs): 67 | end_time = datetime.now() + timedelta(seconds=900) # 15 minute scraping session 68 | 69 | # url = f"http://scrapyapp:6800/listjobs.json?project={PROJECT}" 70 | 71 | url = SCRAPYD_ENDPOINT + "/listjobs.json?project=" + PROJECT 72 | 73 | payload = {} 74 | headers = {} 75 | 76 | job_id = kwargs["ti"].xcom_pull(task_ids="start_spider") 77 | 78 | while datetime.now() < end_time: 79 | response = requests.request("GET", url, headers=headers, data=payload) 80 | 81 | print(f"Response code: {response.status_code}") 82 | if response.status_code == 200: 83 | print("Request successful") 84 | if response.json()["status"] == "ok": 85 | print("Scrapy status is okay") 86 | 87 | running_jobs = response.json()["running"] 88 | 89 | if job_id in [job["id"] for job in running_jobs]: 90 | print("Job is running") 91 | 92 | elif response.json()["status"] == "error": 93 | print("Scrapy status is error") 94 | print(response.json()["message"]) 95 | raise ValueError("Scrapy status is error") 96 | else: 97 | print(response.text) 98 | 99 | time.sleep(30) # wait for 30 seconds before next request 100 | return "Success" 101 | 102 | 103 | default_args = { 104 | "owner": "airflow_app", 105 | "depends_on_past": False, 106 | "email_on_failure": False, 107 | "email_on_retry": False, 108 | "retries": 1, 109 | "retry_delay": timedelta(minutes=5), 110 | } 111 | 112 | dag = DAG( 113 | "scrape_rightmove", 114 | default_args=default_args, 115 | description="DAG for making scraping rightmove", 116 | schedule_interval=timedelta(days=1), 117 | start_date=datetime(2023, 1, 1), 118 | catchup=False, 119 | max_active_runs=1, 120 | ) 121 | 122 | start_task = DummyOperator(task_id="start", dag=dag) 123 | 124 | start_spider_task = PythonOperator( 125 | task_id="start_spider", python_callable=start_spider, dag=dag 126 | ) 127 | 128 | periodic_requests = PythonOperator( 129 | task_id="periodic_requests", 130 | python_callable=repeated_requests, 131 | provide_context=True, 132 | dag=dag, 133 | ) 134 | 135 | cancel_spider_task = PythonOperator( 136 | task_id="cancel_spider", 137 | python_callable=cancel_spider, 138 | provide_context=True, 139 | dag=dag, 140 | ) 141 | 142 | run_beam_pipeline = PythonOperator( 143 | task_id="run_beam_pipeline", python_callable=run, dag=dag 144 | ) 145 | 146 | 147 | end_task = DummyOperator(task_id="end", dag=dag) 148 | 149 | ( 150 | start_task 151 | >> start_spider_task 152 | >> periodic_requests 153 | >> cancel_spider_task 154 | >> run_beam_pipeline 155 | >> end_task 156 | ) 157 | -------------------------------------------------------------------------------- 
/notebooks/data_ingestion/fetch_outcodes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "7995756c-fe5f-48d9-8512-d797b73e5157", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from selenium import webdriver\n", 11 | "from selenium.webdriver.common.keys import Keys\n", 12 | "from selenium.webdriver.common.by import By\n", 13 | "\n", 14 | "import pandas as pd\n", 15 | "import re" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "f9e3c5c5-bcad-4d4b-bdc9-0a43adbfd444", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Function that returns right ID for a given outcode\n", 26 | "def get_outcode_value(postcode, driver):\n", 27 | " driver.get(\"https://www.rightmove.co.uk/property-to-rent.html\")\n", 28 | " input_box = driver.find_element(By.XPATH, '//*[@id=\"searchLocation\"]')\n", 29 | " input_box.send_keys(postcode)\n", 30 | " search_box = driver.find_element(By.XPATH, '//*[@id=\"search\"]')\n", 31 | " search_box.click()\n", 32 | " \n", 33 | " try:\n", 34 | " submit = driver.find_element(By.ID, \"submit\")\n", 35 | " submit.click()\n", 36 | " url = driver.current_url\n", 37 | " outcode_value = re.findall(\"(?<=locationIdentifier=OUTCODE%5E)(.*)(?=&insId)\", url)[0]\n", 38 | " except:\n", 39 | " header_title = driver.find_element(By.ID, \"headerTitle\")\n", 40 | " outcode_value = None\n", 41 | " \n", 42 | " \n", 43 | " return outcode_value\n", 44 | "\n", 45 | "# Function to fetch currently loaded outcodes in case selenium crashed\n", 46 | "def fetch_current_rightmove_outcodes(cursor):\n", 47 | " cursor.execute(\"SELECT outcode FROM rightmove_outcodes\")\n", 48 | " fetched_outcodes = cursor.fetchall()\n", 49 | " outcode_list = [x[0] for x in fetched_outcodes]\n", 50 | "\n", 51 | " return outcode_list" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "adcf7674-49d6-4c70-991c-6f9463e11782", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "def fetch_outcodes(df, cursor, driver):\n", 62 | " for row in df.itertuples():\n", 63 | " outcode = row.postcode\n", 64 | " index = row.Index\n", 65 | " \n", 66 | " if outcode not in current_outcodes:\n", 67 | " outcode_value = get_outcode_value(outcode, driver)\n", 68 | " \n", 69 | " if outcode_value is not None:\n", 70 | "\n", 71 | " transaction = \"INSERT IGNORE INTO rightmove_outcodes(outcode, rightmove_code) VALUES ('{}', {});\".format(\n", 72 | " outcode, outcode_value)\n", 73 | "\n", 74 | " cursor.execute(transaction)\n", 75 | "\n", 76 | " con.commit()\n", 77 | " else:\n", 78 | " pass\n", 79 | " else:\n", 80 | " pass" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "76829d13-b123-4c27-88b5-c516e5fac8a1", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "def run():\n", 91 | " driver = webdriver.Firefox()\n", 92 | "\n", 93 | " current_outcodes = fetch_current_rightmove_outcodes(driver)\n", 94 | "\n", 95 | " # Load UK outcode csv file into pandas\n", 96 | " df = pd.read_csv(\"../../data/outcodes.csv\", index_col=0)\n", 97 | "\n", 98 | " fetch_outcodes(df, cursor, driver) " 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "281a6c10-cf5d-488a-82d5-f5ef567b2179", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "import os\n", 109 | "gecko_path = os.path.expanduser('~/Downloads/geckodriver')\n", 110 
| "driver = webdriver.Firefox(executable_path=gecko_path)\n", 111 | "\n", 112 | "driver.get(\"https://www.rightmove.co.uk/property-to-rent.html\")\n", 113 | "\n", 114 | "input_box = driver.find_element(By.XPATH, '//*[@id=\"searchLocation\"]')\n", 115 | "input_box.send_keys(postcode)\n", 116 | "search_box = driver.find_element(By.XPATH, '//*[@id=\"search\"]')\n", 117 | "search_box.click()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "a842cd6d-42df-47f4-89a0-a43f33c429d9", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3 (ipykernel)", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 3 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython3", 145 | "version": "3.11.4" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 5 150 | } 151 | -------------------------------------------------------------------------------- /notebooks/data_ingestion/.ipynb_checkpoints/fetch_outcodes-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "id": "7995756c-fe5f-48d9-8512-d797b73e5157", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from selenium import webdriver\n", 11 | "from selenium.webdriver.common.keys import Keys\n", 12 | "from selenium.webdriver.common.by import By\n", 13 | "\n", 14 | "import pandas as pd\n", 15 | "import re" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "f9e3c5c5-bcad-4d4b-bdc9-0a43adbfd444", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# Function that returns right ID for a given outcode\n", 26 | "def get_outcode_value(postcode, driver):\n", 27 | " driver.get(\"https://www.rightmove.co.uk/property-to-rent.html\")\n", 28 | " input_box = driver.find_element(By.XPATH, '//*[@id=\"searchLocation\"]')\n", 29 | " input_box.send_keys(postcode)\n", 30 | " search_box = driver.find_element(By.XPATH, '//*[@id=\"search\"]')\n", 31 | " search_box.click()\n", 32 | " \n", 33 | " try:\n", 34 | " submit = driver.find_element(By.ID, \"submit\")\n", 35 | " submit.click()\n", 36 | " url = driver.current_url\n", 37 | " outcode_value = re.findall(\"(?<=locationIdentifier=OUTCODE%5E)(.*)(?=&insId)\", url)[0]\n", 38 | " except:\n", 39 | " header_title = driver.find_element(By.ID, \"headerTitle\")\n", 40 | " outcode_value = None\n", 41 | " \n", 42 | " \n", 43 | " return outcode_value\n", 44 | "\n", 45 | "# Function to fetch currently loaded outcodes in case selenium crashed\n", 46 | "def fetch_current_rightmove_outcodes(cursor):\n", 47 | " cursor.execute(\"SELECT outcode FROM rightmove_outcodes\")\n", 48 | " fetched_outcodes = cursor.fetchall()\n", 49 | " outcode_list = [x[0] for x in fetched_outcodes]\n", 50 | "\n", 51 | " return outcode_list" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "id": "adcf7674-49d6-4c70-991c-6f9463e11782", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "def fetch_outcodes(df, cursor, driver):\n", 62 | " for row in df.itertuples():\n", 63 | " outcode = row.postcode\n", 64 | " index = row.Index\n", 65 | " \n", 66 | " if outcode not in 
current_outcodes:\n", 67 | " outcode_value = get_outcode_value(outcode, driver)\n", 68 | " \n", 69 | " if outcode_value is not None:\n", 70 | "\n", 71 | " transaction = \"INSERT IGNORE INTO rightmove_outcodes(outcode, rightmove_code) VALUES ('{}', {});\".format(\n", 72 | " outcode, outcode_value)\n", 73 | "\n", 74 | " cursor.execute(transaction)\n", 75 | "\n", 76 | " con.commit()\n", 77 | " else:\n", 78 | " pass\n", 79 | " else:\n", 80 | " pass" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": "76829d13-b123-4c27-88b5-c516e5fac8a1", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "def run():\n", 91 | " driver = webdriver.Firefox()\n", 92 | "\n", 93 | " current_outcodes = fetch_current_rightmove_outcodes(driver)\n", 94 | "\n", 95 | " # Load UK outcode csv file into pandas\n", 96 | " df = pd.read_csv(\"../../data/outcodes.csv\", index_col=0)\n", 97 | "\n", 98 | " fetch_outcodes(df, cursor, driver) " 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "281a6c10-cf5d-488a-82d5-f5ef567b2179", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "import os\n", 109 | "gecko_path = os.path.expanduser('~/Downloads/geckodriver')\n", 110 | "driver = webdriver.Firefox(executable_path=gecko_path)\n", 111 | "\n", 112 | "driver.get(\"https://www.rightmove.co.uk/property-to-rent.html\")\n", 113 | "\n", 114 | "input_box = driver.find_element(By.XPATH, '//*[@id=\"searchLocation\"]')\n", 115 | "input_box.send_keys(postcode)\n", 116 | "search_box = driver.find_element(By.XPATH, '//*[@id=\"search\"]')\n", 117 | "search_box.click()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "a842cd6d-42df-47f4-89a0-a43f33c429d9", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3 (ipykernel)", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 3 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython3", 145 | "version": "3.11.4" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 5 150 | } 151 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/spiders/rightmove.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import os 3 | import csv 4 | import requests 5 | import io 6 | 7 | import logging 8 | 9 | logging.basicConfig(level=logging.DEBUG) 10 | logger = logging.getLogger(__name__) 11 | 12 | from bs4 import BeautifulSoup 13 | 14 | from pymongo import MongoClient 15 | 16 | # MONGO_URL = "mongodb://mongodb:27017/" 17 | MONGO_URI = os.environ.get("MONGO_URI") 18 | 19 | 20 | class RightmoveSpider(scrapy.Spider): 21 | name = "rightmove" 22 | 23 | def __init__(self, *args, **kwargs): 24 | self.headers = { 25 | "Accept": "application/json, text/plain, */*", 26 | "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8", 27 | "Connection": "keep-alive", 28 | "Referer": "https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&index=24&propertyTypes=&includeLetAgreed=false&mustHave=&dontShow=&furnishTypes=&keywords=", 29 | "Sec-Fetch-Dest": "empty", 30 | "Sec-Fetch-Mode": "cors", 31 | 
"Sec-Fetch-Site": "same-origin", 32 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36", 33 | "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"', 34 | "sec-ch-ua-mobile": "?0", 35 | "sec-ch-ua-platform": '"macOS"', 36 | } 37 | 38 | self.rightmove_ids = self.get_property_ids() 39 | 40 | print(self.rightmove_ids) 41 | 42 | print("Number of IDs: ", len(self.rightmove_ids)) 43 | 44 | logger.info(f"Fetching new MongoDB data from {MONGO_URI}...") 45 | 46 | self.fetched_outcodes = self.get_outcodes() 47 | 48 | def start_requests(self): 49 | for codes in self.fetched_outcodes: 50 | rightmove_code = codes[1] 51 | postcode = codes[0] 52 | for index_jump in range( 53 | 0, 100, 25 54 | ): # Adjusting to 100 so I can have some extra values to test with 55 | url = f"https://www.rightmove.co.uk/api/_search?locationIdentifier=OUTCODE%5E{rightmove_code}&numberOfPropertiesPerPage=24&radius=10.0&sortType=6&index={index_jump}&includeLetAgreed=false&viewType=LIST&channel=RENT&areaSizeUnit=sqft¤cyCode=GBP&isFetching=false" 56 | 57 | yield scrapy.Request( 58 | method="GET", url=url, headers=self.headers, callback=self.parse 59 | ) 60 | 61 | def parse(self, response): 62 | listings = response.json()["properties"] 63 | for listing in listings: 64 | property_id = listing["id"] 65 | 66 | if property_id not in self.rightmove_ids: 67 | property_url = f"https://www.rightmove.co.uk/properties/{property_id}" 68 | 69 | yield scrapy.Request( 70 | method="GET", 71 | url=property_url, 72 | headers=self.headers, 73 | callback=self.parse_property, 74 | meta={"item": listing}, 75 | ) 76 | else: 77 | print("Already loaded in") 78 | 79 | def parse_property(self, response): 80 | soup = BeautifulSoup(response.text, "lxml") 81 | 82 | item = response.meta["item"] 83 | 84 | # Get feature list 85 | try: 86 | uls = soup.find("ul", {"class": "_1uI3IvdF5sIuBtRIvKrreQ"}) 87 | features = uls.find_all("li") 88 | feature_list = [feature.text for feature in features] 89 | except: 90 | feature_list = None 91 | 92 | # Get full summary 93 | summary = soup.find("div", {"class": "OD0O7FWw1TjbTD4sdRi1_"}).div.text 94 | 95 | # Assign content to item 96 | item["feature_list"] = feature_list 97 | item["summary"] = summary 98 | 99 | yield item 100 | 101 | def get_outcodes(self): 102 | # URL of the CSV file in the public GCS bucket 103 | csv_url = "https://storage.googleapis.com/rightmove-resources-public/rightmove_outcodes.csv" 104 | 105 | # Download the CSV file 106 | response = requests.get(csv_url) 107 | if response.status_code == 200: 108 | # Convert binary data to a text stream 109 | csv_text = io.StringIO(response.content.decode("utf-8")) 110 | 111 | # Read CSV data 112 | reader = csv.reader(csv_text) 113 | outcodes = list(reader) 114 | outcodes = outcodes[1:] # Skip header row 115 | outcodes = [(outcode[1], outcode[2]) for outcode in outcodes] 116 | return outcodes 117 | else: 118 | print("Failed to download CSV file") 119 | return [] 120 | 121 | def get_property_ids(self) -> list: 122 | client = MongoClient(MONGO_URI) 123 | # client = MongoClient("mongodb://localhost:27017/") 124 | db = client["rightmove"] 125 | # Access collection 126 | collection = db["properties"] 127 | 128 | # logging.info("Connected to MongoDB") 129 | 130 | rightmove_ids = collection.find({}, {"id": 1}) 131 | 132 | # Convert the result to a list of IDs 133 | ids = [doc["id"] for doc in rightmove_ids] 134 | 135 | client.close() 136 | 137 | return ids 138 | 
-------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/01_LandingPage.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import requests 3 | import pydeck as pdk 4 | from wordcloud import WordCloud 5 | import pandas as pd 6 | from datetime import datetime, timedelta 7 | import geopandas 8 | import matplotlib.pyplot as plt 9 | import json 10 | import seaborn as sns 11 | import plotly.express as px 12 | import os 13 | 14 | import logging 15 | 16 | logging.basicConfig(level=logging.INFO) 17 | 18 | from dotenv import load_dotenv 19 | load_dotenv("/.env") 20 | 21 | MONGO_URI = os.environ.get("MONGO_URI") 22 | 23 | @st.cache_data 24 | def load_data(): 25 | df = pd.read_parquet( 26 | "gs://rightmove-artifacts-ml/streamlit_data/2024-02-27-12-32-07/data.parquet" 27 | ) 28 | 29 | df["monthly_price"] = df["price"] / 12 30 | df = df.dropna() 31 | return df 32 | 33 | 34 | def get_recents(subset, days): 35 | new_df = subset[subset["listingUpdateReason"] == "new"] 36 | 37 | today = pd.Timestamp(datetime.now(), tz="UTC") 38 | 39 | # Calculate the start date of the last week (7 days ago) 40 | date_start = today - timedelta(days=days) 41 | 42 | new_df["firstVisibleDate"] = pd.to_datetime(new_df["firstVisibleDate"], utc=True) 43 | 44 | # Corrected filtering to use new_df instead of df 45 | in_between_rows = new_df[ 46 | (new_df["firstVisibleDate"] > date_start) 47 | & (new_df["firstVisibleDate"] <= today) 48 | ] 49 | 50 | # Get the total number of rows 51 | total_rows = len(in_between_rows) 52 | return total_rows 53 | 54 | 55 | def plot_bedrooms_distribution(df): 56 | max_bedrooms = df["bedrooms"].max() 57 | fig = px.histogram(df, x="bedrooms", title="Distribution of Bedrooms") 58 | fig.update_layout( 59 | xaxis=dict(title="Number of Bedrooms", tickmode="linear", dtick=1), 60 | yaxis_title="Number of Properties", 61 | plot_bgcolor="rgba(0,0,0,0)", 62 | ) 63 | return fig 64 | 65 | 66 | def plot_bathrooms_distribution(df): 67 | # Determine the maximum number of bathrooms to set appropriate bins 68 | max_bathrooms = df["bathrooms"].max() 69 | fig = px.histogram(df, x="bathrooms", title="Distribution of Bathrooms") 70 | fig.update_layout( 71 | xaxis=dict(title="Number of Bathrooms", tickmode="linear", dtick=1), 72 | yaxis_title="Number of Properties", 73 | plot_bgcolor="rgba(0,0,0,0)", 74 | ) 75 | return fig 76 | 77 | 78 | def plot_price_density(df): 79 | fig = px.histogram( 80 | df, x="monthly_price", title="Distribution of Monthly Rental Prices" 81 | ) 82 | fig.update_layout( 83 | xaxis_title="Rental Price", 84 | yaxis_title="Number of Properties", 85 | plot_bgcolor="rgba(0,0,0,0)", 86 | ) 87 | return fig 88 | 89 | 90 | df = load_data() 91 | 92 | min_price, max_price = st.sidebar.slider( 93 | "Select a monthly rental price range:", 94 | min_value=int(df["monthly_price"].min()), # Minimum value for the slider 95 | max_value=int(df["monthly_price"].max()), # Maximum value for the slider 96 | value=( 97 | int(df["monthly_price"].min()), 98 | int(df["monthly_price"].max()), 99 | ), # Initial range (min, max) 100 | ) 101 | 102 | subset = df[(df["monthly_price"] >= min_price) & (df["monthly_price"] <= max_price)] 103 | 104 | # Streamlit UI 105 | col1, col2, col3 = st.columns(3) 106 | 107 | # Calculate properties added since last week 108 | properties_last_week = get_recents(subset, 8) # Last 7 days 109 | # Display metric in the first column, restrict to 2 decimal places 110 | 
col1.metric(label="Properties Added Since Last Week", value=f"{properties_last_week}") 111 | 112 | # Calculate properties added since yesterday 113 | properties_yesterday = get_recents(subset, 2) # Last 1 day 114 | # Display metric in the second column, restrict to 2 decimal places 115 | col2.metric(label="Properties Added Since Yesterday", value=f"{properties_yesterday}") 116 | 117 | # Calculate the total number of properties 118 | total_properties = len(subset) 119 | # Display metric in the third column, restrict to 2 decimal places 120 | col3.metric(label="Total Properties", value=f"{total_properties}") 121 | 122 | st.header("Property Distribution Map") 123 | layer = pdk.Layer( 124 | "HexagonLayer", # `type` positional argument is here 125 | subset[["longitude", "latitude"]], # `data` positional argument is here 126 | get_position=["longitude", "latitude"], 127 | auto_highlight=True, 128 | elevation_scale=50, 129 | pickable=True, 130 | elevation_range=[0, 3000], 131 | extruded=True, 132 | coverage=1, 133 | ) 134 | 135 | # Set the viewport location 136 | view_state = pdk.ViewState( 137 | longitude=-1.415, 138 | latitude=52.2323, 139 | zoom=6, 140 | min_zoom=5, 141 | max_zoom=15, 142 | pitch=40.5, 143 | bearing=-27.36, 144 | ) 145 | 146 | # Combine everything and render a viewport 147 | r = pdk.Deck(layers=[layer], initial_view_state=view_state) 148 | st.info( 149 | "The map displays the distribution of properties based on their location. The higher the concentration of properties, the higher the elevation." 150 | ) 151 | st.pydeck_chart(r) 152 | 153 | st.header("Histogram and Density Plots of Property Features") 154 | 155 | st.info( 156 | "The following plots provide a visual representation of the distribution of property features such as bedrooms, bathrooms, and rental prices." 
157 | ) 158 | 159 | col1, col2 = st.columns(2) 160 | with col1: 161 | st.plotly_chart(plot_bedrooms_distribution(subset), use_container_width=True) 162 | with col2: 163 | st.plotly_chart(plot_bathrooms_distribution(subset), use_container_width=True) 164 | 165 | # Density plot for price 166 | st.plotly_chart(plot_price_density(subset), use_container_width=True) -------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/pages/03_WalkScore.py: -------------------------------------------------------------------------------- 1 | import geopandas 2 | import streamlit as st 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import math 6 | 7 | 8 | @st.cache_data 9 | def load_geo_data(): 10 | # Load your GeoPandas DataFrame here 11 | # For example: gdf = gpd.read_file('your_file_path.shp') 12 | # Returning an example gdf, replace this with your actual data loading 13 | gdf = geopandas.read_file( 14 | "/Users/alexander.girardet/Code/Personal/projects/rightmove_project/notebooks/serving/london_borough_stats.geojson" 15 | ) 16 | return gdf 17 | 18 | 19 | def plot_geo_data(gdf, column): 20 | # Create the figure and axis with a larger size for better visibility 21 | fig, ax = plt.subplots(figsize=(14, 8)) 22 | 23 | # Plot the GeoDataFrame with a more appealing color map and adjust the legend 24 | gdf.plot( 25 | column=column, 26 | cmap="viridis", 27 | legend=True, 28 | ax=ax, 29 | legend_kwds={ 30 | "label": f"{reversed_options[column]}", 31 | "orientation": "horizontal", 32 | }, 33 | ) 34 | 35 | # Adjust the figure layout to accommodate the legend and ensure no clipping 36 | plt.tight_layout() 37 | 38 | return fig 39 | 40 | 41 | def distance_decay(distance): 42 | dist = distance / 1000 # Convert distance to kilometers 43 | score = math.e ** ((-5.0 * (dist / 4)) ** 5.0) 44 | return score 45 | 46 | 47 | def plot_distance_decay(): 48 | # Generate distances from 0 to 2000 meters 49 | distances = np.linspace(0, 2000, 500) 50 | scores = np.array([distance_decay(d) for d in distances]) 51 | 52 | # Plotting the decay of distance 53 | plt.figure(figsize=(10, 6)) 54 | plt.plot(distances, scores, label="Distance Decay", color="blue") 55 | plt.xlabel("Distance (meters)") 56 | plt.ylabel("Score") 57 | plt.title("Distance Decay Effect on Score") 58 | plt.grid(True) 59 | plt.xlim(0, 2000) # Limit x-axis to 2000 meters 60 | plt.legend() 61 | plt.tight_layout() 62 | return plt 63 | 64 | 65 | def calculate_amenity_walk_score(distances, amenity_weights): 66 | total_score = 0 67 | for amenity, distance in distances.items(): 68 | decayed_distance = distance_decay(distance) 69 | weights = amenity_weights.get( 70 | amenity, [1] 71 | ) # Default weight if amenity not found 72 | # Assume the first weight for simplicity, could be adapted for multiple distances per amenity 73 | amenity_score = decayed_distance * weights[0] 74 | total_score += amenity_score 75 | return total_score 76 | 77 | 78 | # def get_walk_score(subset): 79 | # return subset['walk_score'].mean() 80 | # 81 | # # walk_score = get_walk_score(subset) 82 | # # Display metric in the third column, restrict to 2 decimal places 83 | # col3.metric(label="Average Walk Score", value=f"{walk_score:.2f}") 84 | 85 | amenity_weights = { 86 | "grocery": [3], 87 | "restaurants": [3], 88 | "shopping": [2], 89 | "coffee": [2], 90 | "banks": [1], 91 | "parks": [1], 92 | "schools": [1], 93 | "books": [1], 94 | "entertainment": [1], 95 | } 96 | 97 | # Streamlit app setup for interactive walk score explanation 98 | 
st.title("Interactive Walk Score Explanation") 99 | 100 | st.write( 101 | """ 102 | This application demonstrates how the walk score for a property is calculated based on the distances to various amenities. 103 | Walk score is a measure of how friendly an area is to walking with a score from 0 to 100, where higher scores indicate better walkability. 104 | """ 105 | ) 106 | 107 | st.header("Walk Score Visualization") 108 | 109 | st.write( 110 | """ 111 | In visualizing the walk score we consider the average price of properties, the walk score, and the property count. This could provide an indication 112 | of the relationship between the walk score and the average price of properties in a given area. Additionally, the property count could provide an 113 | indication of the demand, and supply for properties in a given area. Logically, the higher the walk score, the higher the density of properties in 114 | a given area, and the higher the average price of properties in a given area. 115 | """ 116 | ) 117 | 118 | options = { 119 | "Price": "avg_price", 120 | "Walk Score": "mean_walk_score", 121 | "Property Count": "property_count", 122 | } 123 | 124 | reversed_options = {value: key for key, value in options.items()} 125 | 126 | # Use the dictionary keys as the display labels and get the selected option value 127 | selected_label = st.selectbox("Choose attribute to visualize:", options.keys()) 128 | 129 | option_value = options[selected_label] 130 | 131 | gdf = load_geo_data() 132 | 133 | # Display the plot in Streamlit 134 | st.pyplot(plot_geo_data(gdf, option_value)) 135 | 136 | st.header("Distance Decay Visualization") 137 | st.write( 138 | "This plot shows the decay of scores with increasing distance for a single amenity. It illustrates how closer amenities contribute more significantly to the walk score." 139 | ) 140 | fig = plot_distance_decay() 141 | st.pyplot(fig) 142 | 143 | st.header("Customize Your Walk Score") 144 | st.write( 145 | "Adjust the sliders below to simulate distances to different amenities and calculate a simplified walk score." 
146 | ) 147 | 148 | # Example of creating sliders for different amenities (simplified version) 149 | 150 | distances = {} 151 | for amenity in amenity_weights.keys(): 152 | distance = st.slider(f"Distance to nearest {amenity} (meters)", 0, 2000, 500, 50) 153 | distances[amenity] = distance 154 | 155 | total_walk_score = calculate_amenity_walk_score(distances, amenity_weights) 156 | 157 | walk_score = total_walk_score * 6.67 158 | 159 | st.metric("Total Walk Score", walk_score) 160 | -------------------------------------------------------------------------------- /rightmove/dashboard/streamlit/pages/02_Price.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | import pydeck as pdk 7 | 8 | 9 | @st.cache_data 10 | def load_data(): 11 | df = pd.read_parquet( 12 | "gs://rightmove-artifacts-ml/streamlit_data/2024-02-27-12-32-07/data.parquet" 13 | ) 14 | df["monthly_price"] = df["price"] / 12 15 | return df 16 | 17 | 18 | df = load_data() 19 | # Streamlit page setup 20 | st.title("Bedroom, Bathroom, and Location Relationship with Price") 21 | 22 | import plotly.express as px 23 | 24 | 25 | def get_average_price(df): 26 | avg_price = df["monthly_price"].mean() 27 | return avg_price 28 | 29 | 30 | def get_average_bedrooms(df): 31 | avg_bedrooms = df["bedrooms"].mean() 32 | return avg_bedrooms 33 | 34 | 35 | def get_average_bathrooms(df): 36 | avg_bathrooms = df["bathrooms"].mean() 37 | return avg_bathrooms 38 | 39 | 40 | def plot_price_by_bedrooms(df): 41 | fig = px.box( 42 | df, 43 | x="bedrooms", 44 | y="monthly_price", 45 | title="Rental Price Distribution by Number of bedrooms", 46 | ) 47 | fig.update_layout( 48 | xaxis=dict(title="Number of Bedrooms"), 49 | yaxis=dict(title="Rental Price"), 50 | plot_bgcolor="rgba(0,0,0,0)", 51 | title_x=0.5, 52 | ) 53 | return fig 54 | 55 | 56 | def plot_price_by_bathrooms(df): 57 | fig = px.box( 58 | df, 59 | x="bathrooms", 60 | y="monthly_price", 61 | title="Rental Price Distribution by Number of Bathrooms", 62 | ) 63 | fig.update_layout( 64 | xaxis=dict(title="Number of Bathrooms"), 65 | yaxis=dict(title="Rental Price"), 66 | plot_bgcolor="rgba(0,0,0,0)", 67 | title_x=0.5, 68 | ) 69 | 70 | return fig 71 | 72 | 73 | st.sidebar.title("Filters") 74 | min_price, max_price = st.sidebar.slider( 75 | "Select Rental Price Range", 76 | min_value=int(df["monthly_price"].min()), 77 | max_value=int(df["monthly_price"].max()), 78 | value=(int(df["monthly_price"].min()), int(df["monthly_price"].max())), 79 | ) 80 | 81 | # Filtering the DataFrame based on the selected price range 82 | filtered_df = df[ 83 | (df["monthly_price"] >= min_price) & (df["monthly_price"] <= max_price) 84 | ] 85 | 86 | col1, col2, col3 = st.columns(3) 87 | with col1: 88 | st.metric( 89 | label="Average Monthly Price", value=f"{get_average_price(filtered_df):.2f}" 90 | ) 91 | # Average Bedrooms and Bathroom 92 | with col2: 93 | st.metric( 94 | label="Average Number of Bathrooms", 95 | value=f"{get_average_bathrooms(filtered_df):.2f}", 96 | ) 97 | 98 | with col3: 99 | st.metric( 100 | label="Average Number of Bedrooms", 101 | value=f"{get_average_bedrooms(filtered_df):.2f}", 102 | ) 103 | 104 | st.plotly_chart(plot_price_by_bathrooms(filtered_df), use_container_width=True) 105 | st.plotly_chart(plot_price_by_bedrooms(filtered_df), use_container_width=True) 106 | 107 | max_rental_price = df["monthly_price"].max() 108 | 109 | # 
Assuming you've already grouped your data as needed or if you're using individual points, 110 | # you can directly use the rental_price for elevation. For a true mean aggregation, you'd need 111 | # to aggregate your data by the hexagon/bin locations, which requires additional preprocessing. 112 | 113 | # For color, normalize the rental_price to get a value between 0 and 255 for the color scale 114 | df["color_value"] = (df["monthly_price"] / max_rental_price) * 255 115 | df["color_value"] = df["color_value"].astype( 116 | int 117 | ) # Ensure it's an integer for color coding 118 | 119 | 120 | # Function to create the heatmap 121 | st.info( 122 | "The following map shows the distribution of rental prices in the selected area. The elevation of the hexagons represents the mean rental price of properties within each hexagon." 123 | ) 124 | @st.cache_resource 125 | def create_hexagon_map( 126 | dataframe, 127 | lat_col="latitude", 128 | lon_col="longitude", 129 | value_col="monthly_price", 130 | radius=200, 131 | ): 132 | """Create a hexagon map where the elevation represents the mean rental price of properties within each hexagon. 133 | 134 | Args: 135 | dataframe (pd.DataFrame): The dataframe containing the data. 136 | lat_col (str): Column name for latitude values. 137 | lon_col (str): Column name for longitude values. 138 | value_col (str): Column name for the values to average (mean rental price). 139 | radius (int): Radius of the hexagons in meters. 140 | 141 | Returns: 142 | pydeck.Deck: A pydeck Deck object ready to be displayed. 143 | """ 144 | # Aggregate data by hexagon 145 | layer = pdk.Layer( 146 | "HexagonLayer", 147 | dataframe[[lon_col, lat_col, value_col]], 148 | get_position=[lon_col, lat_col], 149 | auto_highlight=True, 150 | elevation_scale=50, # Adjust based on your data's scale for better visualization 151 | pickable=True, 152 | elevation_range=[0, 3000], # Max elevation in meters 153 | extruded=True, # Make hexagon 3D 154 | coverage=4, 155 | opacity=0.3, 156 | radius=radius, # Radius of hexagon in meters 157 | get_elevation="monthly_price", # Use the 'elevation' column if you've aggregated data 158 | get_fill_color="[255, 255, color_value, 140]", 159 | ) 160 | 161 | # Set the initial view 162 | view_state = pdk.ViewState( 163 | longitude=-1.415, 164 | latitude=52.2323, 165 | zoom=6, 166 | min_zoom=5, 167 | max_zoom=15, 168 | pitch=40.5, 169 | bearing=-27.36, 170 | ) 171 | 172 | # Combine everything and render a viewport 173 | r = pdk.Deck(layers=[layer], initial_view_state=view_state) 174 | 175 | return r 176 | 177 | 178 | # Example usage 179 | hex_map = create_hexagon_map(filtered_df) 180 | st.pydeck_chart(hex_map) 181 | -------------------------------------------------------------------------------- /notebooks/data_storage/mongo_integration.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "4b764f16-3e43-463f-af40-fbcc51f3f9cb", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pymongo import MongoClient\n", 11 | "\n", 12 | "client = MongoClient(\"mongodb://localhost:27017/\") # Hosted with Docker" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "3c7dc024-1194-4ca9-9179-dedcd5ca476c", 18 | "metadata": {}, 19 | "source": [ 20 | "## Access Database" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "id": "35c41cd0-27e0-48a3-9b35-4c15988f0010", 27 | "metadata": {}, 28 | "outputs": [], 29 
| "source": [ 30 | "db = client[\"rightmove\"]\n", 31 | "\n", 32 | "# Access collection\n", 33 | "collection = db[\"properties\"]" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "id": "fd6b7ec8-958b-4b02-b882-5eac66268c09", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Insert a document\n", 44 | "collection.insert_one({\"name\": \"John Doe\", \"age\": 30})\n", 45 | "\n", 46 | "# Find a document\n", 47 | "user = collection.find_one({\"name\": \"John Doe\"})\n", 48 | "\n", 49 | "# Update a document\n", 50 | "collection.update_one({\"name\": \"John Doe\"}, {\"$set\": {\"age\": 31}})\n", 51 | "\n", 52 | "# Delete a document\n", 53 | "collection.delete_one({\"name\": \"John Doe\"})" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "id": "cf50b563-4cac-4156-9b6e-faaa4af9ca48", 59 | "metadata": {}, 60 | "source": [ 61 | "## Fetch data" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 30, 67 | "id": "77e22ba8-c237-4d04-959d-3683c114e35c", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import json\n", 72 | "\n", 73 | "with open(\"../resources/data/property.json\", \"r\") as file:\n", 74 | " property = json.load(file) \n", 75 | "\n", 76 | "with open(\"../resources/data/property_1.json\", \"r\") as file:\n", 77 | " property_1 = json.load(file) " 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "id": "478f104c-0b48-4211-991f-76fe790f65ba", 83 | "metadata": {}, 84 | "source": [ 85 | "## Load data to MongoDB" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "id": "f97139f9-4383-4e3b-a368-d54b4463509d", 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "collection.insert_one(property)" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 31, 101 | "id": "6771b78d-0020-42d6-a96b-4301bf6b834f", 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "" 108 | ] 109 | }, 110 | "execution_count": 31, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "collection.insert_one(property_1)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "id": "bcbe4789-0c1c-41d6-bede-22efb03fc8cd", 122 | "metadata": {}, 123 | "source": [ 124 | "## Load Data from MongoDB" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 13, 130 | "id": "9a27999b-7315-4a02-805e-25e05458427f", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "property = collection.find_one({\"id\": 142547498})" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "id": "1609ed92-78c7-48e1-8d02-d1ea9eac2fde", 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "property" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "id": "9b51e9c9-8baf-4140-ad2c-18ee30995c5a", 150 | "metadata": {}, 151 | "source": [ 152 | "## Get a list of all rightmove IDs" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "id": "69201660-191d-4daf-8b6b-7944f8c58c4a", 158 | "metadata": {}, 159 | "source": [ 160 | "I created a new unique index in MongoDB to enforce uniqueness and quick access for IDs." 
161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 35, 166 | "id": "367dcb10-508f-49a2-91bb-8bec6359f08c", 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "[142547498, 142659089]\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "rightmove_ids = collection.find({}, {\"id\": 1})\n", 179 | "\n", 180 | "# Convert the result to a list of IDs\n", 181 | "ids = [doc['id'] for doc in rightmove_ids]\n", 182 | "\n", 183 | "print(ids)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "e4521181-d231-443b-8ed8-849763b0d736", 189 | "metadata": {}, 190 | "source": [ 191 | "### Close connection" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 37, 197 | "id": "8acde9bf-e654-4e7c-91c1-5b653d7ecd1a", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "client.close()" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "id": "48a83a75-144e-466f-bbd1-31e9b26b074b", 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [] 211 | } 212 | ], 213 | "metadata": { 214 | "kernelspec": { 215 | "display_name": "Python 3 (ipykernel)", 216 | "language": "python", 217 | "name": "python3" 218 | }, 219 | "language_info": { 220 | "codemirror_mode": { 221 | "name": "ipython", 222 | "version": 3 223 | }, 224 | "file_extension": ".py", 225 | "mimetype": "text/x-python", 226 | "name": "python", 227 | "nbconvert_exporter": "python", 228 | "pygments_lexer": "ipython3", 229 | "version": "3.11.4" 230 | } 231 | }, 232 | "nbformat": 4, 233 | "nbformat_minor": 5 234 | } 235 | --------------------------------------------------------------------------------
/rightmove/orchestration/airflow_app/dags/rightmove/data_processing/rightmove_processing.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from apache_beam.io.mongodbio import ReadFromMongoDB, WriteToMongoDB 4 | 5 | import apache_beam as beam 6 | from apache_beam.options.pipeline_options import PipelineOptions 7 | import logging 8 | from sklearn.neighbors import BallTree 9 | import pandas as pd 10 | import math 11 | import numpy as np 12 | import datetime 13 | from math import radians 14 | 15 | from pymongo import MongoClient 16 | 17 | # from dotenv import load_dotenv 18 | # load_dotenv("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/.env") 19 | 20 | MONGO_URI = os.environ.get("MONGO_URI") 21 | 22 | GCS_PARQUET_URL = "https://storage.googleapis.com/rightmove-resources-public/UK_pois.parquet" # TODO: Make this private 23 | WALK_SCORES_COLLECTION = "walk_scores" 24 | 25 | BATCH_SIZE = 50 26 | 27 | 28 | class ProcessElement(beam.DoFn): 29 | def fetch_current_ids(self): 30 | client = MongoClient(MONGO_URI) 31 | db = client["rightmove"] 32 | collection = db[WALK_SCORES_COLLECTION] 33 | query = {} 34 | data = collection.find(query, {"id": 1}) 35 | return [x["id"] for x in list(data)] 36 | 37 | def process_results_df(self, distance_series, pois_df): 38 | results_df = pd.DataFrame(distance_series) 39 | 40 | results_df = results_df.join(pois_df["amenities"], how="left") 41 | 42 | results_df["distance_in_metres"] = results_df["distance"].apply( 43 | lambda x: x * self.earth_radius 44 | ) 45 | 46 | results_df["distance_decayed"] = results_df["distance_in_metres"].apply( 47 | lambda x: float(self.distance_decay(x)) 48 | ) 49 | 50 | return results_df 51 | 52 | def distance_decay(sefl, distance): 53 | M = float(1)
54 | dist = distance / 1000 55 | score = math.e ** ((-5.0 * (dist / 4)) ** 5.0) 56 | return score 57 | 58 | def calculate_amenity_walk_score(self, property_distance_df, amenity, weights): 59 | k = len(weights) 60 | weight_array = np.array(weights) 61 | 62 | dist_array = ( 63 | property_distance_df[property_distance_df["amenities"] == amenity] 64 | .iloc[0:k]["distance_decayed"] 65 | .values 66 | ) 67 | dist_array_padded = np.pad( 68 | dist_array, (0, weight_array.size - dist_array.size), "constant" 69 | ) 70 | 71 | scores_array = dist_array_padded * weight_array 72 | 73 | amenity_score = scores_array.sum() 74 | 75 | return amenity_score 76 | 77 | def calculuate_walk_score(self, property, ball_tree, amenity_weights, pois_df): 78 | property_id = property["id"] 79 | latitude = property["location"]["latitude"] 80 | longitude = property["location"]["longitude"] 81 | 82 | radian_longitude = radians(longitude) 83 | radian_latitude = radians(latitude) 84 | 85 | k = 100 # Maximum number of amenities to return 86 | 87 | distances, indices = ball_tree.query( 88 | [[radian_longitude, radian_latitude]], k=k, return_distance=True 89 | ) 90 | 91 | dist_series = pd.Series(distances[0], index=indices[0], name="distance") 92 | 93 | results_df = self.process_results_df(dist_series, pois_df) 94 | 95 | # print(results_df) 96 | 97 | scores_dict = {} 98 | 99 | walk_score = 0 100 | 101 | for key, values in amenity_weights.items(): 102 | amenity_score = self.calculate_amenity_walk_score(results_df, key, values) 103 | 104 | scores_dict[key] = amenity_score 105 | 106 | return scores_dict 107 | 108 | def setup(self): 109 | self.earth_radius = 6371000 # Earth radius in metres 110 | self.pois_df = pd.read_parquet(GCS_PARQUET_URL) 111 | self.ball_tree = BallTree( 112 | self.pois_df[["lon_rad", "lat_rad"]].values, metric="haversine" 113 | ) # What is the ball tree doing? 
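        # What the BallTree above is doing: scikit-learn's BallTree indexes the POI
        # coordinates (already converted to radians) so the nearest amenities to a
        # property can be found with a single k-nearest-neighbour query instead of a
        # scan over every point of interest. With metric="haversine" the query returns
        # great-circle distances as angles, which process_results_df() converts to
        # metres by multiplying by self.earth_radius. (scikit-learn documents the
        # haversine metric as expecting [latitude, longitude] column order; both the
        # tree build and the query here pass [longitude, latitude], so the two are at
        # least mutually consistent.)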
114 | self.amenity_weights = { 115 | "grocery": [3], 116 | "restaurants": [ 117 | 0.75, 118 | 0.45, 119 | 0.25, 120 | 0.25, 121 | 0.225, 122 | 0.225, 123 | 0.225, 124 | 0.225, 125 | 0.2, 126 | 0.2, 127 | ], 128 | "shopping": [0.5, 0.45, 0.4, 0.35, 0.3], 129 | "coffee": [1.25, 0.75], 130 | "banks": [1], 131 | "parks": [1], 132 | "schools": [1], 133 | "books": [1], 134 | "entertainment": [1], 135 | } 136 | self.processed_ids = self.fetch_current_ids() 137 | 138 | def process(self, element): # TODO: ADD ID processing to avoid duplicate processing 139 | logging.info(f"Processing element: {len(element)}") 140 | for ele in element: 141 | if ele["id"] not in self.processed_ids: 142 | property = {"id": ele["id"], "location": ele["location"]} 143 | logging.info(f"Processing property: {property}") 144 | scores_dict = self.calculuate_walk_score( 145 | property, self.ball_tree, self.amenity_weights, self.pois_df 146 | ) 147 | walk_score = sum(scores_dict.values()) * 6.67 148 | scores_dict["walk_score"] = walk_score 149 | 150 | property["scores"] = scores_dict 151 | 152 | property[ 153 | "processing_timestamp" 154 | ] = datetime.datetime.utcnow().timestamp() 155 | 156 | yield property 157 | else: 158 | logging.info(f"Property already processed: {ele['id']}") 159 | continue 160 | 161 | 162 | def run(): 163 | with beam.Pipeline(options=PipelineOptions()) as pipeline: 164 | ( 165 | pipeline 166 | | "Read from Mongo" 167 | >> ReadFromMongoDB( 168 | uri=MONGO_URI, db="rightmove", coll="properties", bucket_auto=True 169 | ) # Only return the id and the location 170 | | "Batch Elements" 171 | >> beam.BatchElements(min_batch_size=BATCH_SIZE, max_batch_size=BATCH_SIZE) 172 | | "Process each element" >> beam.ParDo(ProcessElement()) 173 | | "Write to MongoDB" 174 | >> WriteToMongoDB( 175 | uri=MONGO_URI, 176 | db="rightmove", 177 | coll=WALK_SCORES_COLLECTION, 178 | batch_size=10, 179 | ) 180 | ) 181 | 182 | 183 | if __name__ == "__main__": 184 | logging.getLogger().setLevel(logging.INFO) 185 | run() 186 | -------------------------------------------------------------------------------- /notebooks/data_ingestion/scrapy_connection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5e5492f1-7a93-4509-a5d6-99ac7c56dadb", 6 | "metadata": {}, 7 | "source": [ 8 | "This notebook is to test the integration with scrapyd to scehdule runs." 
9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "f53b22b9-f901-4000-bc8b-822f87abb6e2", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import requests" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "bdfd9578-2d5d-4a08-a51c-e919a93774ba", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "spider = \"rightmove\"\n", 29 | "project = \"rightmove_scraper\"" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "e5fb7732-c1ef-45a4-83d4-118818da84ed", 35 | "metadata": {}, 36 | "source": [ 37 | "## Schedule job" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 15, 43 | "id": "5268cded-e321-4c5d-a11d-228cc52615b6", 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "{\"node_name\": \"d9198cfc8c27\", \"status\": \"ok\", \"jobid\": \"c3fc2a8a936b11eebe730242ac1b0006\"}\n", 51 | "\n", 52 | "Request successful\n", 53 | "Job started\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "SCRAPYD_ENDPOINT = \"http://localhost:6800/schedule.json\"\n", 59 | "\n", 60 | "spider = \"rightmove\"\n", 61 | "project = \"rightmove_scraper\"\n", 62 | "payload = f\"project={project}&spider={spider}\"\n", 63 | "headers = {\n", 64 | " 'Content-Type': 'application/x-www-form-urlencoded'\n", 65 | "}\n", 66 | "\n", 67 | "response = requests.request(\"POST\", SCRAPYD_ENDPOINT, headers=headers, data=payload)\n", 68 | "\n", 69 | "print(response.text)\n", 70 | "if response.status_code == 200:\n", 71 | " print(\"Request successful\")\n", 72 | " if response.json()['status'] == 'ok':\n", 73 | " print(\"Job started\")\n", 74 | " job_id = response.json()['jobid']\n", 75 | "else:\n", 76 | " print(response.text)\n", 77 | " raise ValueError(\"Request failed\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 16, 83 | "id": "2ea697a8-f1b1-4c34-8759-8457ed6001f8", 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "'c3fc2a8a936b11eebe730242ac1b0006'" 90 | ] 91 | }, 92 | "execution_count": 16, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "job_id" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "cf474c91-356f-4d90-8765-85564cb3266d", 104 | "metadata": {}, 105 | "source": [ 106 | "## Check job" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 17, 112 | "id": "c7152ebb-c49e-4b6f-812c-82c425a2794c", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "{\"node_name\": \"d9198cfc8c27\", \"status\": \"ok\", \"pending\": [], \"running\": [{\"project\": \"rightmove_scraper\", \"spider\": \"rightmove\", \"id\": \"c3fc2a8a936b11eebe730242ac1b0006\", \"pid\": 45, \"start_time\": \"2023-12-05 12:42:39.979369\"}], \"finished\": [{\"project\": \"rightmove_scraper\", \"spider\": \"rightmove\", \"id\": \"cddc3cf8936a11eebe730242ac1b0006\", \"start_time\": \"2023-12-05 12:35:44.985567\", \"end_time\": \"2023-12-05 12:37:23.300061\", \"log_url\": \"/logs/rightmove_scraper/rightmove/cddc3cf8936a11eebe730242ac1b0006.log\", \"items_url\": \"/items/rightmove_scraper/rightmove/cddc3cf8936a11eebe730242ac1b0006.jl\"}, {\"project\": \"rightmove_scraper\", \"spider\": \"rightmove\", \"id\": \"93ae7428936b11eebe730242ac1b0006\", \"start_time\": \"2023-12-05 12:41:15.135047\", \"end_time\": \"2023-12-05 12:42:13.706062\", \"log_url\": 
\"/logs/rightmove_scraper/rightmove/93ae7428936b11eebe730242ac1b0006.log\", \"items_url\": \"/items/rightmove_scraper/rightmove/93ae7428936b11eebe730242ac1b0006.jl\"}]}\n", 120 | "\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "import requests\n", 126 | "\n", 127 | "url = f\"http://localhost:6800/listjobs.json?project={project}\"\n", 128 | "\n", 129 | "payload = {}\n", 130 | "headers = {}\n", 131 | "\n", 132 | "response = requests.request(\"GET\", url, headers=headers, data=payload)\n", 133 | "\n", 134 | "print(response.text)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "93ec794d-9cc7-44bb-905a-4650404eeeb0", 140 | "metadata": {}, 141 | "source": [ 142 | "## Cancel Job" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 18, 148 | "id": "60c90559-2f28-4d7a-aa12-f601ba6ec8bd", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "{\"node_name\": \"d9198cfc8c27\", \"status\": \"ok\", \"prevstate\": \"running\"}\n", 156 | "\n", 157 | "Request successful\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "SCRAPYD_ENDPOINT = \"http://localhost:6800/cancel.json\"\n", 163 | "\n", 164 | "spider = \"rightmove\"\n", 165 | "project = \"rightmove_scraper\"\n", 166 | "job_id = \"c3fc2a8a936b11eebe730242ac1b0006\"\n", 167 | "payload = f\"project={project}&job={job_id}\"\n", 168 | "headers = {\n", 169 | " 'Content-Type': 'application/x-www-form-urlencoded'\n", 170 | "}\n", 171 | "\n", 172 | "response = requests.request(\"POST\", SCRAPYD_ENDPOINT, headers=headers, data=payload)\n", 173 | "\n", 174 | "print(response.text)\n", 175 | "if response.status_code == 200:\n", 176 | " print(\"Request successful\")\n", 177 | "else:\n", 178 | " print(response.text)\n", 179 | " raise ValueError(\"Request failed\")" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "id": "b44e1572-9bc0-4e57-be99-6b25c15201a7", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3 (ipykernel)", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.11.4" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 5 212 | } 213 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Rightmove Rental Prediction System 2 | 3 | In recent months, I've deepened my expertise in creating machine learning (ML) systems through comprehensive study and application of three pivotal areas: Data Engineering, MLOps, and ML Engineering, as structured by the Data Talk Club courses. My project, the Rightmove Rental Prediction System, encapsulates this journey, demonstrating a cohesive application of these skills. 4 | 5 | The essence of this project lies in its comprehensive architecture, designed to predict rental prices with precision. It integrates: 6 | 7 | 1. **Data Engineering** through an asynchronous web scraper and batch ingestion pipelines, enabling efficient data extraction and preprocessing. 8 | 2. 
**ML Engineering** with a focus on model training and feature engineering, including the development of an innovative "Walk Score." 9 | 3. **MLOps** by implementing monitoring practices to ensure the system's reliability and performance over time. 10 | 11 | ### **Project Components** 12 | 13 | 1. **Extraction and Data Processing Pipeline**: Automated to handle large-scale data extraction, cleaning, and preparation. 14 | 2. **ML Training Pipeline**: Designed for iterative experimentation and training, leveraging a RandomForest model among others, to identify the most effective prediction method. 15 | 3. **MLOps Monitoring Pipeline**: Ensures model performance remains optimal through continuous monitoring for data drift and other potential issues. 16 | 4. **Model Serving API**: Utilizes FastAPI for efficient model deployment, allowing real-time predictions. 17 | 5. **Visualization Dashboard**: Built with Streamlit and Grafana, offering insightful data visualizations and monitoring dashboards to track system performance and data quality. 18 | 19 | ### **Infrastructure and Deployment** 20 | 21 | My approach combines DevOps and software engineering principles, employing Terraform for infrastructure management and Docker Compose for containerization, across both AWS and GCP platforms. This dual-cloud strategy not only leverages the strengths of both services but also optimizes costs through their free tier options. 22 | 23 | ### **ML and MLOps Implementation** 24 | 25 | The project showcases my ML and MLOps expertise through the development of a RandomForest model, enhanced by a unique feature, the Walk Score, to improve predictive accuracy. MLFlow serves as the backbone for experiment tracking and model registry, facilitating the model's evolution and serving. 26 | 27 | ### **Data Extraction and Processing** 28 | 29 | ![Data Extraction Pipeline](/static/images/Processing_pipeline_rightmove.png) 30 | 31 | 32 | Choosing Rightmove, a leading UK property listing site, as the data source, I developed a Scrapy spider deployed on a Scrapyd server. This setup enhances control over scraping activities and integrates seamlessly with Airflow for orchestration, ensuring ethical data usage and compliance with best practices. 33 | 34 | Data storage is managed through PostgreSQL and MongoDB, supporting structured and unstructured data, respectively. This configuration not only facilitates efficient data management but also integrates a custom Beam job to compute the Walk Score for enhanced model input. 35 | 36 | ### **ML Training with MLFlow** 37 | 38 | ![ML Training pipeline](/static/images/model_training_pipeline.png) 39 | 40 | For the ML training component, MLFlow played a critical role as a central hub for experiment tracking, model versioning, and serving. This tool allowed for a systematic approach to managing the lifecycle of machine learning models. Here's how it was integrated into the workflow: 41 | 42 | - **Experiment Tracking**: Every training run, along with its parameters, metrics, and outcomes, was logged in MLFlow. This facilitated a comprehensive analysis of each experiment, enabling quick iteration over models to find the best performing ones based on Root Mean Squared Error (RMSE) metrics. 43 | - **Model Registry**: The most promising models, particularly the RandomForest model which outperformed others including XGBoost, were registered in MLFlow's model registry. This registry acted as a repository, making it simple to version, store, and access models for deployment. 
44 | - **Model Serving**: MLFlow also streamlined the deployment process. The serving component fetched the latest and most effective model version directly from the registry, ensuring that the prediction service always utilized the best available model. 45 | 46 | The use of MLFlow not only brought organization and efficiency to the model training process but also ensured transparency and reproducibility, which are essential for collaboration and continuous improvement in ML projects. 47 | 48 | ## **DevOps and Scraper Monitoring** 49 | 50 | The Rightmove Rental Prediction System employs a focused approach to monitor its web scraping operations, leveraging Grafana and PostgreSQL for a streamlined and effective oversight. 51 | 52 | ### **Monitoring Framework** 53 | 54 | **Grafana Dashboard**: Provides real-time visualization of critical metrics such as success rates, error counts, and response times. This dashboard enables quick identification of performance issues or errors in the web scraping process. 55 | 56 | **PostgreSQL**: Acts as the storage backbone for logging detailed metrics from each scraping session. This includes timestamps, counts of extracted records, and error logs, offering a comprehensive view for analysis and troubleshooting. 57 | 58 | ### **Key Objectives** 59 | 60 | - **Efficiency and Error Management**: Monitoring ensures the scraper runs efficiently, with a quick response to any errors or bottlenecks. 61 | - **Compliance and Rate Limiting**: Keeps the scraping activities within ethical and legal boundaries by tracking request rates and adherence to site policies. 62 | 63 | ### **DevOps Integration** 64 | 65 | The setup integrates seamlessly with our DevOps practices, with Grafana alerts configured to trigger automated actions or notifications for immediate attention, ensuring the system's robustness and reliability. 66 | 67 | #### System Monitoring 68 | ![Extraction Monitoring](/static/images/scrapy_monitoring.png) 69 | 70 | System Monitoring of Scrapy Sessions 71 | 72 | ## **MLOps** 73 | 74 | ![MLOps Diagram](/static/images/mlops_pipeline.png) 75 | 76 | Understanding and mitigating concept drift and data drift are critical for maintaining the performance of ML models in production. Here’s how these challenges were approached: 77 | 78 | - **Concept Drift**: This occurs when the statistical properties of the target variable, which the model is trying to predict, change over time. This can degrade the model's performance because the patterns the model learned during training may no longer apply. To detect concept drift, the monitoring pipeline employed statistical tests and comparisons between predictions and actual outcomes over time. When significant drift was detected, a model retraining workflow was triggered, incorporating new data to adapt the model to the current reality. 79 | - **Data Drift**: Data drift refers to changes in the input data's distribution. It's crucial to monitor because even if the target variable's distribution remains the same, changes in input data can lead to poor model performance. The project utilized Evidently to monitor key features' distributions, comparing incoming data against a historical baseline (the golden dataset). Alerts were set up to notify when data drift exceeded predefined thresholds, prompting an evaluation to determine if model retraining or adjustment in data preprocessing steps was necessary. 
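To make the drift checks concrete, the snippet below is a minimal sketch of the kind of Evidently report the monitoring pipeline builds. The two small frames are stand-ins for the golden dataset and the latest scored batch (which the real DAG loads from storage), and the result keys read at the end are the same ones the monitoring code extracts:

```python
import pandas as pd
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# Stand-in frames: "reference" plays the golden training snapshot, "current" the latest batch.
reference_df = pd.DataFrame(
    {"bedrooms": [1, 2, 3, 2, 1], "bathrooms": [1, 1, 2, 2, 1], "walk_score": [55.0, 70.0, 62.0, 80.0, 58.0]}
)
current_df = pd.DataFrame(
    {"bedrooms": [2, 3, 4, 3, 4], "bathrooms": [1, 2, 2, 3, 2], "walk_score": [40.0, 45.0, 50.0, 42.0, 38.0]}
)

report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=reference_df, current_data=current_df)

# The dataset-level drift verdict and the share of drifted columns are what drive the alerts.
summary = report.as_dict()["metrics"][0]["result"]
print(summary["dataset_drift"], summary["share_of_drifted_columns"])
```

When the dataset-level drift flag flips to true, or the share of drifted columns crosses the alert threshold, the retraining flow described below is triggered.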
80 | 81 | #### ML Model Monitoring 82 | ![Model Monitoring](/static/images/model_monitoring.png) 83 | 84 | MLOps monitoring of Data and Concept Drift 85 | 86 | ### **Addressing Change** 87 | 88 | Grafana also enables automated actions: if the model's prediction performance drops below a defined threshold, an automatic retraining run is triggered on new data that reflects the latest patterns, keeping the model up to date without manual intervention. 89 | -------------------------------------------------------------------------------- /notebooks/resources/data/property_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": 142659089, 3 | "bedrooms": 2, 4 | "bathrooms": 2, 5 | "numberOfImages": 9, 6 | "numberOfFloorplans": 0, 7 | "numberOfVirtualTours": 1, 8 | "summary": "Property Reference: 1915996.This 2 bedroom house is in the popular area of Mugiemoss, Bucksburn and is available for let from the start of January 2024The property boasts an enviable location within an established community and with an easy commute to the city centre. Good quality laminate and carpet along with blinds and white goods are all included with the house. Upstairs there are two double bedrooms, one featuring a built in wardrobe. There is also a three piece bathroom suite upstairs with a shower over the bath and there is a separate downstairs cloakroom.The spacious open plan lounge/kitchen features a built in oven and hob, washer-dryer and fridge freezer.Outside there is a fully enclosed garden and also a residents parking area.A deposit of one month's rent is required and eligibility criteria will need to be met. This property is suitable for pets with an additional 50% deposit.\u00a0Scottish charity number: SCO44825, **some photos are for illustration purposes only**Summary & Exclusions:- Rent Amount: \u00a3632.42 per month (\u00a3145.94 per week)- Deposit / Bond: \u00a3632.41- 2 Bedrooms- 2 Bathrooms- Property comes unfurnished- Available to move in from 05 January, 2024- Maximum number of tenants is 3- DSS enquiries welcome- Students welcome to enquire- Pets considered / by arrangement- No Smokers- Family Friendly- Bills not included- Property has parking- Property has garden access- EPC Rating: B If calling, please quote reference: 1915996 Fees:You will not be charged any admin fees. ** Contact today to book a viewing and have the landlord show you round! 
** Request Details form responded to 24/7, with phone bookings available 9am-9pm, 7 days a week.OpenRent is on the Scottish Letting Agent Register, registration number LARN1809026The landlord is on the Scottish Landlord Register, registration number 453083/100/19571", 9 | "displayAddress": "Mugiemoss Road, Aberdeen, AB21", 10 | "countryCode": "GB", 11 | "location": { 12 | "latitude": 57.17924, 13 | "longitude": -2.16878 14 | }, 15 | "propertyImages": { 16 | "images": [ 17 | { 18 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_00_0000_max_476x317.jpeg", 19 | "url": "97k/96668/142659089/96668_191599604122023_IMG_00_0000.jpeg", 20 | "caption": null 21 | }, 22 | { 23 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_01_0000_max_476x317.jpeg", 24 | "url": "97k/96668/142659089/96668_191599604122023_IMG_01_0000.jpeg", 25 | "caption": null 26 | }, 27 | { 28 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_02_0000_max_476x317.jpeg", 29 | "url": "97k/96668/142659089/96668_191599604122023_IMG_02_0000.jpeg", 30 | "caption": null 31 | }, 32 | { 33 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_03_0000_max_476x317.jpeg", 34 | "url": "97k/96668/142659089/96668_191599604122023_IMG_03_0000.jpeg", 35 | "caption": null 36 | }, 37 | { 38 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_04_0000_max_476x317.jpeg", 39 | "url": "97k/96668/142659089/96668_191599604122023_IMG_04_0000.jpeg", 40 | "caption": null 41 | }, 42 | { 43 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_05_0000_max_476x317.jpeg", 44 | "url": "97k/96668/142659089/96668_191599604122023_IMG_05_0000.jpeg", 45 | "caption": null 46 | }, 47 | { 48 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_06_0000_max_476x317.jpeg", 49 | "url": "97k/96668/142659089/96668_191599604122023_IMG_06_0000.jpeg", 50 | "caption": null 51 | }, 52 | { 53 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_07_0000_max_476x317.jpeg", 54 | "url": "97k/96668/142659089/96668_191599604122023_IMG_07_0000.jpeg", 55 | "caption": null 56 | }, 57 | { 58 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_08_0000_max_476x317.jpeg", 59 | "url": "97k/96668/142659089/96668_191599604122023_IMG_08_0000.jpeg", 60 | "caption": null 61 | } 62 | ], 63 | "mainImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_00_0000_max_476x317.jpeg", 64 | "mainMapImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/97k/96668/142659089/96668_191599604122023_IMG_00_0000_max_296x197.jpeg" 65 | }, 66 | "propertySubType": "Terraced", 67 | "listingUpdate": { 68 | "listingUpdateReason": "new", 69 | "listingUpdateDate": "2023-12-04T11:54:03Z" 70 | }, 71 | "premiumListing": false, 72 | "featuredProperty": false, 73 | "price": { 74 | "amount": 632, 75 | "frequency": "monthly", 76 | "currencyCode": "GBP", 77 | "displayPrices": [ 78 | { 79 | "displayPrice": "\u00a3632 pcm", 80 | "displayPriceQualifier": "" 81 | }, 82 | { 83 | "displayPrice": "\u00a3146 pw", 84 | "displayPriceQualifier": "" 85 | 
} 86 | ] 87 | }, 88 | "customer": { 89 | "branchId": 96668, 90 | "brandPlusLogoURI": "/company/clogo_rmchoice_37106_0000.png", 91 | "contactTelephone": "020 3322 3265", 92 | "branchDisplayName": "OpenRent, London", 93 | "branchName": "London", 94 | "brandTradingName": "OpenRent", 95 | "branchLandingPageUrl": "/estate-agents/agent/OpenRent/London-96668.html", 96 | "development": false, 97 | "showReducedProperties": true, 98 | "commercial": false, 99 | "showOnMap": true, 100 | "enhancedListing": false, 101 | "developmentContent": null, 102 | "buildToRent": false, 103 | "buildToRentBenefits": [], 104 | "brandPlusLogoUrl": "https://media.rightmove.co.uk:443/dir/company/clogo_rmchoice_37106_0000_max_100x50.png" 105 | }, 106 | "distance": null, 107 | "transactionType": "rent", 108 | "productLabel": { 109 | "productLabelText": "", 110 | "spotlightLabel": false 111 | }, 112 | "commercial": false, 113 | "development": false, 114 | "residential": true, 115 | "students": false, 116 | "auction": false, 117 | "feesApply": false, 118 | "feesApplyText": null, 119 | "displaySize": "", 120 | "showOnMap": true, 121 | "propertyUrl": "/properties/142659089#/?channel=RES_LET", 122 | "contactUrl": "/property-to-rent/contactBranch.html?propertyId=142659089", 123 | "staticMapUrl": null, 124 | "channel": "RENT", 125 | "firstVisibleDate": "2023-12-04T11:48:11Z", 126 | "keywords": [], 127 | "keywordMatchType": "no_keyword", 128 | "saved": false, 129 | "hidden": false, 130 | "onlineViewingsAvailable": false, 131 | "lozengeModel": { 132 | "matchingLozenges": [] 133 | }, 134 | "hasBrandPlus": true, 135 | "displayStatus": "", 136 | "enquiredTimestamp": null, 137 | "heading": "", 138 | "isRecent": true, 139 | "enhancedListing": false, 140 | "formattedBranchName": " by OpenRent, London", 141 | "formattedDistance": "", 142 | "propertyTypeFullDescription": "2 bedroom terraced house", 143 | "addedOrReduced": "Added today", 144 | "feature_list": [ 145 | "No Agent Fees", 146 | "Students Can Enquire", 147 | "Property Reference Number: 1915996" 148 | ] 149 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.27.2 2 | affine==2.4.0 3 | aiohttp==3.9.3 4 | aiosignal==1.3.1 5 | alembic==1.13.1 6 | altair==5.2.0 7 | annotated-types==0.6.0 8 | anyio==4.2.0 9 | apache-airflow==2.8.1 10 | apache-airflow-providers-common-io==1.2.0 11 | apache-airflow-providers-common-sql==1.10.1 12 | apache-airflow-providers-ftp==3.7.0 13 | apache-airflow-providers-http==4.9.0 14 | apache-airflow-providers-imap==3.5.0 15 | apache-airflow-providers-sqlite==3.7.0 16 | apache-beam==2.52.0 17 | apispec==6.4.0 18 | appdirs==1.4.4 19 | appnope==0.1.3 20 | argcomplete==3.2.2 21 | argon2-cffi==23.1.0 22 | argon2-cffi-bindings==21.2.0 23 | arrow==1.3.0 24 | asgiref==3.7.2 25 | asttokens==2.4.1 26 | async-lru==2.0.4 27 | attrs==23.1.0 28 | Automat==22.10.0 29 | Babel==2.14.0 30 | backoff==2.2.1 31 | beautifulsoup4==4.12.2 32 | bleach==6.1.0 33 | blinker==1.7.0 34 | cachelib==0.9.0 35 | cachetools==5.3.2 36 | certifi==2023.11.17 37 | cffi==1.16.0 38 | charset-normalizer==3.3.2 39 | click==8.1.7 40 | click-plugins==1.1.1 41 | clickclick==20.10.2 42 | cligj==0.7.2 43 | cloudpickle==2.2.1 44 | colorama==0.4.6 45 | colorlog==4.8.0 46 | comm==0.2.1 47 | ConfigUpdater==3.2 48 | connexion==2.14.2 49 | constantly==23.10.4 50 | contextily==1.5.0 51 | contourpy==1.2.0 52 | crcmod==1.7 53 | cron-descriptor==1.4.3 54 | 
croniter==2.0.1 55 | cryptography==41.0.7 56 | cssselect==1.2.0 57 | cycler==0.12.1 58 | cykhash==2.0.1 59 | Cython==3.0.8 60 | databricks-cli==0.18.0 61 | datasets==2.17.1 62 | debugpy==1.8.0 63 | decorator==5.1.1 64 | defusedxml==0.7.1 65 | Deprecated==1.2.14 66 | dill==0.3.8 67 | distro==1.9.0 68 | dnspython==2.4.2 69 | docker==7.0.0 70 | docopt==0.6.2 71 | docutils==0.20.1 72 | email-validator==2.1.0.post1 73 | entrypoints==0.4 74 | evidently==0.4.14 75 | executing==2.0.1 76 | fastapi==0.109.0 77 | fastapi-restful==0.5.0 78 | fastavro==1.9.0 79 | fasteners==0.19 80 | fastjsonschema==2.19.1 81 | filelock==3.13.1 82 | fiona==1.9.5 83 | Flask==2.2.5 84 | Flask-AppBuilder==4.3.10 85 | Flask-Babel==2.0.0 86 | Flask-Caching==2.1.0 87 | Flask-JWT-Extended==4.6.0 88 | Flask-Limiter==3.5.1 89 | Flask-Login==0.6.3 90 | Flask-Session==0.6.0 91 | Flask-SQLAlchemy==2.5.1 92 | Flask-WTF==1.2.1 93 | fonttools==4.47.2 94 | fqdn==1.5.1 95 | frozenlist==1.4.1 96 | fsspec==2023.10.0 97 | gcsfs==2024.2.0 98 | geographiclib==2.0 99 | geopandas==0.14.3 100 | geopy==2.4.1 101 | gitdb==4.0.11 102 | GitPython==3.1.41 103 | google-api-core==2.17.0 104 | google-auth==2.27.0 105 | google-auth-oauthlib==1.2.0 106 | google-cloud==0.34.0 107 | google-cloud-core==2.4.1 108 | google-cloud-storage==2.14.0 109 | google-crc32c==1.5.0 110 | google-re2==1.1 111 | google-resumable-media==2.7.0 112 | googleapis-common-protos==1.62.0 113 | grpcio==1.59.3 114 | gunicorn==21.2.0 115 | h11==0.14.0 116 | hdfs==2.7.3 117 | httpcore==1.0.2 118 | httplib2==0.22.0 119 | httpx==0.26.0 120 | huggingface-hub==0.20.3 121 | hyperlink==21.0.0 122 | idna==3.6 123 | importlib-metadata==6.11.0 124 | importlib-resources==6.1.1 125 | incremental==22.10.0 126 | inflection==0.5.1 127 | iniconfig==2.0.0 128 | ipykernel==6.29.0 129 | ipython==8.20.0 130 | ipython-genutils==0.2.0 131 | ipywidgets==8.1.1 132 | isoduration==20.11.0 133 | itemadapter==0.8.0 134 | itemloaders==1.1.0 135 | iterative-telemetry==0.0.8 136 | itsdangerous==2.1.2 137 | jedi==0.19.1 138 | Jinja2==3.1.3 139 | jmespath==1.0.1 140 | joblib==1.3.2 141 | Js2Py==0.74 142 | json5==0.9.14 143 | jsonpointer==2.4 144 | jsonschema==4.20.0 145 | jsonschema-specifications==2023.11.2 146 | jupyter==1.0.0 147 | jupyter-console==6.6.3 148 | jupyter-contrib-core==0.4.2 149 | jupyter-contrib-nbextensions==0.7.0 150 | jupyter-events==0.9.0 151 | jupyter-highlight-selected-word==0.2.0 152 | jupyter-lsp==2.2.2 153 | jupyter-nbextensions-configurator==0.6.3 154 | jupyter_client==8.6.0 155 | jupyter_core==5.7.1 156 | jupyter_server==2.12.5 157 | jupyter_server_terminals==0.5.2 158 | jupyterlab==4.0.11 159 | jupyterlab-widgets==3.0.9 160 | jupyterlab_pygments==0.3.0 161 | jupyterlab_server==2.25.2 162 | kiwisolver==1.4.5 163 | lazy-object-proxy==1.10.0 164 | limits==3.7.0 165 | linkify-it-py==2.0.3 166 | lockfile==0.12.2 167 | lxml==4.9.3 168 | Mako==1.3.0 169 | Markdown==3.5.2 170 | markdown-it-py==3.0.0 171 | MarkupSafe==2.1.4 172 | marshmallow==3.20.2 173 | marshmallow-oneofschema==3.1.1 174 | marshmallow-sqlalchemy==0.26.1 175 | matplotlib==3.8.2 176 | matplotlib-inline==0.1.6 177 | mdit-py-plugins==0.4.0 178 | mdurl==0.1.2 179 | mercantile==1.2.1 180 | mistune==3.0.2 181 | mlflow==2.10.2 182 | mpmath==1.3.0 183 | multidict==6.0.5 184 | multiprocess==0.70.16 185 | mypy-extensions==1.0.0 186 | nbclient==0.9.0 187 | nbconvert==7.14.2 188 | nbformat==5.9.2 189 | nest-asyncio==1.6.0 190 | networkx==3.2.1 191 | nltk==3.8.1 192 | notebook==7.0.7 193 | notebook_shim==0.2.3 194 | numpy==1.24.4 195 | 
oauthlib==3.2.2 196 | objsize==0.6.1 197 | opentelemetry-api==1.22.0 198 | opentelemetry-exporter-otlp==1.22.0 199 | opentelemetry-exporter-otlp-proto-common==1.22.0 200 | opentelemetry-exporter-otlp-proto-grpc==1.22.0 201 | opentelemetry-exporter-otlp-proto-http==1.22.0 202 | opentelemetry-proto==1.22.0 203 | opentelemetry-sdk==1.22.0 204 | opentelemetry-semantic-conventions==0.43b0 205 | ordered-set==4.1.0 206 | orjson==3.9.13 207 | overrides==7.7.0 208 | packaging==23.2 209 | pandas==2.2.0 210 | pandocfilters==1.5.1 211 | parsel==1.8.1 212 | parso==0.8.3 213 | pathspec==0.12.1 214 | patsy==0.5.6 215 | pendulum==3.0.0 216 | pexpect==4.9.0 217 | pillow==10.2.0 218 | platformdirs==4.1.0 219 | plotly==5.18.0 220 | pluggy==1.4.0 221 | prison==0.2.1 222 | prometheus-client==0.19.0 223 | prompt-toolkit==3.0.43 224 | Protego==0.3.0 225 | proto-plus==1.22.3 226 | protobuf==4.25.1 227 | psutil==5.9.8 228 | psycopg==3.1.17 229 | psycopg2==2.9.9 230 | psycopg2-binary==2.9.9 231 | ptyprocess==0.7.0 232 | pure-eval==0.2.2 233 | pyarrow==15.0.0 234 | pyarrow-hotfix==0.6 235 | pyasn1==0.5.1 236 | pyasn1-modules==0.3.0 237 | pycparser==2.21 238 | pydantic==2.6.1 239 | pydantic_core==2.16.2 240 | pydeck==0.8.0 241 | PyDispatcher==2.0.7 242 | pydot==1.4.2 243 | Pygments==2.17.2 244 | pyjsparser==2.7.1 245 | PyJWT==2.8.0 246 | pymongo==4.6.1 247 | PyMySQL==1.1.0 248 | pyOpenSSL==23.3.0 249 | pyparsing==3.1.1 250 | pyproj==3.6.1 251 | pyrobuf==0.9.3 252 | pyrosm==0.6.2 253 | pytest==8.0.2 254 | python-daemon==3.0.1 255 | python-dateutil==2.8.2 256 | python-dotenv==1.0.1 257 | python-json-logger==2.0.7 258 | python-nvd3==0.15.0 259 | python-rapidjson==1.14 260 | python-slugify==8.0.4 261 | pytz==2023.3.post1 262 | PyYAML==6.0.1 263 | pyzmq==25.1.2 264 | qtconsole==5.5.1 265 | QtPy==2.4.1 266 | querystring-parser==1.2.4 267 | queuelib==1.6.2 268 | rasterio==1.3.9 269 | referencing==0.31.1 270 | regex==2023.10.3 271 | requests==2.31.0 272 | requests-file==1.5.1 273 | requests-oauthlib==1.3.1 274 | requests-toolbelt==1.0.0 275 | rfc3339-validator==0.1.4 276 | rfc3986-validator==0.1.1 277 | rich==13.7.0 278 | rich-argparse==1.4.0 279 | rpds-py==0.13.2 280 | rsa==4.9 281 | safetensors==0.4.2 282 | scikit-learn==1.3.2 283 | scipy==1.12.0 284 | Scrapy==2.11.0 285 | scrapyd==1.4.3 286 | scrapyd-client==1.2.3 287 | seaborn==0.13.2 288 | Send2Trash==1.8.2 289 | service-identity==23.1.0 290 | setproctitle==1.3.3 291 | shapely==2.0.2 292 | six==1.16.0 293 | smmap==5.0.1 294 | sniffio==1.3.0 295 | snuggs==1.4.7 296 | soupsieve==2.5 297 | SQLAlchemy==1.4.51 298 | SQLAlchemy-JSONField==1.0.2 299 | SQLAlchemy-Utils==0.41.1 300 | sqlparse==0.4.4 301 | stack-data==0.6.3 302 | starlette==0.35.1 303 | statsmodels==0.14.1 304 | streamlit==1.31.1 305 | sympy==1.12 306 | tabulate==0.9.0 307 | tenacity==8.2.3 308 | termcolor==2.4.0 309 | terminado==0.18.0 310 | text-unidecode==1.3 311 | threadpoolctl==3.2.0 312 | time-machine==2.13.0 313 | tinycss2==1.2.1 314 | tldextract==5.1.1 315 | tokenizers==0.15.2 316 | toml==0.10.2 317 | toolz==0.12.1 318 | torch==2.2.1 319 | tornado==6.4 320 | tqdm==4.66.1 321 | traitlets==5.14.1 322 | transformers==4.38.1 323 | Twisted==22.10.0 324 | typer==0.9.0 325 | types-python-dateutil==2.8.19.20240106 326 | typing-inspect==0.9.0 327 | typing_extensions==4.8.0 328 | tzdata==2023.4 329 | tzlocal==5.2 330 | uberegg==0.1.1 331 | uc-micro-py==1.0.3 332 | unicodecsv==0.14.1 333 | universal_pathlib==0.2.0 334 | uri-template==1.3.0 335 | urllib3==2.1.0 336 | uvicorn==0.27.0.post1 337 | validators==0.22.0 338 
| w3lib==2.1.2 339 | watchdog==3.0.0 340 | wcwidth==0.2.13 341 | webcolors==1.13 342 | webencodings==0.5.1 343 | websocket-client==1.7.0 344 | Werkzeug==2.2.3 345 | widgetsnbextension==4.0.9 346 | wordcloud==1.9.3 347 | wrapt==1.16.0 348 | WTForms==3.1.2 349 | xgboost==2.0.3 350 | xxhash==3.4.1 351 | xyzservices==2023.10.1 352 | yarl==1.9.4 353 | zipp==3.17.0 354 | zope.interface==6.1 355 | zstandard==0.22.0 356 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/train_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime, timedelta 3 | 4 | from pymongo import MongoClient 5 | from google.cloud import storage 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | 9 | from sklearn.metrics import mean_squared_error 10 | from mlflow.data.pandas_dataset import PandasDataset 11 | from airflow import DAG 12 | from airflow.operators.python_operator import PythonOperator 13 | from airflow.operators.dummy_operator import DummyOperator 14 | 15 | from rightmove.data_processing.data_processor import DataPreprocessor 16 | 17 | from sklearn.ensemble import RandomForestRegressor 18 | 19 | from dotenv import load_dotenv 20 | 21 | load_dotenv("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/.env") 22 | 23 | import mlflow 24 | 25 | import logging 26 | 27 | logging.basicConfig(level=logging.INFO) 28 | 29 | MONGO_URI = os.environ.get("MONGO_URI") 30 | 31 | mlflow.set_tracking_uri( 32 | "postgresql+psycopg2://postgres:postgres@realestate-database.czkkjkojmucd.eu-west-2.rds.amazonaws.com:5432/mlflow" 33 | ) 34 | experiment_name = "rightmove-prediction" 35 | mlflow.set_experiment(experiment_name) 36 | 37 | client = storage.Client() 38 | bucket = client.get_bucket("rightmove-artifacts-ml") 39 | 40 | def load_data_from_mongo(collection_name="properties", fields=None): 41 | logging.info("Loading data from mongo") 42 | 43 | client = MongoClient(MONGO_URI) # Hosted with Docker 44 | 45 | db = client["rightmove"] 46 | 47 | collection = db[collection_name] 48 | 49 | query = {} 50 | 51 | data = collection.find(query, fields) 52 | 53 | df = pd.DataFrame(list(data)) 54 | 55 | if len(df) == 0: 56 | raise ValueError(f"No data found in collection {collection_name}") 57 | else: 58 | logging.info(f"Data loaded from collection {collection_name}") 59 | 60 | return df 61 | 62 | def generate_foldername(): 63 | now = datetime.now() 64 | return now.strftime("%Y-%m-%d-%H-%M-%S") 65 | 66 | 67 | def load_df_to_gcs(df, dest_path): 68 | blob = bucket.blob(dest_path) 69 | try: 70 | blob.upload_from_string(df.to_csv(), "text/csv") 71 | logging.info(f"Data uploaded to {dest_path}") 72 | return True 73 | except Exception as e: 74 | print(e) 75 | 76 | 77 | def preprocess_data(property_df, walkscore_df): 78 | preprocessor = DataPreprocessor(with_text=False, with_binary=False) 79 | 80 | property_df = preprocessor.preprocess_properties(property_df) 81 | walk_df = preprocessor.preprocess_walk_score(walkscore_df) 82 | 83 | df = property_df.merge(walk_df, on="id", how="left") 84 | 85 | logging.info("Data preprocessed") 86 | 87 | return df 88 | 89 | 90 | def load_data_from_gcs(source_url): 91 | logging.info(f"Loading {source_url} from GCS") 92 | df = pd.read_csv(source_url, index_col=0) 93 | return df 94 | 95 | 96 | def fetch_preprocess_and_upload_data(): 97 | property_df = load_data_from_mongo( 98 | collection_name="properties", 99 | 
fields={ 100 | "id": 1, 101 | "price.amount": 1, 102 | "price.frequency": 1, 103 | "firstVisibleDate": 1, 104 | "bedrooms": 1, 105 | "bathrooms": 1, 106 | "listingUpdate": 1, 107 | "location": 1, 108 | }, 109 | ) 110 | walkscore_df = load_data_from_mongo( 111 | collection_name="walk_scores", fields={"id": 1, "scores": 1} 112 | ) 113 | 114 | df = preprocess_data(property_df, walkscore_df) 115 | 116 | df = df[["bedrooms", "bathrooms", "price", "longitude", "latitude", "walk_score"]] 117 | 118 | folder_name = generate_foldername() 119 | parent_folder = "data" 120 | 121 | df["price_bin"] = pd.qcut(df["price"], q=10, duplicates="drop") 122 | 123 | # Create train test, validation split 124 | train_val, test_df = train_test_split( 125 | df, test_size=0.1, stratify=df["price_bin"], random_state=42 126 | ) 127 | train_df, val_df = train_test_split( 128 | train_val, test_size=0.2, stratify=train_val["price_bin"], random_state=42 129 | ) 130 | 131 | # Upload to GCS train, test, and validation data 132 | load_df_to_gcs(train_df, f"{parent_folder}/{folder_name}/train.csv") 133 | load_df_to_gcs(val_df, f"{parent_folder}/{folder_name}/val.csv") 134 | load_df_to_gcs(test_df, f"{parent_folder}/{folder_name}/test.csv") 135 | 136 | logging.info("Data uploaded to GCS") 137 | 138 | return folder_name 139 | 140 | 141 | def train_model(**kwargs): 142 | if "ti" in kwargs: 143 | ti = kwargs["ti"] 144 | folder_name = ti.xcom_pull(task_ids="load_data") 145 | else: 146 | folder_name = kwargs["folder_name"] 147 | 148 | logging.info(f"Training model with data from {folder_name}") 149 | 150 | train_dataset_source_url = ( 151 | f"gs://rightmove-artifacts-ml/data/{folder_name}/train.csv" 152 | ) 153 | val_dataset_source_url = f"gs://rightmove-artifacts-ml/data/{folder_name}/val.csv" 154 | test_dataset_source_url = f"gs://rightmove-artifacts-ml/data/{folder_name}/test.csv" 155 | 156 | train_df = load_data_from_gcs(train_dataset_source_url) 157 | val_df = load_data_from_gcs(val_dataset_source_url) 158 | test_df = load_data_from_gcs(test_dataset_source_url) 159 | 160 | train_df = train_df.dropna() 161 | val_df = val_df.dropna() 162 | 163 | features = ["bedrooms", "bathrooms", "longitude", "latitude", "walk_score"] 164 | target = "price" 165 | 166 | X_train = train_df[features] 167 | y_train = train_df[target] 168 | 169 | X_val = val_df[features] 170 | y_val = val_df[target] 171 | 172 | train_dataset: PandasDataset = mlflow.data.from_pandas( 173 | train_df, source=train_dataset_source_url 174 | ) 175 | val_dataset: PandasDataset = mlflow.data.from_pandas( 176 | val_df, source=val_dataset_source_url 177 | ) 178 | test_dataset: PandasDataset = mlflow.data.from_pandas( 179 | test_df, source=val_dataset_source_url 180 | ) 181 | 182 | with mlflow.start_run() as run: 183 | mlflow.set_tag("developer", "Alex") 184 | 185 | mlflow.log_param("Model type", "Random Forest") 186 | model = RandomForestRegressor() 187 | 188 | # Log the datasets 189 | mlflow.log_input(train_dataset, context="training") 190 | mlflow.log_input(val_dataset, context="validation") 191 | mlflow.log_input(test_dataset, context="test") 192 | 193 | logging.info("Fitting model") 194 | model.fit(X_train, y_train) 195 | 196 | y_pred = model.predict(X_val) 197 | 198 | rmse = mean_squared_error(y_val, y_pred, squared=False) 199 | 200 | r2 = model.score(X_val, y_val) 201 | 202 | mlflow.log_metric("rmse", rmse) 203 | mlflow.sklearn.log_model(model, "random-forest") 204 | 205 | logging.info("Model trained and logged to MLflow") 206 | 207 | return run.info.run_id 208 | 209 | 
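# --- Illustrative sketch, not part of this DAG ---
# Once register_model() below promotes a version of "Random Forest Walk Score" to
# Staging, a consumer such as the model-serving API can resolve that stage back to a
# concrete model through a stage-qualified registry URI. This only shows the registry
# round-trip; how the serving code in this repo actually loads the model may differ.
def load_staging_model():
    import mlflow.pyfunc

    # "models:/<registered-name>/<stage>" resolves to the latest version in that stage
    return mlflow.pyfunc.load_model("models:/Random Forest Walk Score/Staging")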
210 | def register_model(**kwargs): 211 | if "ti" in kwargs: 212 | ti = kwargs["ti"] 213 | run_id = ti.xcom_pull(task_ids="train_model") 214 | else: 215 | run_id = kwargs["run_id"] 216 | 217 | model_name = "Random Forest Walk Score" 218 | artifact_path = "random-forest" 219 | 220 | model_uri = f"runs:/{run_id}/{artifact_path}" 221 | 222 | model_details = mlflow.register_model(model_uri=model_uri, name=model_name) 223 | logging.info( 224 | f"Model registered with name: {model_name} and version: {model_details.version}" 225 | ) 226 | 227 | client = mlflow.tracking.MlflowClient() 228 | client.transition_model_version_stage( 229 | name=model_name, 230 | version=model_details.version, 231 | stage="Staging", 232 | archive_existing_versions=False, 233 | ) 234 | logging.info(f"Model version {model_details.version} transitioned to Staging") 235 | 236 | 237 | default_args = { 238 | "owner": "airflow_app", 239 | "depends_on_past": False, 240 | "email_on_failure": False, 241 | "email_on_retry": False, 242 | "retries": 1, 243 | "retry_delay": timedelta(minutes=5), 244 | } 245 | 246 | dag = DAG( 247 | "train_model", 248 | default_args=default_args, 249 | description="DAG for making scraping rightmove", 250 | schedule_interval=timedelta(days=1), 251 | start_date=datetime(2023, 1, 1), 252 | catchup=False, 253 | max_active_runs=1, 254 | ) 255 | with dag: 256 | load_and_preprocess_data_task = PythonOperator( 257 | task_id="load_data", python_callable=fetch_preprocess_and_upload_data 258 | ) 259 | 260 | train_model_task = PythonOperator( 261 | task_id="train_model", python_callable=train_model, provide_context=True 262 | ) 263 | 264 | register_model_task = PythonOperator( 265 | task_id="register_model", python_callable=register_model, provide_context=True 266 | ) 267 | 268 | load_and_preprocess_data_task >> train_model_task >> register_model_task 269 | 270 | if __name__ == "__main__": 271 | folder_name = fetch_preprocess_and_upload_data() 272 | print(folder_name) 273 | run_id = train_model(folder_name=folder_name) 274 | print(run_id) 275 | register_model(run_id=run_id) 276 | -------------------------------------------------------------------------------- /rightmove/data_ingestion/rightmove_scraper/rightmove_scraper/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | import os 6 | 7 | from scrapy import signals 8 | from scrapy import signals 9 | import datetime 10 | from psycopg2.extras import execute_values 11 | from scrapy.signalmanager import dispatcher 12 | import psycopg2 13 | 14 | # useful for handling different item types with a single interface 15 | from itemadapter import is_item, ItemAdapter 16 | 17 | import logging 18 | 19 | logging.basicConfig(level=logging.DEBUG) 20 | logger = logging.getLogger(__name__) 21 | 22 | POSTGRES_URI = os.environ.get("MONITORING_URI_PG") 23 | 24 | 25 | class RightmoveScraperSpiderMiddleware: 26 | # Not all methods need to be defined. If a method is not defined, 27 | # scrapy acts as if the spider middleware does not modify the 28 | # passed objects. 29 | 30 | @classmethod 31 | def from_crawler(cls, crawler): 32 | # This method is used by Scrapy to create your spiders. 
33 | s = cls() 34 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 35 | crawler.signals.connect(s.spider_closed, signal=signals.spider_closed) 36 | return s 37 | 38 | def process_spider_input(self, response, spider): 39 | # Called for each response that goes through the spider 40 | # middleware and into the spider. 41 | 42 | # Should return None or raise an exception. 43 | return None 44 | 45 | def process_spider_output(self, response, result, spider): 46 | # Called with the results returned from the Spider, after 47 | # it has processed the response. 48 | 49 | # Must return an iterable of Request, or item objects. 50 | for i in result: 51 | yield i 52 | 53 | def process_spider_exception(self, response, exception, spider): 54 | # Called when a spider or process_spider_input() method 55 | # (from other spider middleware) raises an exception. 56 | 57 | # Should return either None or an iterable of Request or item objects. 58 | pass 59 | 60 | def process_start_requests(self, start_requests, spider): 61 | # Called with the start requests of the spider, and works 62 | # similarly to the process_spider_output() method, except 63 | # that it doesn’t have a response associated. 64 | 65 | # Must return only requests (not items). 66 | for r in start_requests: 67 | yield r 68 | 69 | def spider_opened(self, spider): 70 | spider.logger.info("Spider opened: %s" % spider.name) 71 | 72 | def spider_closed(self, spider): 73 | # Retrieve stats 74 | stats = spider.crawler.stats.get_stats() 75 | 76 | # Call the method to save stats to PostgreSQL 77 | self.save_stats_to_postgres(stats) 78 | 79 | def save_stats_to_postgres(self, stats): 80 | # Setup database connection 81 | logger.info(f"Logging stats to Postgres: {stats}") 82 | 83 | start_time = stats.get("start_time") 84 | finish_time = stats.get("finish_time") 85 | elapsed_time_seconds = stats.get("elapsed_time_seconds") 86 | item_scraped_count = stats.get("item_scraped_count", 0) 87 | finish_reason = stats.get("finish_reason") 88 | log_count_debug = stats.get("log_count/DEBUG", 0) 89 | log_count_info = stats.get("log_count/INFO", 0) 90 | log_count_error = stats.get("log_count/ERROR", 0) 91 | mem_usage_startup = stats.get("memusage/startup") 92 | mem_usage_max = stats.get("memusage/max") 93 | scheduler_enqueued_memory = stats.get("scheduler/enqueued/memory") 94 | downloader_request_count = stats.get("downloader/request_count") 95 | downloader_reponse_count = stats.get("downloader/response_count") 96 | response_received_count = stats.get("response_received_count") 97 | downloader_request_method_count_get = stats.get( 98 | "downloader/request_method_count/GET" 99 | ) 100 | downloader_request_bytes = stats.get("downloader/request_bytes") 101 | 102 | logger.info("Saving stats to PostgreSQL") 103 | logger.info(f"start_time: {start_time}") 104 | logger.info(f"finish_time: {finish_time}") 105 | logger.info(f"elapsed_time_seconds: {elapsed_time_seconds}") 106 | logger.info(f"item_scraped_count: {item_scraped_count}") 107 | logger.info(f"finish_reason: {finish_reason}") 108 | logger.info(f"log_count_debug: {log_count_debug}") 109 | logger.info(f"log_count_info: {log_count_info}") 110 | logger.info(f"log_count_error: {log_count_error}") 111 | logger.info(f"mem_usage_startup: {mem_usage_startup}") 112 | logger.info(f"mem_usage_max: {mem_usage_max}") 113 | logger.info(f"scheduler_enqueued_memory: {scheduler_enqueued_memory}") 114 | logger.info(f"downloader_request_count: {downloader_request_count}") 115 | logger.info(f"downloader_reponse_count: 
{downloader_reponse_count}") 116 | logger.info(f"response_received_count: {response_received_count}") 117 | logger.info( 118 | f"downloader_request_method_count_get: {downloader_request_method_count_get}" 119 | ) 120 | logger.info(f"downloader_request_bytes: {downloader_request_bytes}") 121 | 122 | insert_sql = """ 123 | INSERT INTO scrapy_rightmove_rental_stats ( 124 | start_time, finish_time, elapsed_time_seconds, item_scraped_count, finish_reason, 125 | log_count_debug, log_count_info, log_count_error, mem_usage_startup, mem_usage_max, scheduler_enqueued_memory, 126 | downloader_request_count, downloader_response_count, response_received_count, 127 | downloader_request_method_count_get, downloader_request_bytes 128 | ) VALUES %s; 129 | """ 130 | 131 | # Data tuple to insert 132 | data = ( 133 | stats.get("start_time"), 134 | stats.get("finish_time"), 135 | stats.get("elapsed_time_seconds"), 136 | stats.get("item_scraped_count", 0), 137 | stats.get("finish_reason"), 138 | stats.get("log_count/DEBUG", 0), 139 | stats.get("log_count/INFO", 0), 140 | stats.get("log_count/ERROR", 0), 141 | stats.get("memusage/startup"), 142 | stats.get("memusage/max"), 143 | stats.get("scheduler/enqueued/memory"), 144 | stats.get("downloader/request_count"), 145 | stats.get("downloader/response_count"), 146 | stats.get("response_received_count"), 147 | stats.get("downloader/request_method_count/GET"), 148 | stats.get("downloader/request_bytes"), 149 | ) 150 | cur = None 151 | conn = None 152 | try: 153 | # Connect to your database 154 | conn = psycopg2.connect(POSTGRES_URI) 155 | cur = conn.cursor() 156 | 157 | # Execute the insert statement 158 | execute_values(cur, insert_sql, [data]) 159 | 160 | # Commit the transaction 161 | conn.commit() 162 | 163 | logger.info("Stats successfully saved to PostgreSQL") 164 | except Exception as e: 165 | logger.error(f"An error occurred: {e}") 166 | finally: 167 | if cur is not None: 168 | cur.close() 169 | 170 | if conn is not None: 171 | conn.close() 172 | 173 | 174 | class RightmoveScraperDownloaderMiddleware: 175 | # Not all methods need to be defined. If a method is not defined, 176 | # scrapy acts as if the downloader middleware does not modify the 177 | # passed objects. 178 | 179 | @classmethod 180 | def from_crawler(cls, crawler): 181 | # This method is used by Scrapy to create your spiders. 182 | s = cls() 183 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 184 | return s 185 | 186 | def process_request(self, request, spider): 187 | # Called for each request that goes through the downloader 188 | # middleware. 189 | 190 | # Must either: 191 | # - return None: continue processing this request 192 | # - or return a Response object 193 | # - or return a Request object 194 | # - or raise IgnoreRequest: process_exception() methods of 195 | # installed downloader middleware will be called 196 | return None 197 | 198 | def process_response(self, request, response, spider): 199 | # Called with the response returned from the downloader. 200 | 201 | # Must either; 202 | # - return a Response object 203 | # - return a Request object 204 | # - or raise IgnoreRequest 205 | return response 206 | 207 | def process_exception(self, request, exception, spider): 208 | # Called when a download handler or a process_request() 209 | # (from other downloader middleware) raises an exception. 
210 | 211 | # Must either: 212 | # - return None: continue processing this exception 213 | # - return a Response object: stops process_exception() chain 214 | # - return a Request object: stops process_exception() chain 215 | pass 216 | 217 | def spider_opened(self, spider): 218 | spider.logger.info("Spider opened: %s" % spider.name) 219 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.8" 2 | 3 | x-airflow-common: 4 | &airflow-common 5 | build: 6 | context: ./rightmove/orchestration/airflow_app/ 7 | env_file: 8 | - .env 9 | environment: 10 | &airflow-common-env 11 | AIRFLOW__CORE__EXECUTOR: LocalExecutor 12 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 13 | # AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: ${POSTGRES_URI:-}/airflow 14 | AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 15 | # AIRFLOW__CORE__SQL_ALCHEMY_CONN: ${POSTGRES_URI:-}/airflow 16 | AIRFLOW__CORE__FERNET_KEY: '' 17 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 18 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 19 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 20 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 21 | AIRFLOW_UID: ${AIRFLOW_UID:-50000} 22 | GOOGLE_APPLICATION_CREDENTIALS: /opt/airflow/credentials/airflow-service-account.json 23 | GCP_GCS_BUCKET: ${GCP_GCS_BUCKET:-} 24 | 25 | volumes: 26 | - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags 27 | - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs 28 | - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins 29 | - /var/run/docker.sock:/var/run/docker.sock 30 | - /Users/alexander.girardet/Code/Personal/projects/rightmove_project/credentials:/opt/airflow/credentials 31 | user: "${AIRFLOW_UID:-50000}:0" 32 | depends_on: 33 | &airflow-common-depends-on 34 | postgres: 35 | condition: service_healthy 36 | networks: 37 | - backend 38 | 39 | services: 40 | 41 | scrapy_app: 42 | build: 43 | context: ./rightmove/data_ingestion/rightmove_scraper/ 44 | ports: 45 | - "6800:6800" 46 | networks: 47 | - backend 48 | env_file: 49 | - .env 50 | 51 | # streamlit_app: 52 | # build: 53 | # context: ./rightmove/dashboard/ 54 | # depends_on: 55 | # - mongodb 56 | # ports: 57 | # - "8501:8501" 58 | # environment: 59 | # GOOGLE_APPLICATION_CREDENTIALS: /tmp/keys/credentials/airflow-service-account.json 60 | # env_file: 61 | # - .env 62 | # volumes: 63 | # - /Users/alexander.girardet/Code/Personal/projects/rightmove_project/credentials:/tmp/keys/credentials 64 | # networks: 65 | # - frontend 66 | # - backend 67 | # 68 | fastapi_app: 69 | build: 70 | context: ./rightmove/backend/ 71 | ports: 72 | - "8000:8000" 73 | environment: 74 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/keys/credentials/airflow-service-account.json 75 | env_file: 76 | - .env 77 | volumes: 78 | - /Users/alexander.girardet/Code/Personal/projects/rightmove_project/credentials:/tmp/keys/credentials 79 | networks: 80 | - frontend 81 | - backend 82 | 83 | mlflow-server: 84 | build: 85 | context: ./rightmove/mlflow/ 86 | ports: 87 | - "5001:5001" 88 | environment: 89 | MLFLOW_BACKEND_STORE_URI: ${POSTGRES_URI:-}/mlflow 90 | MLFLOW_ARTIFACTS_DESTINATION: ${GCS_ARTIFACT_BUCKET:-} 91 | GOOGLE_APPLICATION_CREDENTIALS: /tmp/keys/credentials/airflow-service-account.json 92 | volumes: 93 | - 
/Users/alexander.girardet/Code/Personal/projects/rightmove_project/credentials:/tmp/keys/credentials 94 | networks: 95 | - backend 96 | 97 | postgres: 98 | image: postgres:13 99 | environment: 100 | POSTGRES_USER: airflow 101 | POSTGRES_PASSWORD: airflow 102 | POSTGRES_DB: airflow 103 | volumes: 104 | - postgres-db-volume:/var/lib/postgresql/data 105 | healthcheck: 106 | test: [ "CMD", "pg_isready", "-U", "airflow" ] 107 | interval: 5s 108 | retries: 5 109 | ports: 110 | - "5432:5432" 111 | restart: always 112 | networks: 113 | - backend 114 | 115 | grafana: 116 | image: grafana/grafana 117 | user: "472" 118 | ports: 119 | - "3000:3000" 120 | volumes: 121 | - ./rightmove/monitoring/config/grafana_datasources.yaml:/etc/grafana/provisioning/datasources/datasource.yaml:ro 122 | - ./rightmove/monitoring/config/grafana_dashboards.yaml:/etc/grafana/provisioning/dashboards/dashboards.yaml:ro 123 | - ./rightmove/monitoring/config/dashboards:/var/lib/grafana/dashboards:ro 124 | networks: 125 | - backend 126 | env_file: 127 | - .env 128 | restart: always 129 | 130 | airflow-webserver: 131 | <<: *airflow-common 132 | command: webserver 133 | ports: 134 | - "8080:8080" 135 | healthcheck: 136 | test: 137 | [ 138 | "CMD", 139 | "curl", 140 | "--fail", 141 | "http://localhost:8080/health" 142 | ] 143 | interval: 10s 144 | timeout: 10s 145 | retries: 5 146 | restart: always 147 | depends_on: 148 | <<: *airflow-common-depends-on 149 | airflow-init: 150 | condition: service_completed_successfully 151 | 152 | airflow-scheduler: 153 | <<: *airflow-common 154 | command: scheduler 155 | healthcheck: 156 | test: 157 | [ 158 | "CMD-SHELL", 159 | 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"' 160 | ] 161 | interval: 10s 162 | timeout: 10s 163 | retries: 5 164 | restart: always 165 | depends_on: 166 | <<: *airflow-common-depends-on 167 | airflow-init: 168 | condition: service_completed_successfully 169 | 170 | airflow-triggerer: 171 | <<: *airflow-common 172 | command: triggerer 173 | healthcheck: 174 | test: 175 | [ 176 | "CMD-SHELL", 177 | 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"' 178 | ] 179 | interval: 10s 180 | timeout: 10s 181 | retries: 5 182 | restart: always 183 | depends_on: 184 | <<: *airflow-common-depends-on 185 | airflow-init: 186 | condition: service_completed_successfully 187 | 188 | airflow-init: 189 | <<: *airflow-common 190 | entrypoint: /bin/bash 191 | # yamllint disable rule:line-length 192 | command: 193 | - -c 194 | - | 195 | function ver() { 196 | printf "%04d%04d%04d%04d" $${1//./ } 197 | } 198 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 199 | airflow_version_comparable=$$(ver $${airflow_version}) 200 | min_airflow_version=2.2.0 201 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 202 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 203 | echo 204 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 205 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 206 | echo 207 | exit 1 208 | fi 209 | if [[ -z "${AIRFLOW_UID}" ]]; then 210 | echo 211 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 212 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 213 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 
214 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 215 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 216 | echo 217 | fi 218 | one_meg=1048576 219 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 220 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 221 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 222 | warning_resources="false" 223 | if (( mem_available < 4000 )) ; then 224 | echo 225 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 226 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 227 | echo 228 | warning_resources="true" 229 | fi 230 | if (( cpus_available < 2 )); then 231 | echo 232 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 233 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 234 | echo 235 | warning_resources="true" 236 | fi 237 | if (( disk_available < one_meg * 10 )); then 238 | echo 239 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 240 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 241 | echo 242 | warning_resources="true" 243 | fi 244 | if [[ $${warning_resources} == "true" ]]; then 245 | echo 246 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 247 | echo "Please follow the instructions to increase amount of resources available:" 248 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 249 | echo 250 | fi 251 | mkdir -p /sources/logs /sources/dags /sources/plugins 252 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 253 | exec /entrypoint airflow version 254 | # yamllint enable rule:line-length 255 | environment: 256 | <<: *airflow-common-env 257 | _AIRFLOW_DB_UPGRADE: 'true' 258 | _AIRFLOW_WWW_USER_CREATE: 'true' 259 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 260 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 261 | _PIP_ADDITIONAL_REQUIREMENTS: '' 262 | user: "0:0" 263 | volumes: 264 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 265 | 266 | airflow-cli: 267 | <<: *airflow-common 268 | profiles: 269 | - debug 270 | environment: 271 | <<: *airflow-common-env 272 | CONNECTION_CHECK_MAX_COUNT: "0" 273 | command: 274 | - bash 275 | - -c 276 | - airflow 277 | networks: 278 | backend: 279 | frontend: 280 | 281 | volumes: 282 | mongodb_data: 283 | postgres-db-volume: 284 | my_db_volume: -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/data_processing/metric_extraction.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from datetime import datetime 3 | 4 | from evidently.report import Report 5 | from evidently.metric_preset import ( 6 | DataDriftPreset, 7 | TargetDriftPreset, 8 | RegressionPreset, 9 | DataQualityPreset, 10 | ) 11 | from evidently import ColumnMapping 12 | from evidently.metrics import * 13 | import logging 14 | 15 | logging.basicConfig(level=logging.INFO) 16 | 17 | import os 18 | 19 | from dotenv import load_dotenv 20 | 21 | load_dotenv("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/.env") 22 | 23 | PG_URI = os.environ.get("MONITORING_URI_PG") 24 | 25 | 26 | 
class MetricExtraction: 27 | def __init__(self): 28 | self.conn = None 29 | self.cur = None 30 | 31 | def connect_to_postgres(self): 32 | self.conn = psycopg2.connect(dsn=PG_URI) 33 | self.cur = self.conn.cursor() 34 | 35 | def close_connection(self): 36 | self.conn.close() 37 | 38 | def extract_data_quality(self, quality_report): 39 | # Initialize a dictionary to store the results 40 | summary_dict = { 41 | "walk_score": {}, 42 | "price": {}, 43 | "bedrooms": {}, 44 | "bathrooms": {}, 45 | } 46 | 47 | nans_by_columns = {} 48 | 49 | for metric in quality_report["metrics"]: 50 | if metric["metric"] == "DatasetSummaryMetric": 51 | nans_by_columns = metric["result"]["current"]["nans_by_columns"] 52 | continue 53 | 54 | if "column_name" in metric["result"]: 55 | column_name = metric["result"]["column_name"] 56 | 57 | if column_name in summary_dict: 58 | summary_dict[column_name]["reference_mean"] = metric["result"][ 59 | "reference_characteristics" 60 | ]["mean"] 61 | summary_dict[column_name]["current_mean"] = metric["result"][ 62 | "current_characteristics" 63 | ]["mean"] 64 | summary_dict[column_name]["current_count"] = metric["result"][ 65 | "current_characteristics" 66 | ]["count"] 67 | summary_dict[column_name]["current_nulls"] = nans_by_columns.get( 68 | column_name, 0 69 | ) 70 | 71 | return summary_dict 72 | 73 | def extract_drift(self, drift_report): 74 | share_of_drifted_columns = drift_report["metrics"][0]["result"][ 75 | "share_of_drifted_columns" 76 | ] 77 | dataset_drift_binary = drift_report["metrics"][0]["result"]["dataset_drift"] 78 | target_drift_score = drift_report["metrics"][1]["result"]["drift_by_columns"][ 79 | "target" 80 | ]["drift_score"] 81 | target_drift_detected = drift_report["metrics"][1]["result"][ 82 | "drift_by_columns" 83 | ]["target"]["drift_detected"] 84 | 85 | summary_dict = { 86 | "share_of_drifted_columns": share_of_drifted_columns, 87 | "dataset_drift_binary": dataset_drift_binary, 88 | "target_drift_score": target_drift_score, 89 | "target_drift_detected": target_drift_detected, 90 | } 91 | 92 | return summary_dict 93 | 94 | def extract_performance(self, performance_report): 95 | reference_r2 = performance_report["metrics"][0]["result"]["reference"][ 96 | "r2_score" 97 | ] 98 | reference_rmse = performance_report["metrics"][0]["result"]["reference"]["rmse"] 99 | reference_mean_error = performance_report["metrics"][0]["result"]["reference"][ 100 | "mean_error" 101 | ] 102 | reference_mean_abs_error = performance_report["metrics"][0]["result"][ 103 | "reference" 104 | ]["mean_abs_error"] 105 | 106 | current_r2 = performance_report["metrics"][0]["result"]["current"]["r2_score"] 107 | current_rmse = performance_report["metrics"][0]["result"]["current"]["rmse"] 108 | current_mean_error = performance_report["metrics"][0]["result"]["current"][ 109 | "mean_error" 110 | ] 111 | current_mean_abs_error = performance_report["metrics"][0]["result"]["current"][ 112 | "mean_abs_error" 113 | ] 114 | 115 | summary_dict = { 116 | "reference": { 117 | "r2": reference_r2, 118 | "rmse": reference_rmse, 119 | "mean_error": reference_mean_error, 120 | "mean_abs_error": reference_mean_abs_error, 121 | }, 122 | "current": { 123 | "r2": current_r2, 124 | "rmse": current_rmse, 125 | "mean_error": current_mean_error, 126 | "mean_abs_error": current_mean_abs_error, 127 | }, 128 | } 129 | 130 | return summary_dict 131 | 132 | def extract_prediction(self, prediction_report): 133 | prediction_drift_score = prediction_report["metrics"][0]["result"][ 134 | "drift_score" 135 | ] 136 | 
prediction_drift_detected = prediction_report["metrics"][0]["result"][ 137 | "drift_detected" 138 | ] 139 | 140 | summary_dict = { 141 | "prediction_drift_score": prediction_drift_score, 142 | "prediction_drift_detected": prediction_drift_detected, 143 | } 144 | 145 | return summary_dict 146 | 147 | def load_metrics_to_postgres( 148 | self, data_dict, metric_category, loading_timestamp=None 149 | ): 150 | insert_query = """ 151 | INSERT INTO model_metrics (metric_category, metric_name, metric_value, metric_status, created_at) 152 | VALUES (%s, %s, %s, %s, %s) 153 | """ 154 | 155 | if self.cur is None: 156 | self.connect_to_postgres() 157 | 158 | if loading_timestamp: 159 | current_timestamp = loading_timestamp 160 | else: 161 | current_timestamp = datetime.now() 162 | 163 | for key, value in data_dict.items(): 164 | if isinstance( 165 | value, dict 166 | ): # For nested dictionaries like in 'extract_means' 167 | for sub_key, sub_value in value.items(): 168 | # Determine if sub_value is a boolean and assign appropriately 169 | if isinstance(sub_value, bool): 170 | self.cur.execute( 171 | insert_query, 172 | ( 173 | metric_category, 174 | f"{key}_{sub_key}", 175 | None, 176 | sub_value, 177 | current_timestamp, 178 | ), 179 | ) 180 | else: 181 | self.cur.execute( 182 | insert_query, 183 | ( 184 | metric_category, 185 | f"{key}_{sub_key}", 186 | sub_value, 187 | None, 188 | current_timestamp, 189 | ), 190 | ) 191 | else: 192 | # Check if the value is boolean and assign to metric_status instead of metric_value 193 | if isinstance(value, bool): 194 | self.cur.execute( 195 | insert_query, 196 | (metric_category, key, None, value, current_timestamp), 197 | ) 198 | else: 199 | # Assuming all non-dict and non-boolean values should be treated as numeric 200 | self.cur.execute( 201 | insert_query, 202 | (metric_category, key, value, None, current_timestamp), 203 | ) 204 | 205 | self.conn.commit() 206 | 207 | def get_target_drift_metrics(self, current_data, reference_data): 208 | target_drift_report = Report(metrics=[ColumnDriftMetric("target")]) 209 | 210 | target_drift_report.run( 211 | reference_data=reference_data, current_data=current_data 212 | ) 213 | 214 | predict_drift_report_dict = target_drift_report.as_dict() 215 | 216 | prediction_data = self.extract_prediction(predict_drift_report_dict) 217 | return prediction_data 218 | 219 | def get_performance_metrics(self, current_data, reference_data): 220 | reg_performance_report = Report( 221 | metrics=[ 222 | RegressionQualityMetric(), 223 | ] 224 | ) 225 | 226 | reg_performance_report.run( 227 | reference_data=reference_data, current_data=current_data 228 | ) 229 | 230 | reg_performance_dict = reg_performance_report.as_dict() 231 | 232 | performance_data = self.extract_performance(reg_performance_dict) 233 | return performance_data 234 | 235 | def get_data_drift_metrics(self, current_data, reference_data): 236 | data_drift_report = Report( 237 | metrics=[ 238 | DataDriftPreset(), 239 | ] 240 | ) 241 | 242 | data_drift_report.run(reference_data=reference_data, current_data=current_data) 243 | 244 | data_drift_report_dict = data_drift_report.as_dict() 245 | 246 | drift_data = self.extract_drift(data_drift_report_dict) 247 | return drift_data 248 | 249 | def get_data_quality_metrics(self, current_data, reference_data): 250 | column_mapping = ColumnMapping() 251 | 252 | current_data = current_data[["bedrooms", "bathrooms", "walk_score", "target"]] 253 | reference_data = reference_data[ 254 | ["bedrooms", "bathrooms", "walk_score", "target"] 255 | ] 
256 | 257 | numerical_features = ["bedrooms", "bathrooms", "walk_score"] 258 | 259 | column_mapping.numerical_features = numerical_features 260 | column_mapping.target = "target" 261 | 262 | data_quality_report = Report(metrics=[DataQualityPreset()]) 263 | 264 | data_quality_report.run( 265 | current_data=current_data, reference_data=reference_data 266 | ) 267 | 268 | data_quality_report_dict = data_quality_report.as_dict() 269 | 270 | quality_data = self.extract_data_quality(data_quality_report_dict) 271 | return quality_data 272 | -------------------------------------------------------------------------------- /rightmove/orchestration/airflow_app/dags/rightmove/ml_monitoring.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from airflow.operators.python_operator import PythonOperator 3 | from airflow.operators.dummy_operator import DummyOperator 4 | 5 | from rightmove.data_processing.data_processor import DataPreprocessor 6 | from rightmove.data_processing.metric_extraction import MetricExtraction 7 | 8 | import re 9 | 10 | 11 | from pymongo import MongoClient 12 | import pandas as pd 13 | from datetime import datetime, timedelta 14 | from dotenv import load_dotenv 15 | from google.cloud import storage 16 | import os 17 | import random 18 | import requests 19 | import logging 20 | 21 | import mlflow 22 | 23 | client = storage.Client() 24 | bucket = client.get_bucket("rightmove-artifacts-ml") 25 | 26 | MONITORING_URI_PG = os.environ.get("MONITORING_URI_PG") 27 | 28 | mlflow.set_tracking_uri(MONITORING_URI_PG) 29 | 30 | experiment_name = "rightmove-prediction" 31 | mlflow.set_experiment(experiment_name) 32 | 33 | ML_SERVING_URL = "http://fastapi_app:8000/batch-predict" 34 | 35 | logging.basicConfig(level=logging.INFO) 36 | 37 | # load_dotenv("/Users/alexander.girardet/Code/Personal/projects/rightmove_project/.env") 38 | MONGO_URI = os.environ.get("MONGO_URI") 39 | 40 | default_args = { 41 | "owner": "airflow_app", 42 | "depends_on_past": False, 43 | "email_on_failure": False, 44 | "email_on_retry": False, 45 | "retries": 1, 46 | "retry_delay": timedelta(minutes=5), 47 | } 48 | 49 | def modify_uri_to_test(uri: str) -> str: 50 | parts = uri.split('/') 51 | filename = parts[-1] 52 | new_filename = re.sub(r'(train|val|test)\.csv', 'test.csv', filename) 53 | parts[-1] = new_filename 54 | new_uri = '/'.join(parts) 55 | return new_uri 56 | 57 | 58 | 59 | def fetch_reference_df(): 60 | response = requests.get("http://fastapi_app:8000/latest-dataset") 61 | latest_uri = response.json().get("uri") 62 | 63 | test_uri = modify_uri_to_test(latest_uri) 64 | 65 | 66 | reference_data = pd.read_csv( 67 | test_uri, index_col=0 68 | ) 69 | return reference_data 70 | 71 | 72 | def preprocess_data(property_df, walkscore_df): 73 | preprocessor = DataPreprocessor(with_text=False, with_binary=False) 74 | 75 | property_df = preprocessor.preprocess_properties(property_df) 76 | walk_df = preprocessor.preprocess_walk_score(walkscore_df) 77 | 78 | df = property_df.merge(walk_df, on="id", how="left") 79 | 80 | logging.info("Data preprocessed") 81 | 82 | return df 83 | 84 | 85 | def load_data_from_mongo(collection_name, fields, timestamp_field): 86 | client = MongoClient(MONGO_URI) 87 | 88 | db = client["rightmove"] 89 | 90 | collection = db[collection_name] 91 | 92 | cutoff_time = datetime.now() - timedelta(hours=12) 93 | cutoff_unix = cutoff_time.timestamp() 94 | 95 | query = {timestamp_field: {"$gt": cutoff_unix}} 96 | 97 | data = 
collection.find(query, fields) 98 | 99 | return pd.DataFrame(list(data)) 100 | 101 | 102 | def fetch_latest_batch(): 103 | property_fields = { 104 | "id": 1, 105 | "bedrooms": 1, 106 | "bathrooms": 1, 107 | "location": 1, 108 | "price": 1, 109 | "listingUpdate": 1, 110 | "firstVisibleDate": 1, 111 | } 112 | property_df = load_data_from_mongo( 113 | "properties", property_fields, "extraction_timestamp" 114 | ) 115 | 116 | walk_score_fields = {"id": 1, "scores": 1} 117 | walk_score_df = load_data_from_mongo( 118 | "walk_scores", walk_score_fields, "processing_timestamp" 119 | ) 120 | 121 | df = preprocess_data(property_df, walk_score_df) 122 | 123 | df = df[["bedrooms", "bathrooms", "price", "longitude", "latitude", "walk_score"]] 124 | 125 | return df 126 | 127 | 128 | def load_predictions_from_gcs(folder_name): 129 | current_data = pd.read_csv( 130 | f"gs://rightmove-artifacts-ml/predictions/{folder_name}/current.csv", 131 | index_col=0, 132 | ) 133 | reference_data = pd.read_csv( 134 | f"gs://rightmove-artifacts-ml/predictions/{folder_name}/reference.csv", 135 | index_col=0, 136 | ) 137 | 138 | return current_data, reference_data 139 | 140 | 141 | def monitor_datasets(**kwargs): 142 | synthetic_data = False 143 | 144 | if "ti" in kwargs: 145 | ti = kwargs["ti"] 146 | folder_name = ti.xcom_pull(task_ids="load_predictions_to_gcs") 147 | else: 148 | folder_name = kwargs.get("folder_name") 149 | synthetic_data = kwargs.get("synthetic_data") 150 | 151 | current_data, reference_data = load_predictions_from_gcs(folder_name) 152 | 153 | current_data = current_data[['bedrooms', 'bathrooms', 'longitude', 'latitude', 'walk_score', 'prediction', 'target']] 154 | reference_data = reference_data[['bedrooms', 'bathrooms', 'longitude', 'latitude', 'walk_score', 'prediction', 'target']] 155 | 156 | metric_extractor = MetricExtraction() 157 | 158 | metric_extractor.connect_to_postgres() 159 | 160 | performance_data = metric_extractor.get_performance_metrics( 161 | current_data, reference_data 162 | ) 163 | 164 | prediction_data = metric_extractor.get_target_drift_metrics( 165 | current_data, reference_data 166 | ) 167 | 168 | drift_data = metric_extractor.get_data_drift_metrics(current_data, reference_data) 169 | 170 | quality_data = metric_extractor.get_data_quality_metrics( 171 | current_data, reference_data 172 | ) 173 | 174 | if synthetic_data: 175 | fake_timestamp = datetime.now() - timedelta(days=random.randint(0, 30)) 176 | else: 177 | fake_timestamp = None 178 | 179 | metric_extractor.load_metrics_to_postgres( 180 | prediction_data, "prediction_drift", loading_timestamp=fake_timestamp 181 | ) 182 | metric_extractor.load_metrics_to_postgres( 183 | performance_data, "performance", loading_timestamp=fake_timestamp 184 | ) 185 | metric_extractor.load_metrics_to_postgres( 186 | drift_data, "drift", loading_timestamp=fake_timestamp 187 | ) 188 | metric_extractor.load_metrics_to_postgres( 189 | quality_data, "quality", loading_timestamp=fake_timestamp 190 | ) 191 | logging.info("Metrics loaded to Postgres") 192 | 193 | metric_extractor.close_connection() 194 | logging.info("Connection to Postgres closed") 195 | 196 | 197 | def predict_properties(properties_features): 198 | try: 199 | response = requests.post(ML_SERVING_URL, json=properties_features) 200 | if response.status_code != 200: 201 | raise ValueError("Request failed") 202 | else: 203 | predictions = response.json().get("predictions") 204 | return predictions 205 | except Exception as e: 206 | raise e 207 | 208 | 209 | def 
generate_predictions(current_data=None, reference_data=None): 210 | if current_data is None: 211 | raise ValueError("No current data set") 212 | 213 | # new_logged_model = 'runs:/5c5b195cf1b74219993b436489545b7a/random-forest' # Replace with latest model from API 214 | # new_logged_model = mlflow.pyfunc.load_model(new_logged_model) 215 | 216 | current_features = current_data[ 217 | ["bedrooms", "bathrooms", "longitude", "latitude", "walk_score"] 218 | ].to_dict("records") 219 | reference_features = reference_data[ 220 | ["bedrooms", "bathrooms", "longitude", "latitude", "walk_score"] 221 | ].to_dict("records") 222 | 223 | current_predictions = predict_properties(current_features) 224 | reference_predictions = predict_properties(reference_features) 225 | 226 | # reference_data['predictions'] = new_logged_model.predict(reference_data.drop(columns=['price'])) 227 | # 228 | # current_data['predictions'] = new_logged_model.predict(current_data.drop(columns=['price'])) 229 | 230 | current_data["predictions"] = current_predictions 231 | 232 | reference_data["predictions"] = reference_predictions 233 | 234 | logging.info("Predictions generated") 235 | 236 | return current_data, reference_data 237 | 238 | 239 | def load_df_to_gcs(df, dest_path): 240 | blob = bucket.blob(dest_path) 241 | try: 242 | blob.upload_from_string(df.to_csv(), "text/csv") 243 | logging.info(f"Data uploaded to {dest_path}") 244 | return True 245 | except Exception as e: 246 | print(e) 247 | 248 | 249 | def load_data_from_gcs(source_url): 250 | logging.info(f"Loading {source_url} from GCS") 251 | df = pd.read_csv(source_url, index_col=0) 252 | return df 253 | 254 | 255 | def generate_foldername(): 256 | now = datetime.now() 257 | return now.strftime("%Y-%m-%d-%H-%M-%S") 258 | 259 | 260 | def load_predictions_to_gcs(): 261 | logging.info("Fetching data") 262 | current_data = fetch_latest_batch() 263 | reference_data = fetch_reference_df() 264 | 265 | logging.info("Generating predictions") 266 | current_data, reference_data = generate_predictions( 267 | current_data=current_data, reference_data=reference_data 268 | ) 269 | 270 | current_data = current_data.rename( 271 | columns={"predictions": "prediction", "price": "target"} 272 | ) 273 | reference_data = reference_data.rename( 274 | columns={"predictions": "prediction", "price": "target"} 275 | ) 276 | 277 | folder_name = generate_foldername() 278 | parent_folder = "predictions" 279 | 280 | load_df_to_gcs(current_data, f"{parent_folder}/{folder_name}/current.csv") 281 | load_df_to_gcs(reference_data, f"{parent_folder}/{folder_name}/reference.csv") 282 | 283 | logging.info("Data loaded to GCS") 284 | 285 | return folder_name 286 | 287 | 288 | dag = DAG( 289 | "monitor_ml_performance_rightmove", 290 | default_args=default_args, 291 | description="DAG for monitoring ML performance for rightmove", 292 | schedule_interval=timedelta(days=1), 293 | start_date=datetime(2023, 1, 1), 294 | catchup=False, 295 | max_active_runs=1, 296 | ) 297 | 298 | start_task = DummyOperator(task_id="start", dag=dag) 299 | 300 | load_predictions_to_gcs_task = PythonOperator( 301 | task_id="load_predictions_to_gcs", python_callable=load_predictions_to_gcs, dag=dag 302 | ) 303 | 304 | monitor_datasets_task = PythonOperator( 305 | task_id="monitor_datasets", 306 | python_callable=monitor_datasets, 307 | provide_context=True, 308 | dag=dag, 309 | ) 310 | 311 | end_task = DummyOperator(task_id="end", dag=dag) 312 | 313 | start_task >> load_predictions_to_gcs_task >> monitor_datasets_task >> end_task 314 | 
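# Usage sketch: monitor_datasets() can be driven in two ways. Inside the DAG it
# pulls the GCS folder name from XCom (task_id "load_predictions_to_gcs"); for
# ad-hoc or backfill runs it also accepts its inputs directly as keyword
# arguments, for example:
#
#     folder_name = load_predictions_to_gcs()
#     monitor_datasets(folder_name=folder_name, synthetic_data=True)
#
# With synthetic_data=True each metric row is stamped with a randomised
# timestamp from the last 30 days, which allows the model_metrics table to be
# backfilled with historical-looking data.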
315 | if __name__ == "__main__": 316 | # folder_name = load_predictions_to_gcs() 317 | # monitor_datasets(folder_name=folder_name) 318 | 319 | response = requests.get("http://localhost:8000/latest-dataset") 320 | latest_uri = response.json().get("uri") 321 | 322 | test_uri = modify_uri_to_test(latest_uri) 323 | 324 | print(test_uri) 325 | # 326 | # import logging 327 | # import pandas as pd 328 | 329 | # def split_df_into_chunks(df, chunk_size=500): 330 | # """Yield successive chunks of rows from df.""" 331 | # for i in range(0, df.shape[0], chunk_size): 332 | # yield df.iloc[i:i + chunk_size] 333 | # 334 | # 335 | # def load_chunk_to_gcs(chunk, parent_folder, folder_name, base_filename, chunk_index): 336 | # """Load a single chunk of DataFrame to GCS, with a unique filename.""" 337 | # filename = f"{base_filename}_part{chunk_index}.csv" 338 | # path = f"{parent_folder}/{folder_name}/{filename}" 339 | # # This function should be defined to handle the actual loading process to GCS 340 | # load_df_to_gcs(chunk, path) 341 | # 342 | # 343 | # logging.info("Fetching data") 344 | # current_data = fetch_latest_batch() 345 | # reference_data = fetch_reference_df() 346 | # 347 | # logging.info("Generating predictions") 348 | # current_data, reference_data = generate_predictions(current_data=current_data, reference_data=reference_data) 349 | # 350 | # current_data = current_data.rename(columns={"predictions": "prediction", "price": "target"}) 351 | # reference_data = reference_data.rename(columns={"predictions": "prediction", "price": "target"}) 352 | # 353 | # folder_name = generate_foldername() 354 | # parent_folder = "predictions" 355 | # 356 | # # Split and load current_data 357 | # for index, chunk in enumerate(split_df_into_chunks(current_data)): 358 | # monitor_datasets(chunk, reference_data) 359 | # # load_chunk_to_gcs(chunk, parent_folder, folder_name, "current", index) 360 | 361 | logging.info("Data loaded to GCS") 362 | -------------------------------------------------------------------------------- /notebooks/resources/data/property.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": 142547498, 3 | "bedrooms": 4, 4 | "bathrooms": 5, 5 | "numberOfImages": 28, 6 | "numberOfFloorplans": 0, 7 | "numberOfVirtualTours": 0, 8 | "summary": "Stonehouse Lettings are delighted to offer the opportunity to Lease this impressive detached family home which is available on a part furnished basis. All rooms offer comfortable and modern living throughout. The welcoming central hallway allows access to most rooms within the property. The main feature is the two separate staircases leading to the first floor along with the high vaulted ceilings. The lounge is exceptionally spacious and overlooks the front and the rear of the property. The drawing room is generously proportioned and also overlooks the front. The dining kitchen has been fitted with a range of luxury base and wall units and comes complete with integrated appliances. It should be noted the family room is on semi open plan. Utility room and Cloakroom WC are also located on the ground floor. On the first floor there are 4 generously proportioned double bedrooms which all benefit from separate en suite facilities. The master bedroom is has been tastefully decorated and fitted with a range of wardrobes. A particular feature is the external balcony. Externally the garden grounds are enclosed and mainly laid to lawn. 
A large driveway leads to the detached double garage which is equipped with power and light. Gas central heating and double glazed windows. Early viewing is highly recommended.", 9 | "displayAddress": "Kepplestone Gardens, West End, Aberdeen, AB15", 10 | "countryCode": "GB", 11 | "location": { 12 | "latitude": 57.137373, 13 | "longitude": -2.14488 14 | }, 15 | "propertyImages": { 16 | "images": [ 17 | { 18 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_476x317.jpeg", 19 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_00_0000.jpeg", 20 | "caption": "Picture No. 36" 21 | }, 22 | { 23 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_01_0000_max_476x317.jpeg", 24 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_01_0000.jpeg", 25 | "caption": "Picture No. 07" 26 | }, 27 | { 28 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_02_0000_max_476x317.jpeg", 29 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_02_0000.jpeg", 30 | "caption": "Picture No. 12" 31 | }, 32 | { 33 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_03_0000_max_476x317.jpeg", 34 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_03_0000.jpeg", 35 | "caption": "Picture No. 13" 36 | }, 37 | { 38 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_04_0000_max_476x317.jpeg", 39 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_04_0000.jpeg", 40 | "caption": "Picture No. 10" 41 | }, 42 | { 43 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_05_0000_max_476x317.jpeg", 44 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_05_0000.jpeg", 45 | "caption": "Picture No. 11" 46 | }, 47 | { 48 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_06_0000_max_476x317.jpeg", 49 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_06_0000.jpeg", 50 | "caption": "Picture No. 08" 51 | }, 52 | { 53 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_07_0000_max_476x317.jpeg", 54 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_07_0000.jpeg", 55 | "caption": "Picture No. 09" 56 | }, 57 | { 58 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_08_0000_max_476x317.jpeg", 59 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_08_0000.jpeg", 60 | "caption": "Picture No. 24" 61 | }, 62 | { 63 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_09_0000_max_476x317.jpeg", 64 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_09_0000.jpeg", 65 | "caption": "Picture No. 25" 66 | }, 67 | { 68 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_10_0000_max_476x317.jpeg", 69 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_10_0000.jpeg", 70 | "caption": "Picture No. 26" 71 | }, 72 | { 73 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_11_0000_max_476x317.jpeg", 74 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_11_0000.jpeg", 75 | "caption": "Picture No. 
27" 76 | }, 77 | { 78 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_12_0000_max_476x317.jpeg", 79 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_12_0000.jpeg", 80 | "caption": "Picture No. 28" 81 | }, 82 | { 83 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_13_0000_max_476x317.jpeg", 84 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_13_0000.jpeg", 85 | "caption": "Picture No. 29" 86 | }, 87 | { 88 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_14_0000_max_476x317.jpeg", 89 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_14_0000.jpeg", 90 | "caption": "Picture No. 16" 91 | }, 92 | { 93 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_15_0000_max_476x317.jpeg", 94 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_15_0000.jpeg", 95 | "caption": "Picture No. 17" 96 | }, 97 | { 98 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_16_0000_max_476x317.jpeg", 99 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_16_0000.jpeg", 100 | "caption": "Picture No. 18" 101 | }, 102 | { 103 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_17_0000_max_476x317.jpeg", 104 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_17_0000.jpeg", 105 | "caption": "Picture No. 19" 106 | }, 107 | { 108 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_18_0000_max_476x317.jpeg", 109 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_18_0000.jpeg", 110 | "caption": "Picture No. 20" 111 | }, 112 | { 113 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_19_0000_max_476x317.jpeg", 114 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_19_0000.jpeg", 115 | "caption": "Picture No. 22" 116 | }, 117 | { 118 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_20_0000_max_476x317.jpeg", 119 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_20_0000.jpeg", 120 | "caption": "Picture No. 23" 121 | }, 122 | { 123 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_21_0000_max_476x317.jpeg", 124 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_21_0000.jpeg", 125 | "caption": "Picture No. 30" 126 | }, 127 | { 128 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_22_0000_max_476x317.jpeg", 129 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_22_0000.jpeg", 130 | "caption": "Picture No. 31" 131 | }, 132 | { 133 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_23_0000_max_476x317.jpeg", 134 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_23_0000.jpeg", 135 | "caption": "Picture No. 32" 136 | }, 137 | { 138 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_24_0000_max_476x317.jpeg", 139 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_24_0000.jpeg", 140 | "caption": "Picture No. 
15" 141 | }, 142 | { 143 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_25_0000_max_476x317.jpeg", 144 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_25_0000.jpeg", 145 | "caption": "Picture No. 14" 146 | }, 147 | { 148 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_26_0000_max_476x317.jpeg", 149 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_26_0000.jpeg", 150 | "caption": "Picture No. 34" 151 | }, 152 | { 153 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_27_0000_max_476x317.jpeg", 154 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_27_0000.jpeg", 155 | "caption": "Picture No. 35" 156 | } 157 | ], 158 | "mainImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_476x317.jpeg", 159 | "mainMapImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_296x197.jpeg" 160 | }, 161 | "propertySubType": "Detached", 162 | "listingUpdate": { 163 | "listingUpdateReason": "new", 164 | "listingUpdateDate": "2023-11-29T18:57:03Z" 165 | }, 166 | "premiumListing": false, 167 | "featuredProperty": true, 168 | "price": { 169 | "amount": 2915, 170 | "frequency": "monthly", 171 | "currencyCode": "GBP", 172 | "displayPrices": [ 173 | { 174 | "displayPrice": "\u00a32,915 pcm", 175 | "displayPriceQualifier": "" 176 | }, 177 | { 178 | "displayPrice": "\u00a3673 pw", 179 | "displayPriceQualifier": "" 180 | } 181 | ] 182 | }, 183 | "customer": { 184 | "branchId": 89488, 185 | "brandPlusLogoURI": "/brand/brand_rmchoice_logo_89714_0002.jpeg", 186 | "contactTelephone": "020 3840 3898", 187 | "branchDisplayName": "DJ Alexander, Aberdeen", 188 | "branchName": "Aberdeen", 189 | "brandTradingName": "DJ Alexander", 190 | "branchLandingPageUrl": "/estate-agents/agent/DJ-Alexander/Aberdeen-89488.html", 191 | "development": false, 192 | "showReducedProperties": true, 193 | "commercial": false, 194 | "showOnMap": true, 195 | "enhancedListing": false, 196 | "developmentContent": null, 197 | "buildToRent": false, 198 | "buildToRentBenefits": [], 199 | "brandPlusLogoUrl": "https://media.rightmove.co.uk:443/dir/brand/brand_rmchoice_logo_89714_0002_max_100x50.jpeg" 200 | }, 201 | "distance": null, 202 | "transactionType": "rent", 203 | "productLabel": { 204 | "productLabelText": "", 205 | "spotlightLabel": false 206 | }, 207 | "commercial": false, 208 | "development": false, 209 | "residential": true, 210 | "students": false, 211 | "auction": false, 212 | "feesApply": false, 213 | "feesApplyText": null, 214 | "displaySize": "", 215 | "showOnMap": true, 216 | "propertyUrl": "/properties/142547498#/?channel=RES_LET", 217 | "contactUrl": "/property-to-rent/contactBranch.html?propertyId=142547498", 218 | "staticMapUrl": null, 219 | "channel": "RENT", 220 | "firstVisibleDate": "2023-11-29T18:51:24Z", 221 | "keywords": [], 222 | "keywordMatchType": "no_keyword", 223 | "saved": false, 224 | "hidden": false, 225 | "onlineViewingsAvailable": false, 226 | "lozengeModel": { 227 | "matchingLozenges": [] 228 | }, 229 | "hasBrandPlus": true, 230 | "displayStatus": "", 231 | "enquiredTimestamp": null, 232 | "heading": "Featured Property", 233 | "isRecent": false, 234 | "enhancedListing": false, 235 | "formattedBranchName": " by DJ Alexander, Aberdeen", 236 | "formattedDistance": "", 237 | "propertyTypeFullDescription": "4 bedroom 
detached house", 238 | "addedOrReduced": "Added on 29/11/2023", 239 | "feature_list": [ 240 | "* Unfurnished", 241 | "* FOUR bedrooms", 242 | "* West End Location", 243 | "* Double Garage", 244 | "* Garden", 245 | "* Gas Central Heating", 246 | "* landlord reg: 255737/100/1558", 247 | "* Council Tax H" 248 | ] 249 | } -------------------------------------------------------------------------------- /notebooks/resources/data/.ipynb_checkpoints/property-checkpoint.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": 142547498, 3 | "bedrooms": 4, 4 | "bathrooms": 5, 5 | "numberOfImages": 28, 6 | "numberOfFloorplans": 0, 7 | "numberOfVirtualTours": 0, 8 | "summary": "Stonehouse Lettings are delighted to offer the opportunity to Lease this impressive detached family home which is available on a part furnished basis. All rooms offer comfortable and modern living throughout. The welcoming central hallway allows access to most rooms within the property. The main feature is the two separate staircases leading to the first floor along with the high vaulted ceilings. The lounge is exceptionally spacious and overlooks the front and the rear of the property. The drawing room is generously proportioned and also overlooks the front. The dining kitchen has been fitted with a range of luxury base and wall units and comes complete with integrated appliances. It should be noted the family room is on semi open plan. Utility room and Cloakroom WC are also located on the ground floor. On the first floor there are 4 generously proportioned double bedrooms which all benefit from separate en suite facilities. The master bedroom is has been tastefully decorated and fitted with a range of wardrobes. A particular feature is the external balcony. Externally the garden grounds are enclosed and mainly laid to lawn. A large driveway leads to the detached double garage which is equipped with power and light. Gas central heating and double glazed windows. Early viewing is highly recommended.", 9 | "displayAddress": "Kepplestone Gardens, West End, Aberdeen, AB15", 10 | "countryCode": "GB", 11 | "location": { 12 | "latitude": 57.137373, 13 | "longitude": -2.14488 14 | }, 15 | "propertyImages": { 16 | "images": [ 17 | { 18 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_476x317.jpeg", 19 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_00_0000.jpeg", 20 | "caption": "Picture No. 36" 21 | }, 22 | { 23 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_01_0000_max_476x317.jpeg", 24 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_01_0000.jpeg", 25 | "caption": "Picture No. 07" 26 | }, 27 | { 28 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_02_0000_max_476x317.jpeg", 29 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_02_0000.jpeg", 30 | "caption": "Picture No. 12" 31 | }, 32 | { 33 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_03_0000_max_476x317.jpeg", 34 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_03_0000.jpeg", 35 | "caption": "Picture No. 13" 36 | }, 37 | { 38 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_04_0000_max_476x317.jpeg", 39 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_04_0000.jpeg", 40 | "caption": "Picture No. 
10" 41 | }, 42 | { 43 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_05_0000_max_476x317.jpeg", 44 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_05_0000.jpeg", 45 | "caption": "Picture No. 11" 46 | }, 47 | { 48 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_06_0000_max_476x317.jpeg", 49 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_06_0000.jpeg", 50 | "caption": "Picture No. 08" 51 | }, 52 | { 53 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_07_0000_max_476x317.jpeg", 54 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_07_0000.jpeg", 55 | "caption": "Picture No. 09" 56 | }, 57 | { 58 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_08_0000_max_476x317.jpeg", 59 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_08_0000.jpeg", 60 | "caption": "Picture No. 24" 61 | }, 62 | { 63 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_09_0000_max_476x317.jpeg", 64 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_09_0000.jpeg", 65 | "caption": "Picture No. 25" 66 | }, 67 | { 68 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_10_0000_max_476x317.jpeg", 69 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_10_0000.jpeg", 70 | "caption": "Picture No. 26" 71 | }, 72 | { 73 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_11_0000_max_476x317.jpeg", 74 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_11_0000.jpeg", 75 | "caption": "Picture No. 27" 76 | }, 77 | { 78 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_12_0000_max_476x317.jpeg", 79 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_12_0000.jpeg", 80 | "caption": "Picture No. 28" 81 | }, 82 | { 83 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_13_0000_max_476x317.jpeg", 84 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_13_0000.jpeg", 85 | "caption": "Picture No. 29" 86 | }, 87 | { 88 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_14_0000_max_476x317.jpeg", 89 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_14_0000.jpeg", 90 | "caption": "Picture No. 16" 91 | }, 92 | { 93 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_15_0000_max_476x317.jpeg", 94 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_15_0000.jpeg", 95 | "caption": "Picture No. 17" 96 | }, 97 | { 98 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_16_0000_max_476x317.jpeg", 99 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_16_0000.jpeg", 100 | "caption": "Picture No. 18" 101 | }, 102 | { 103 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_17_0000_max_476x317.jpeg", 104 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_17_0000.jpeg", 105 | "caption": "Picture No. 
19" 106 | }, 107 | { 108 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_18_0000_max_476x317.jpeg", 109 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_18_0000.jpeg", 110 | "caption": "Picture No. 20" 111 | }, 112 | { 113 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_19_0000_max_476x317.jpeg", 114 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_19_0000.jpeg", 115 | "caption": "Picture No. 22" 116 | }, 117 | { 118 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_20_0000_max_476x317.jpeg", 119 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_20_0000.jpeg", 120 | "caption": "Picture No. 23" 121 | }, 122 | { 123 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_21_0000_max_476x317.jpeg", 124 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_21_0000.jpeg", 125 | "caption": "Picture No. 30" 126 | }, 127 | { 128 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_22_0000_max_476x317.jpeg", 129 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_22_0000.jpeg", 130 | "caption": "Picture No. 31" 131 | }, 132 | { 133 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_23_0000_max_476x317.jpeg", 134 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_23_0000.jpeg", 135 | "caption": "Picture No. 32" 136 | }, 137 | { 138 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_24_0000_max_476x317.jpeg", 139 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_24_0000.jpeg", 140 | "caption": "Picture No. 15" 141 | }, 142 | { 143 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_25_0000_max_476x317.jpeg", 144 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_25_0000.jpeg", 145 | "caption": "Picture No. 14" 146 | }, 147 | { 148 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_26_0000_max_476x317.jpeg", 149 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_26_0000.jpeg", 150 | "caption": "Picture No. 34" 151 | }, 152 | { 153 | "srcUrl": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_27_0000_max_476x317.jpeg", 154 | "url": "90k/89488/142547498/89488_SLA140261_L_IMG_27_0000.jpeg", 155 | "caption": "Picture No. 
35" 156 | } 157 | ], 158 | "mainImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_476x317.jpeg", 159 | "mainMapImageSrc": "https://media.rightmove.co.uk:443/dir/crop/10:9-16:9/90k/89488/142547498/89488_SLA140261_L_IMG_00_0000_max_296x197.jpeg" 160 | }, 161 | "propertySubType": "Detached", 162 | "listingUpdate": { 163 | "listingUpdateReason": "new", 164 | "listingUpdateDate": "2023-11-29T18:57:03Z" 165 | }, 166 | "premiumListing": false, 167 | "featuredProperty": true, 168 | "price": { 169 | "amount": 2915, 170 | "frequency": "monthly", 171 | "currencyCode": "GBP", 172 | "displayPrices": [ 173 | { 174 | "displayPrice": "\u00a32,915 pcm", 175 | "displayPriceQualifier": "" 176 | }, 177 | { 178 | "displayPrice": "\u00a3673 pw", 179 | "displayPriceQualifier": "" 180 | } 181 | ] 182 | }, 183 | "customer": { 184 | "branchId": 89488, 185 | "brandPlusLogoURI": "/brand/brand_rmchoice_logo_89714_0002.jpeg", 186 | "contactTelephone": "020 3840 3898", 187 | "branchDisplayName": "DJ Alexander, Aberdeen", 188 | "branchName": "Aberdeen", 189 | "brandTradingName": "DJ Alexander", 190 | "branchLandingPageUrl": "/estate-agents/agent/DJ-Alexander/Aberdeen-89488.html", 191 | "development": false, 192 | "showReducedProperties": true, 193 | "commercial": false, 194 | "showOnMap": true, 195 | "enhancedListing": false, 196 | "developmentContent": null, 197 | "buildToRent": false, 198 | "buildToRentBenefits": [], 199 | "brandPlusLogoUrl": "https://media.rightmove.co.uk:443/dir/brand/brand_rmchoice_logo_89714_0002_max_100x50.jpeg" 200 | }, 201 | "distance": null, 202 | "transactionType": "rent", 203 | "productLabel": { 204 | "productLabelText": "", 205 | "spotlightLabel": false 206 | }, 207 | "commercial": false, 208 | "development": false, 209 | "residential": true, 210 | "students": false, 211 | "auction": false, 212 | "feesApply": false, 213 | "feesApplyText": null, 214 | "displaySize": "", 215 | "showOnMap": true, 216 | "propertyUrl": "/properties/142547498#/?channel=RES_LET", 217 | "contactUrl": "/property-to-rent/contactBranch.html?propertyId=142547498", 218 | "staticMapUrl": null, 219 | "channel": "RENT", 220 | "firstVisibleDate": "2023-11-29T18:51:24Z", 221 | "keywords": [], 222 | "keywordMatchType": "no_keyword", 223 | "saved": false, 224 | "hidden": false, 225 | "onlineViewingsAvailable": false, 226 | "lozengeModel": { 227 | "matchingLozenges": [] 228 | }, 229 | "hasBrandPlus": true, 230 | "displayStatus": "", 231 | "enquiredTimestamp": null, 232 | "heading": "Featured Property", 233 | "isRecent": false, 234 | "enhancedListing": false, 235 | "formattedBranchName": " by DJ Alexander, Aberdeen", 236 | "formattedDistance": "", 237 | "propertyTypeFullDescription": "4 bedroom detached house", 238 | "addedOrReduced": "Added on 29/11/2023", 239 | "feature_list": [ 240 | "* Unfurnished", 241 | "* FOUR bedrooms", 242 | "* West End Location", 243 | "* Double Garage", 244 | "* Garden", 245 | "* Gas Central Heating", 246 | "* landlord reg: 255737/100/1558", 247 | "* Council Tax H" 248 | ] 249 | } --------------------------------------------------------------------------------