├── .gitignore
├── requirements.txt
├── README.md
├── yad2_parser.py
├── scraper.py
└── vehicle_analyzer.py
/.gitignore:
--------------------------------------------------------------------------------
1 | scraped_vehicles/*
2 | *.pyc
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.25.0
2 | beautifulsoup4>=4.9.0
3 | pandas>=1.3.0
4 | dash>=2.7.0  # app.run (used by the dashboard) was added in Dash 2.7
5 | plotly>=5.0.0
6 | numpy>=1.20.0
7 | scipy>=1.7.0
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Yad2 Vehicle Price Analyzer
2 |
3 | A tool for scraping vehicle listings from Yad2, the Israeli classifieds site, and visualizing price-versus-age trends in an interactive Dash dashboard.
4 |
5 | ## Installation
14 |
15 | ```bash
16 | # Clone repository
17 | git clone https://github.com/yourusername/yad2-vehicle-analyzer.git
18 | cd yad2-vehicle-analyzer
19 |
20 | # Install dependencies
21 | pip install -r requirements.txt
22 | ```
23 |
24 | ## Examples
25 |
26 | Basic usage:
27 | ```bash
28 | # Run with default settings (Toyota bZ4X)
29 | python vehicle_analyzer.py
30 | ```
31 |
32 | Scrape specific vehicle model:
33 | ```bash
34 | # Volkswagen ID.4
35 | python vehicle_analyzer.py --manufacturer 41 --model 11579
36 |
37 | # Hyundai Ioniq 5
38 | python vehicle_analyzer.py --manufacturer 21 --model 11239
39 |
40 | # Nissan Qashqai
41 | python vehicle_analyzer.py --manufacturer 32 --model 10449
42 | ```
43 |
44 | Use existing data:
45 | ```bash
46 | # Skip scraping
47 | python vehicle_analyzer.py --skip-scrape
48 | ```
49 |
50 | Change web server port:
51 | ```bash
52 | python vehicle_analyzer.py --port 8080
53 | ```
54 |
55 | Manufacturer and model IDs can be read from the query string of any Yad2 search URL:
56 | ```
57 | https://www.yad2.co.il/vehicles/cars?manufacturer=19&model=12894
58 | ```
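59 |
60 | For example, `manufacturer=19&model=12894` is the default Toyota bZ4X target, so the URL above corresponds to:
61 |
62 | ```bash
63 | python vehicle_analyzer.py --manufacturer 19 --model 12894
64 | ```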
59 |
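60 | The scraper and parser can also be run on their own:
61 |
62 | ```bash
63 | # Fetch listing pages into scraped_vehicles/
64 | python scraper.py
65 |
66 | # Rebuild the CSV summary from HTML files scraped today
67 | python yad2_parser.py
68 | ```
69 |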
--------------------------------------------------------------------------------
/yad2_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 | import csv
4 | from typing import List, Dict
5 | from datetime import datetime
6 | from bs4 import BeautifulSoup
7 | import os
8 | from pathlib import Path
9 |
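10 | # Date stamp (YY_MM_DD) used to select only HTML files scraped today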
10 | today = datetime.now().date().strftime("%y_%m_%d")
11 |
12 | def extract_json_from_html(html_content: str) -> Dict:
13 | """Extract JSON data from __NEXT_DATA__ script tag"""
14 | soup = BeautifulSoup(html_content, 'html.parser')
15 | script_tag = soup.find('script', id='__NEXT_DATA__')
16 |
17 | if script_tag is None:
18 | raise ValueError("Could not find __NEXT_DATA__ script tag in HTML")
19 |
20 | return json.loads(script_tag.string)
21 |
22 | def get_month_number(month_text: str) -> int:
23 | # Hebrew month names to numbers mapping
24 | month_mapping = {
25 | 'ינואר': 1, 'פברואר': 2, 'מרץ': 3, 'אפריל': 4,
26 | 'מאי': 5, 'יוני': 6, 'יולי': 7, 'אוגוסט': 8,
27 | 'ספטמבר': 9, 'אוקטובר': 10, 'נובמבר': 11, 'דצמבר': 12
28 | }
29 | return month_mapping.get(month_text, 1) # Default to 1 if month not found
30 |
31 | def format_date(date_str: str) -> str:
32 | # Parse ISO format and return YYYY-MM-DD
33 | return datetime.fromisoformat(date_str).strftime('%Y-%m-%d')
34 |
35 | def calculate_years_since_production(production_year: int, production_month: int) -> float:
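36 |     # 365.25 days per year averages out leap years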
36 | production_date = datetime(production_year, production_month, 1)
37 | current_date = datetime.now()
38 | years = (current_date - production_date).days / 365.25
39 | return years
40 |
41 | def process_vehicle_data(json_list: List[Dict], listing_type: str, output_file: str, mode: str = 'w') -> None:
42 | """Process vehicle data and write to CSV"""
43 | # Define the headers we want to extract
44 | headers = ['adNumber', 'price', 'city', 'adType', 'model', 'subModel',
45 | 'productionDate', 'km', 'hand', 'createdAt', 'updatedAt',
46 | 'rebouncedAt', 'listingType', 'number_of_years', 'km_per_year', 'description', 'link', 'make', 'hp']
47 |
48 | # Open the CSV file for writing
49 | with open(output_file, mode, newline='', encoding='utf-8') as csvfile:
50 | writer = csv.DictWriter(csvfile, fieldnames=headers)
51 | if mode == 'w': # Only write header if we're creating a new file
52 | writer.writeheader()
53 |
54 | # Process each JSON object
55 | for item in json_list:
56 | try:
57 | # Create date string in YYYY-MM-DD format for production date
58 | year = item['vehicleDates']['yearOfProduction']
59 |                 month = get_month_number((item['vehicleDates'].get('monthOfProduction') or {}).get('text', 'ינואר'))  # default to January when the month is absent
60 | production_date = f"{year}-{month:02d}-01" # Format: YYYY-MM-DD
61 |
62 | # Calculate years since production
63 | years_since_production = calculate_years_since_production(year, month)
64 |
65 | # Calculate km per year
66 | km = item['km']
67 | km_per_year = round(km / years_since_production if years_since_production > 0 else km, 2)
68 |
69 |                 hp_match = re.search(r'(\d+)\s*כ״ס', item['subModel']['text'])  # horsepower embedded in the sub-model text
70 |                 row = {
71 |                     'adNumber': item['adNumber'],
72 |                     'price': item['price'],
73 |                     'city': (item['address'].get('city') or {}).get('text', ''),
74 |                     'adType': item['adType'],
75 |                     'model': item['model']['text'],
76 |                     'subModel': item['subModel']['text'],
77 |                     'hp': int(hp_match.group(1)) if hp_match else 0,
77 | 'make': item['manufacturer']['text'],
78 | 'productionDate': production_date,
79 | 'km': item['km'],
80 | 'hand': item['hand']["id"],
81 | 'createdAt': format_date(item['dates']['createdAt']),
82 | 'updatedAt': format_date(item['dates']['updatedAt']),
83 | 'rebouncedAt': format_date(item['dates']['rebouncedAt']),
84 | 'listingType': listing_type,
85 | 'number_of_years': years_since_production,
86 | 'km_per_year': km_per_year,
87 | 'description': item["metaData"]["description"],
88 | 'link': f'https://www.yad2.co.il/vehicles/item/{item["token"]}',
89 | }
90 | writer.writerow(row)
91 |             except KeyError as e:
92 |                 print(f"Skipping item due to missing key: {e}")
93 |                 print(item)
94 |                 continue
95 | except Exception as e:
96 | print(f"Error processing item: {e}")
97 |
98 | def process_directory(directory_path: str) -> None:
99 | """Process all HTML files in a directory and combine the data"""
100 | # Get directory name for the output file
101 | dir_name = Path(directory_path).name
102 | output_file = f"{dir_name}_summary.csv"
103 | output_path = os.path.join(directory_path, output_file)
104 |
105 | # Process each HTML file in the directory
106 | for filename in os.listdir(directory_path):
107 | if filename.endswith('.html') and today in filename:
108 | file_path = os.path.join(directory_path, filename)
109 | try:
110 | with open(file_path, 'r', encoding='utf-8') as file:
111 | print(f"Processing {filename}...")
112 | html_content = file.read()
113 | data = extract_json_from_html(html_content)
114 | listings_data = data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']
115 |
116 |                     # Process each listing bucket returned in the page payload
117 |                     for bucket in ('commercial', 'private', 'solo', 'platinum'):
118 |                         bucket_list = listings_data.get(bucket, [])
119 |                         if bucket_list:
120 |                             mode = 'a' if os.path.exists(output_path) else 'w'
121 |                             process_vehicle_data(bucket_list, bucket, output_path, mode)
122 |                             print(f"Processed {len(bucket_list)} {bucket} listings")
142 |
143 | except Exception as e:
144 | print(f"Error processing {filename}: {e}")
145 |
146 | print(f"Output saved to: {output_path}")
147 |
148 | if __name__ == "__main__":
149 | directory_path = "scraped_vehicles"
150 | process_directory(directory_path)
151 |
152 |     # Optionally upload the summary to Google Drive. The upload_drive helper
153 |     # module is not part of this repository, so guard the import.
154 |     try:
155 |         import upload_drive
156 |         output_file = f"{Path(directory_path).name}_summary.csv"
157 |         output_path = os.path.join(directory_path, output_file)
158 |         upload_drive.upload_to_sheet(output_path)
159 |         os.unlink(output_path)
160 |     except ImportError:
161 |         print("upload_drive module not available; keeping the local CSV")
--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 | import os
4 | from datetime import datetime, timedelta
5 | from pathlib import Path
6 | import logging
7 | import yad2_parser
8 |
9 | class VehicleScraper:
10 | def __init__(self, output_dir, manufacturer=32, model=10449):
11 | """
12 | Initialize the scraper with output directory and vehicle parameters
13 |
14 | Args:
15 | output_dir (str): Directory to save the scraped files
16 | manufacturer (int): Manufacturer ID
17 | model (int): Model ID
18 | """
19 | self.output_dir = Path(output_dir)
20 | self.manufacturer = manufacturer
21 | self.model = model
22 | self.session = requests.Session()
23 |
24 |         # Browser-like request headers, copied from a real browser session
25 | self.headers = {
26 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
27 | 'Accept-Language': 'en-US,en;q=0.9,he;q=0.8',
28 | 'Cache-Control': 'max-age=0',
29 | 'Connection': 'keep-alive',
30 | 'DNT': '1',
31 | 'Referer': 'https://www.yad2.co.il/',
32 | 'Sec-Fetch-Dest': 'document',
33 | 'Sec-Fetch-Mode': 'navigate',
34 | 'Sec-Fetch-Site': 'same-origin',
35 | 'Sec-Fetch-User': '?1',
36 | 'Upgrade-Insecure-Requests': '1',
37 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
38 | 'sec-ch-ua': '"Chromium";v="131", "Not_A Brand";v="24"',
39 | 'sec-ch-ua-mobile': '?0',
40 | 'sec-ch-ua-platform': '"macOS"'
41 | }
42 |
43 | # Set up cookies
44 | self.cookies = {
45 | '__ssds': '3',
46 | 'y2018-2-cohort': '88',
47 | 'use_elastic_search': '1',
48 | 'abTestKey': '2',
49 | 'cohortGroup': 'D'
50 | # Note: Added only essential cookies. Add more if needed.
51 | }
52 |
53 | # Create output directory if it doesn't exist
54 | self.output_dir.mkdir(parents=True, exist_ok=True)
55 |
56 | # Setup logging
57 | logging.basicConfig(
58 | level=logging.INFO,
59 | format='%(asctime)s - %(levelname)s - %(message)s'
60 | )
61 | self.logger = logging.getLogger(__name__)
62 |
63 | def build_url(self, page_num):
64 | """Build the URL for a specific page number"""
65 | base_url = "https://www.yad2.co.il/vehicles/cars"
66 | params = {
67 | "manufacturer": self.manufacturer,
68 | "model": self.model,
69 |             # Optional filters, left commented out as examples:
70 |             # "carFamilyType": "5,10",
71 |             # "year": "2022--1",
72 |             # "price": "110000-190000",
73 |             # "km": "-1-60000",
74 |             "hand": "0-2",  # restrict results to first- or second-hand cars
75 |             # "imgOnly": "1",
76 | "page": page_num
77 | }
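78 |         # Example result: https://www.yad2.co.il/vehicles/cars?manufacturer=19&model=12894&hand=0-2&page=1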
78 | return f"{base_url}?{'&'.join(f'{k}={v}' for k, v in params.items())}"
79 |
80 |     def get_output_filename(self, page_num):
81 |         """Generate output filename based on date, manufacturer and model"""
82 |         today = datetime.now().date().strftime("%y_%m_%d")
83 |         return self.output_dir / f"{today}_manufacturer{self.manufacturer}_model{self.model}_page{page_num}.html"
84 |
85 | def should_skip_file(self, filepath):
86 | """Check if file exists and was modified in the last 24 hours"""
87 | if not filepath.exists():
88 | return False
89 |
90 | file_mtime = datetime.fromtimestamp(filepath.stat().st_mtime)
91 | return datetime.now() - file_mtime < timedelta(days=1)
92 |
93 | def fetch_page(self, page_num):
94 | """
95 | Fetch a single page and save it to file
96 |
97 | Args:
98 | page_num (int): Page number to fetch
99 |
100 |         Returns:
101 |             int | None: total number of result pages, or None if the request failed
102 |         """
103 | output_file = self.get_output_filename(page_num)
104 |
105 | if self.should_skip_file(output_file):
106 | self.logger.info(f"Skipping page {page_num} - recent file exists")
107 | with open(output_file, 'r', encoding='utf-8') as file:
108 | print(f"Processing {output_file}...")
109 | html_content = file.read()
110 | data = yad2_parser.extract_json_from_html(html_content)
111 | listings_data = data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']
112 | return listings_data["pagination"]["pages"]
113 |
114 | try:
115 | url = self.build_url(page_num)
116 | self.logger.info(f"Fetching page {page_num}")
117 |
118 | time.sleep(5) # Rate limiting
119 | response = self.session.get(
120 | url,
121 | headers=self.headers,
122 | cookies=self.cookies,
123 | allow_redirects=True
124 | )
125 | response.raise_for_status()
126 |
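127 |             # Sanity check: a full listings page is large and embeds __NEXT_DATA__ (likely guards against blocked or empty responses)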
127 | assert len(response.content) > 50000 and b'__NEXT_DATA__' in response.content, len(response.content)
128 |
129 | data = yad2_parser.extract_json_from_html(response.content.decode("utf-8"))
130 | listings_data = data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']
131 | with open(output_file, 'wb') as f:
132 | f.write(response.content)
133 |
134 | self.logger.info(f"Successfully saved page {page_num}")
135 | return listings_data["pagination"]["pages"]
136 |
137 | except requests.exceptions.RequestException as e:
138 | self.logger.error(f"Error fetching page {page_num}: {str(e)}")
139 | return
140 |
141 | def scrape_pages(self, max_page=100):
142 | """
143 | Fetch multiple pages with rate limiting
144 |
145 |         Args:
146 |             max_page (int): Maximum number of pages to fetch
147 |         """
148 | page = 1
149 | while True:
150 | pages = self.fetch_page(page)
151 |             print(f"Page {page}/{pages}")
152 | # Only wait between requests if we actually made a request
153 | if pages and page < pages and page < max_page:
154 | page += 1
155 | else:
156 | return
157 |
158 | def main():
159 | # Example usage
160 | output_dir = "scraped_vehicles" # Replace with your desired output directory
161 | # VehicleScraper(output_dir, manufacturer=32, model=1337).scrape_pages() # Nissan
162 | # return
163 | VehicleScraper(output_dir, manufacturer=19, model=12894).scrape_pages(max_page=20) # bz4x
164 | # VehicleScraper(output_dir, manufacturer=32, model=10449).scrape_pages(max_page=20) # Nissan
165 | # VehicleScraper(output_dir, manufacturer=21, model=10283).scrape_pages(max_page=1)
166 | # VehicleScraper(output_dir, manufacturer=41, model=11579).scrape_pages(max_page=5) # ID.4
167 | # VehicleScraper(output_dir, manufacturer=41, model=12928).scrape_pages(max_page=5) # ID.5
168 | # VehicleScraper(output_dir, manufacturer=40, model=10545).scrape_pages(max_page=5)
169 | # VehicleScraper(output_dir, manufacturer=21, model=11239).scrape_pages(max_page=10) # Ioniq 5
170 | # VehicleScraper(output_dir, manufacturer=92, model=12134).scrape_pages(max_page=10) # Cupra Formentor
171 | # VehicleScraper(output_dir, manufacturer=41, model=10574).scrape_pages(max_page=10) # Tiguan
172 | # VehicleScraper(output_dir, manufacturer=40, model=11568).scrape_pages(max_page=10) # Enyaq
173 | # manufacturer=19&model=10226&subModel=104254,104255,104253
174 | # VehicleScraper(output_dir, manufacturer=19, model=10226).scrape_pages(max_page=20)
179 |
180 | if __name__ == "__main__":
181 | main()
--------------------------------------------------------------------------------
/vehicle_analyzer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | from pathlib import Path
5 | import pandas as pd
6 | from datetime import datetime
7 |
8 | # Import the scraper modules
9 | from scraper import VehicleScraper
10 | import yad2_parser
11 |
12 | # For web visualization
13 | import dash
14 | from dash import dcc, html, Input, Output, State
15 | import plotly.express as px
16 | import plotly.graph_objects as go
17 | import numpy as np
19 |
20 | def parse_arguments():
21 | """Parse command line arguments"""
22 | parser = argparse.ArgumentParser(description='Vehicle Price Analyzer')
23 | parser.add_argument('--output-dir', type=str, default='scraped_vehicles',
24 | help='Directory to save scraped data')
25 | parser.add_argument('--manufacturer', type=int, default=19,
26 | help='Manufacturer ID to scrape')
27 | parser.add_argument('--model', type=int, default=12894,
28 | help='Model ID to scrape')
29 | parser.add_argument('--max-pages', type=int, default=25,
30 | help='Maximum number of pages to scrape')
31 | parser.add_argument('--skip-scrape', action='store_true',
32 | help='Skip scraping and use existing data')
33 | parser.add_argument('--port', type=int, default=8050,
34 | help='Port to run the web server on')
35 | return parser.parse_args()
36 |
37 | def scrape_data(output_dir, manufacturer, model, max_pages):
38 | """Run the scraper to collect vehicle data"""
39 | print(f"Scraping data for manufacturer={manufacturer}, model={model}...")
40 | scraper = VehicleScraper(output_dir, manufacturer, model)
41 | scraper.scrape_pages(max_page=max_pages)
42 |
43 | def process_data(output_dir):
44 | """Process the scraped HTML files into a CSV"""
45 | print("Processing scraped HTML files...")
46 | dir_name = Path(output_dir).name
47 | yad2_parser.process_directory(output_dir)
48 | output_file = f"{dir_name}_summary.csv"
49 | output_path = os.path.join(output_dir, output_file)
50 |
51 | # Check if the CSV file exists
52 | if not os.path.exists(output_path):
53 | print(f"Error: Could not find processed data at {output_path}")
54 | sys.exit(1)
55 |
56 | return output_path
57 |
58 | def load_data(csv_path):
59 | """Load and prepare the CSV data for visualization"""
60 | try:
61 | df = pd.read_csv(csv_path)
62 |
63 | # Filter out cars with no price or price = 0
64 | df = df[df['price'] > 0]
65 |
66 | # Convert date strings to datetime objects
67 | df['productionDate'] = pd.to_datetime(df['productionDate'])
68 |
69 | # Extract year from production date for easier filtering
70 | df['productionYear'] = df['productionDate'].dt.year
71 |
72 | return df
73 | except Exception as e:
74 | print(f"Error loading data: {str(e)}")
75 | sys.exit(1)
76 |
77 | def create_dashboard(df, port=8050):
78 | """Create and run an interactive Dash app for visualizing the data"""
79 | # Create a custom stylesheet
80 | external_stylesheets = [
81 | {
82 | 'href': 'https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap',
83 | 'rel': 'stylesheet'
84 | }
85 | ]
86 | # Create the app
87 | app = dash.Dash(
88 | __name__,
89 | title="Vehicle Price Analyzer",
90 | external_stylesheets=external_stylesheets,
91 | suppress_callback_exceptions=True # Needed for clientside callbacks
92 | )
93 |
94 | # Get unique values for filters
95 | km_ranges = [
96 | {'label': 'All', 'value': 'all'},
97 | {'label': '≤ 10,000 km/year', 'value': '0-10000'},
98 | {'label': '≤ 15,000 km/year', 'value': '0-15000'},
99 | {'label': '≤ 20,000 km/year', 'value': '0-20000'},
100 | {'label': '≤ 25,000 km/year', 'value': '0-25000'},
101 | {'label': '> 25,000 km/year', 'value': '25000-999999'}
102 | ]
103 |
104 | hands = [{'label': 'All Hands', 'value': 'all'}] + [
105 | {'label': f'Hand ≤ {h}', 'value': f'0-{h}'} for h in sorted(df['hand'].unique()) if h > 0
106 | ]
107 |
108 | sub_models = [{'label': 'All Sub-models', 'value': 'all'}] + [
109 | {'label': sm, 'value': sm} for sm in sorted(df['subModel'].unique())
110 | ]
111 |
112 | # Create model filter options
113 | models = [{'label': m, 'value': m} for m in sorted(df['model'].unique())]
114 |
115 | ad_types = [{'label': 'All', 'value': 'all'}] + [
116 | {'label': at, 'value': at} for at in sorted(df['listingType'].unique())
117 | ]
118 |
119 | # Define CSS styles
120 | styles = {
121 | 'container': {
122 | 'font-family': 'Roboto, sans-serif',
123 | 'max-width': '1200px',
124 | 'margin': '0 auto',
125 | 'padding': '20px',
126 | 'background-color': '#f9f9f9',
127 | 'border-radius': '8px',
128 | 'box-shadow': '0 4px 8px rgba(0,0,0,0.1)'
129 | },
130 | 'header': {
131 | 'background-color': '#2c3e50',
132 | 'color': 'white',
133 | 'padding': '15px 20px',
134 | 'margin-bottom': '20px',
135 | 'border-radius': '5px',
136 | 'text-align': 'center'
137 | },
138 | 'filter_container': {
139 | 'display': 'flex',
140 | 'flex-wrap': 'wrap',
141 | 'gap': '15px',
142 | 'background-color': 'white',
143 | 'padding': '15px',
144 | 'border-radius': '5px',
145 | 'box-shadow': '0 2px 4px rgba(0,0,0,0.05)',
146 | 'margin-bottom': '20px'
147 | },
148 | 'filter': {
149 | 'width': '23%',
150 | 'min-width': '200px',
151 | 'padding': '10px'
152 | },
153 | 'label': {
154 | 'font-weight': 'bold',
155 | 'margin-bottom': '5px',
156 | 'color': '#2c3e50'
157 | },
158 | 'graph': {
159 | 'background-color': 'white',
160 | 'padding': '15px',
161 | 'border-radius': '5px',
162 | 'box-shadow': '0 2px 4px rgba(0,0,0,0.05)',
163 | 'margin-bottom': '20px'
164 | },
165 | 'summary': {
166 | 'background-color': 'white',
167 | 'padding': '15px',
168 | 'border-radius': '5px',
169 | 'box-shadow': '0 2px 4px rgba(0,0,0,0.05)'
170 | },
171 | 'summary_header': {
172 | 'color': '#2c3e50',
173 | 'border-bottom': '2px solid #3498db',
174 | 'padding-bottom': '10px',
175 | 'margin-bottom': '15px'
176 | },
177 | 'button': {
178 | 'background-color': '#2c3e50',
179 | 'color': 'white',
180 | 'border': 'none',
181 | 'padding': '10px 20px',
182 | 'border-radius': '5px',
183 | 'cursor': 'pointer',
184 | 'font-weight': 'bold',
185 | 'margin-top': '10px',
186 | 'width': '100%'
187 | },
188 | 'clear_button': {
189 | 'background-color': '#e74c3c',
190 | 'color': 'white',
191 | 'border': 'none',
192 | 'padding': '10px 20px',
193 | 'border-radius': '5px',
194 | 'cursor': 'pointer',
195 | 'font-weight': 'bold',
196 | 'margin-top': '10px',
197 | 'width': '100%'
198 | },
199 | 'click_instruction': {
200 | 'text-align': 'center',
201 | 'font-style': 'italic',
202 | 'color': '#3498db',
203 | 'margin': '10px 0',
204 | 'padding': '8px',
205 | 'background-color': '#f0f7ff',
206 | 'border-radius': '5px',
207 | 'border-left': '3px solid #3498db'
208 | }
209 | }
210 |
211 | # Create the app layout
212 | app.layout = html.Div([
213 | # Header
214 | html.Div([
215 | html.H1("Vehicle Price Analysis Dashboard", style={'margin': '0'})
216 | ], style=styles['header']),
217 |
218 | # Filter section
219 | html.Div([
220 | html.Div([
221 | html.Label("Filter by km/year:", style=styles['label']),
222 | dcc.Dropdown(
223 | id='km-filter',
224 | options=km_ranges,
225 | value='all',
226 | clearable=False
227 | ),
228 | ], style=styles['filter']),
229 |
230 | html.Div([
231 | html.Label("Filter by owner hand:", style=styles['label']),
232 | dcc.Dropdown(
233 | id='hand-filter',
234 | options=hands,
235 | value='all',
236 | clearable=False
237 | ),
238 | ], style=styles['filter']),
239 |
240 | # New model multi-select dropdown
241 | html.Div([
242 | html.Label("Filter by model:", style=styles['label']),
243 | dcc.Dropdown(
244 | id='model-filter',
245 | options=models,
246 | value=[],
247 | multi=True,
248 | placeholder="Select model(s)"
249 | ),
250 | ], style=styles['filter']),
251 |
252 | html.Div([
253 | html.Label("Filter by listing type:", style=styles['label']),
254 | dcc.Dropdown(
255 | id='adtype-filter',
256 | options=ad_types,
257 | value='all',
258 | clearable=False
259 | ),
260 | ], style=styles['filter']),
261 |
262 | html.Div([
263 | html.Label("Filter by sub-model:", style=styles['label']),
264 | html.Div([
265 | dcc.Checklist(
266 | id='submodel-checklist',
267 | options=[], # Will be populated dynamically based on model selection
268 | value=[],
269 | labelStyle={'display': 'block', 'margin-bottom': '8px', 'cursor': 'pointer'},
270 | style={'max-height': '200px', 'overflow-y': 'auto', 'padding': '10px', 'background-color': '#f5f9ff', 'border-radius': '5px'}
271 | ),
272 | ]),
273 | html.Div([
274 | html.Button(
275 | 'Apply Filters',
276 | id='apply-submodel-button',
277 | style=styles['button']
278 | ),
279 | html.Button(
280 | 'Clear Selection',
281 | id='clear-submodel-button',
282 | style=styles['clear_button']
283 | ),
284 | ], style={'display': 'flex', 'gap': '10px'}),
285 | ], style={'width': '23%', 'min-width': '200px', 'padding': '10px', 'flex-grow': '1'}),
286 |
287 | ], style=styles['filter_container']),
288 |
289 | # Click instruction
290 | html.Div([
291 | html.P("👆 Click on any point in the graph to open the vehicle ad in a new tab")
292 | ], style=styles['click_instruction']),
293 |
294 | # Graph section
295 | html.Div([
296 | dcc.Graph(id='price-date-scatter')
297 | ], style=styles['graph']),
298 |
299 | # Summary section
300 | html.Div([
301 | html.H3("Data Summary", style=styles['summary_header']),
302 | html.Div(id='summary-stats')
303 | ], style=styles['summary']),
304 |
305 |         # Store for clicked links, used by the clientside click callback below
306 | dcc.Store(id='clicked-link', storage_type='memory'),
307 | ], style=styles['container'])
308 |
309 |     # Clientside callback: opens the clicked point's ad link in a new tab
310 | app.clientside_callback(
311 | """
312 | function(clickData) {
313 | console.log(clickData);
314 | if(clickData && clickData.points && clickData.points.length > 0) {
315 | const link = clickData.points[0].customdata[6];
316 | if(link && link.length > 0) {
317 | window.open(link, '_blank');
318 | }
319 | }
320 | return window.dash_clientside.no_update;
321 | }
322 | """,
323 | Output('clicked-link', 'data'),
324 | Input('price-date-scatter', 'clickData'),
325 | prevent_initial_call=True
326 | )
327 |
328 | # Callback to update submodel options based on selected models
329 | @app.callback(
330 | Output('submodel-checklist', 'options'),
331 | Input('model-filter', 'value'),
332 | )
333 | def update_submodel_options(selected_models):
334 | if not selected_models or len(selected_models) == 0:
335 | # If no models selected, show all submodels
336 | # For each submodel, add the model name in brackets
337 | submodel_options = []
338 | for sm in sorted(df['subModel'].unique()):
339 | # Find models for this submodel
340 | models_for_submodel = df[df['subModel'] == sm]['model'].unique()
341 | if len(models_for_submodel) == 1:
342 | # If there's only one model for this submodel
343 | label = f"[{models_for_submodel[0]}] {sm}"
344 | else:
345 | # If there are multiple models, show first one with "+"
346 | label = f"[{models_for_submodel[0]}+] {sm}"
347 | submodel_options.append({'label': label, 'value': sm})
348 | else:
349 | # Filter submodels based on selected models
350 | filtered_df = df[df['model'].isin(selected_models)]
351 | submodel_options = []
352 | for sm in sorted(filtered_df['subModel'].unique()):
353 | # Find models for this submodel (limited to selected models)
354 | models_for_submodel = filtered_df[filtered_df['subModel'] == sm]['model'].unique()
355 | if len(models_for_submodel) == 1:
356 | # If there's only one model for this submodel
357 | label = f" {sm} [{models_for_submodel[0]}]"
358 | else:
359 | # Join all models (should be less since we're filtering)
360 | models_str = '+'.join(models_for_submodel)
361 | label = f" {sm} [{models_str}]"
362 | submodel_options.append({'label': label, 'value': sm})
363 |
364 |         return sorted(submodel_options, key=lambda x: x['label'])
365 |
366 | # Callback to clear submodel selection
367 | @app.callback(
368 | Output('submodel-checklist', 'value'),
369 | Input('clear-submodel-button', 'n_clicks'),
370 | prevent_initial_call=True
371 | )
372 | def clear_submodel_selection(n_clicks):
373 | return []
374 |
375 | @app.callback(
376 | [Output('price-date-scatter', 'figure'),
377 | Output('summary-stats', 'children')],
378 | [Input('km-filter', 'value'),
379 | Input('hand-filter', 'value'),
380 | Input('model-filter', 'value'),
381 | Input('apply-submodel-button', 'n_clicks'),
382 | Input('adtype-filter', 'value')],
383 | [State('submodel-checklist', 'value')]
384 | )
385 | def update_graph(km_range, hand, models, submodel_btn_clicks, adtype, submodel_list):
386 | # Apply filters
387 | filtered_df = df.copy()
388 |
389 | if km_range != 'all':
390 | min_km, max_km = map(int, km_range.split('-'))
391 | filtered_df = filtered_df[filtered_df['km_per_year'] <= max_km]
392 | if min_km > 0: # For the "> 25,000" filter
393 | filtered_df = filtered_df[filtered_df['km_per_year'] > min_km]
394 |
395 | if hand != 'all':
396 | # Parse the hand range format (e.g., "0-2" means hand ≤ 2)
397 | min_hand, max_hand = map(int, hand.split('-'))
398 | filtered_df = filtered_df[filtered_df['hand'] <= max_hand]
399 |
400 | # Handle model multiselect filter
401 | if models and len(models) > 0:
402 | filtered_df = filtered_df[filtered_df['model'].isin(models)]
403 |
404 | # Handle checkbox list for submodels
405 | if submodel_list and len(submodel_list) > 0:
406 | # If checkboxes are selected, filter to only those submodels
407 | filtered_df = filtered_df[filtered_df['subModel'].isin(submodel_list)]
408 | # When no checkboxes are selected, show all submodels
409 |
410 | if adtype != 'all':
411 | filtered_df = filtered_df[filtered_df['listingType'] == adtype]
412 |
413 | # For car price analysis, we want newest cars on the left and oldest on the right
414 | # First, calculate "days since newest car" for each point
415 | newest_date = filtered_df['productionDate'].max()
416 | filtered_df['days_since_newest'] = (newest_date - filtered_df['productionDate']).dt.days
417 |
418 | # Calculate actual dates instead of days since newest
419 | today = pd.Timestamp.today().normalize() # Get today's date (without time)
420 | filtered_df['display_date'] = today - pd.to_timedelta(filtered_df['days_since_newest'], unit='D')
421 |
422 | # Create scatter plot with actual dates on x-axis
423 | fig = px.scatter(
424 | filtered_df,
425 | x='display_date',
426 | y='price',
427 | color='km_per_year',
428 |             # Marker size is set uniformly in update_traces below
430 | color_continuous_scale='viridis', # Smooth color gradient
431 | range_color=[0, filtered_df['km_per_year'].quantile(0.95)], # Cap color scale for better differentiation
432 | hover_data=['model', 'subModel', 'hand', 'km', 'city', 'productionDate', 'link'],
433 | labels={'display_date': 'Date',
434 | 'price': 'Price (₪)',
435 | 'km_per_year': 'Kilometers per Year'},
436 | title=f'Vehicle Prices by Age ({len(filtered_df)} vehicles)'
437 | )
438 |
439 | # Create custom data array for hover and click functionality
440 | custom_data = np.column_stack((
441 | filtered_df['model'],
442 | filtered_df['subModel'],
443 | filtered_df['hand'],
444 | filtered_df['km'],
445 | filtered_df['city'],
446 | filtered_df['productionDate'],
447 | filtered_df['link']
448 | ))
449 |
450 |         # Make points clickable through to their ad links, with improved styling
451 | fig.update_traces(
452 | marker=dict(
453 | size=8, # Larger points for easier clicking
454 | opacity=0.8,
455 | line=dict(width=1, color='DarkSlateGrey') # Add outline for better visibility
456 | ),
457 | customdata=custom_data,
458 |             hovertemplate='%{customdata[0]} %{customdata[1]}<br>' +
459 |                           'Price: ₪%{y:,.0f}<br>' +
460 |                           'Production Date: %{customdata[5]}<br>' +
461 |                           'Hand: %{customdata[2]}<br>' +
462 |                           'KM: %{customdata[3]:,.0f}<br>' +
463 |                           'City: %{customdata[4]}<br>' +
464 |                           '👆 Click to view ad'  # Clear instruction in hover
465 | )
466 |
467 |         # Configure click handling and polish the overall layout
468 | fig.update_layout(
469 | clickmode='event+select', # Enable clicking on points
470 | hoverdistance=100, # Increase hover detection distance
471 | hovermode='closest', # Show hover info for closest point
472 | # Improve interactivity
473 | dragmode='zoom',
474 | # Enhance appearance
475 | plot_bgcolor='rgba(240,240,240,0.2)',
476 | paper_bgcolor='rgba(0,0,0,0)',
477 | font=dict(family="Roboto, sans-serif"),
478 | xaxis=dict(
479 | title_font=dict(size=14),
480 | tickfont=dict(size=12),
481 | gridcolor='#eee',
482 |                 # Reverse the x-axis so newer production dates appear on the left
483 | autorange="reversed"
484 | ),
485 | yaxis=dict(
486 | title_font=dict(size=14),
487 | tickfont=dict(size=12),
488 | gridcolor='#eee'
489 | ),
490 | title=dict(
491 | font=dict(size=16)
492 | ),
493 | legend=dict(
494 | title_font=dict(size=13),
495 | font=dict(size=11)
496 | ),
497 | coloraxis_colorbar=dict(
498 | title="Km/Year",
499 | title_font=dict(size=13),
500 | tickfont=dict(size=11)
501 | ),
502 | margin=dict(l=40, r=40, t=60, b=40)
503 | )
504 |
505 | # Always add exponential trendline
506 | # For car price depreciation, we'll use days since newest car as x-axis
507 | if len(filtered_df) > 1:
508 | # Sort by days_since_newest for proper fitting
509 | sorted_df = filtered_df.sort_values('days_since_newest')
510 |
511 | x = sorted_df['days_since_newest'].values
512 | y = sorted_df['price'].values
513 |
514 | # Ensure we have numeric data for curve fitting
515 | valid_indices = ~np.isnan(x) & ~np.isnan(y)
516 | x = x[valid_indices]
517 | y = y[valid_indices]
518 |
519 | if len(x) > 1: # Need at least 2 points for curve fitting
520 | try:
521 | # For better exponential fit, try more robust approaches
522 | from scipy import optimize
523 |
524 | # For car price depreciation, an exponential decay function:
525 | # Price(t) = Base_Price * exp(-decay_rate * t) + Residual_Value
526 | def exp_decay_with_offset(x, a, b, c):
527 | return a * np.exp(-b * x) + c
528 |
529 | # Initial parameter guesses with bounds
530 | max_price = np.max(y)
531 | mean_price = np.mean(y)
532 | min_price = np.min(y)
533 |
534 | # Initial guess: start at max price, decay to around min price
535 | p0 = [max_price - min_price, 0.001, min_price]
536 |
537 | # Set bounds to ensure reasonable parameters
538 | # a: positive value up to 2x max observed price
539 | # b: positive decay rate, not too small or large
540 | # c: residual value, could be 0 or positive value
541 | bounds = ([0, 0.0001, 0], [2 * max_price, 0.1, mean_price])
542 |
543 | # Try different fitting methods and functions
544 | try:
545 | # First try the 3-parameter model (with residual value)
546 | params, _ = optimize.curve_fit(
547 | exp_decay_with_offset, x, y,
548 | p0=p0, bounds=bounds,
549 | method='trf', maxfev=10000
550 | )
551 | a, b, c = params
552 |
553 | # Generate curve points with more granularity
554 | x_curve = np.linspace(0, x.max(), 200)
555 | y_curve = exp_decay_with_offset(x_curve, a, b, c)
556 |
557 | except RuntimeError:
558 | # If that fails, try simpler 2-parameter model without offset
559 | def exp_decay(x, a, b):
560 | return a * np.exp(-b * x)
561 |
562 | # Adjust bounds for simpler model
563 | p0_simple = [max_price, 0.001]
564 | bounds_simple = ([0, 0.0001], [2 * max_price, 0.1])
565 |
566 | params, _ = optimize.curve_fit(
567 | exp_decay, x, y,
568 | p0=p0_simple, bounds=bounds_simple,
569 | method='trf', maxfev=10000
570 | )
571 | a, b = params
572 | c = 0 # No offset
573 |
574 | # Generate curve points
575 | x_curve = np.linspace(0, x.max(), 200)
576 | y_curve = exp_decay(x_curve, a, b)
577 |
578 |                     # Add the exponential trendline with higher visibility
579 |                     fig.add_trace(go.Scatter(
580 |                         x=today - pd.to_timedelta(x_curve, unit='D'),  # convert fit days to display dates
590 | y=y_curve,
591 | mode='lines',
592 | name='Exponential Trend',
593 | line=dict(color='red', width=3, dash='solid'),
594 | hoverinfo='none' # Disable hover for the trendline to keep it clean
595 | ))
596 |
597 | except Exception as e:
598 | # Log the error for debugging
599 | print(f"Error fitting exponential curve: {str(e)}")
600 |
601 | # Fallback to simple exponential fit using numpy
602 | try:
603 | # Take log of y values for linear fit
604 | log_y = np.log(y)
605 | # Filter out any -inf values from log(0)
606 | valid = np.isfinite(log_y)
607 | x_valid = x[valid]
608 | log_y_valid = log_y[valid]
609 |
610 | if len(x_valid) > 1:
611 | # Linear fit on log-transformed data
612 | z = np.polyfit(x_valid, log_y_valid, 1)
613 | # Convert back to exponential form
614 | a = np.exp(z[1])
615 | b = -z[0] # Negative because our formula is exp(-bx)
616 |
617 | # Generate curve points
618 | x_curve = np.linspace(0, x.max(), 200)
619 | y_curve = a * np.exp(-b * x_curve)
620 |
621 | # Add the exponential trendline
622 | fig.add_trace(go.Scatter(
623 | x=today - pd.to_timedelta(x_curve, unit='D'), # Convert to actual dates
624 | y=y_curve,
625 | mode='lines',
626 | name='Exponential Trend (Simplified)',
627 | line=dict(color='red', width=3, dash='solid'),
628 | hoverinfo='none'
629 | ))
630 | else:
631 | # Not enough valid data for simplified exponential fit
632 | # Fall back to linear as last resort
633 | z = np.polyfit(x, y, 1)
634 | p = np.poly1d(z)
635 | x_curve = np.linspace(0, x.max(), 200)
636 |
637 | fig.add_trace(go.Scatter(
638 | x=today - pd.to_timedelta(x_curve, unit='D'), # Convert to actual dates
639 | y=p(x_curve),
640 | mode='lines',
641 | name='Linear Trend (Fallback)',
642 | line=dict(color='orange', width=3, dash='dash'),
643 | hoverinfo='none'
644 | ))
645 |
646 | except Exception as e2:
647 | print(f"Error with simplified exponential fit: {str(e2)}")
648 | # Final fallback to linear trend if all else fails
649 | try:
650 | z = np.polyfit(x, y, 1)
651 | p = np.poly1d(z)
652 | x_curve = np.linspace(0, x.max(), 200)
653 |
654 | fig.add_trace(go.Scatter(
655 | x=today - pd.to_timedelta(x_curve, unit='D'), # Convert to actual dates
656 | y=p(x_curve),
657 | mode='lines',
658 | name='Linear Trend (Fallback)',
659 | line=dict(color='orange', width=3, dash='dash'),
660 | hoverinfo='none'
661 | ))
662 |                         except Exception:
663 | print("All trendline methods failed")
664 |
665 | # Enhanced summary statistics with better styling
666 | summary_style = {
667 | 'container': {
668 | 'display': 'flex',
669 | 'flex-wrap': 'wrap',
670 | 'gap': '20px'
671 | },
672 | 'card': {
673 | 'flex': '1',
674 | 'min-width': '180px',
675 | 'padding': '15px',
676 | 'border-radius': '5px',
677 | 'background-color': '#f5f9ff',
678 | 'box-shadow': '0 2px 4px rgba(0,0,0,0.05)',
679 | 'text-align': 'center'
680 | },
681 | 'value': {
682 | 'font-size': '20px',
683 | 'font-weight': 'bold',
684 | 'color': '#2c3e50',
685 | 'margin': '10px 0'
686 | },
687 | 'label': {
688 | 'font-size': '14px',
689 | 'color': '#7f8c8d',
690 | 'margin': '0'
691 | }
692 | }
693 |
694 | # Create styled summary stats cards
695 | summary = html.Div([
696 | html.Div([
697 | html.P("Number of Vehicles", style=summary_style['label']),
698 | html.P(f"{len(filtered_df)}", style=summary_style['value'])
699 | ], style=summary_style['card']),
700 |
701 | html.Div([
702 | html.P("Average Price", style=summary_style['label']),
703 | html.P(f"₪{filtered_df['price'].mean():,.0f}", style=summary_style['value'])
704 | ], style=summary_style['card']),
705 |
706 | html.Div([
707 | html.P("Price Range", style=summary_style['label']),
708 | html.P(f"₪{filtered_df['price'].min():,.0f} - ₪{filtered_df['price'].max():,.0f}", style=summary_style['value'])
709 | ], style=summary_style['card']),
710 |
711 | html.Div([
712 | html.P("Average km/year", style=summary_style['label']),
713 | html.P(f"{filtered_df['km_per_year'].mean():,.0f}", style=summary_style['value'])
714 | ], style=summary_style['card']),
715 |
716 | html.Div([
717 | html.P("Average Vehicle Age", style=summary_style['label']),
718 | html.P(f"{filtered_df['number_of_years'].mean():.1f} years", style=summary_style['value'])
719 | ], style=summary_style['card']),
720 | ], style=summary_style['container'])
721 |
722 | return fig, summary
723 |
724 | # Run the app
725 | print(f"Starting dashboard on http://127.0.0.1:{port}/")
726 |     app.run(debug=False, port=port)  # Dash >= 2.7; replaces the deprecated run_server
727 |
728 | def main():
729 | args = parse_arguments()
730 |
731 | # Create output directory if it doesn't exist
732 | output_dir = Path(args.output_dir)
733 | output_dir.mkdir(parents=True, exist_ok=True)
734 |
735 | # Step 1: Scrape the data if not skipped
736 | if not args.skip_scrape:
737 | scrape_data(args.output_dir, args.manufacturer, args.model, args.max_pages)
738 |
739 | # Step 2: Process the scraped data
740 | csv_path = process_data(args.output_dir)
741 |
742 | # Step 3: Load the data
743 | df = load_data(csv_path)
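744 |     # Remove the intermediate CSV so repeated runs don't append duplicate rows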
744 | os.unlink(csv_path)
745 |
746 | # Step 4: Create and run the dashboard
747 | create_dashboard(df, args.port)
748 |
749 |
750 | if __name__ == "__main__":
751 | main()
--------------------------------------------------------------------------------