├── .gitignore
├── requirements.txt
├── README.md
├── yad2_parser.py
├── scraper.py
└── vehicle_analyzer.py
/.gitignore:
--------------------------------------------------------------------------------
1 | scraped_vehicles/*
2 | *.pyc
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.25.0
2 | beautifulsoup4>=4.9.0
3 | pandas>=1.3.0
4 | dash>=2.7.0  # app.run (used by the dashboard) was added in Dash 2.7
5 | plotly>=5.0.0
6 | numpy>=1.20.0
7 | scipy>=1.7.0
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Yad2 Vehicle Price Analyzer
2 |
3 | A tool for scraping vehicle listings from Yad2, the Israeli classifieds site, and visualizing price-versus-age trends in an interactive Dash dashboard.
4 |
5 | ## Installation
14 |
15 | ```bash
16 | # Clone repository
17 | git clone https://github.com/yourusername/yad2-vehicle-analyzer.git
18 | cd yad2-vehicle-analyzer
19 |
20 | # Install dependencies
21 | pip install -r requirements.txt
22 | ```
23 |
24 | ## Examples
25 |
26 | Basic usage:
27 | ```bash
28 | # Run with default settings (Toyota bZ4X)
29 | python vehicle_analyzer.py
30 | ```
31 |
32 | Scrape specific vehicle model:
33 | ```bash
34 | # Volkswagen ID.4
35 | python vehicle_analyzer.py --manufacturer 41 --model 11579
36 |
37 | # Hyundai Ioniq 5
38 | python vehicle_analyzer.py --manufacturer 21 --model 11239
39 |
40 | # Nissan Qashqai
41 | python vehicle_analyzer.py --manufacturer 32 --model 10449
42 | ```
43 |
44 | Use existing data:
45 | ```bash
46 | # Skip scraping
47 | python vehicle_analyzer.py --skip-scrape
48 | ```
49 |
50 | Change web server port:
51 | ```bash
52 | python vehicle_analyzer.py --port 8080
53 | ```
54 |
55 | Manufacturer and model IDs can be read from the query string of any Yad2 search URL:
56 | ```
57 | https://www.yad2.co.il/vehicles/cars?manufacturer=19&model=12894
58 | ```
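59 |
60 | For example, `manufacturer=19&model=12894` is the default Toyota bZ4X target, so the URL above corresponds to:
61 |
62 | ```bash
63 | python vehicle_analyzer.py --manufacturer 19 --model 12894
64 | ```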
59 |
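60 | The scraper and parser can also be run on their own:
61 |
62 | ```bash
63 | # Fetch listing pages into scraped_vehicles/
64 | python scraper.py
65 |
66 | # Rebuild the CSV summary from HTML files scraped today
67 | python yad2_parser.py
68 | ```
69 |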
--------------------------------------------------------------------------------
/yad2_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | import json
3 | import csv
4 | from typing import List, Dict
5 | from datetime import datetime
6 | from bs4 import BeautifulSoup
7 | import os
8 | from pathlib import Path
9 |
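10 | # Date stamp (YY_MM_DD) used to select only HTML files scraped today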
10 | today = datetime.now().date().strftime("%y_%m_%d")
11 |
12 | def extract_json_from_html(html_content: str) -> Dict:
13 | """Extract JSON data from __NEXT_DATA__ script tag"""
14 | soup = BeautifulSoup(html_content, 'html.parser')
15 | script_tag = soup.find('script', id='__NEXT_DATA__')
16 |
17 | if script_tag is None:
18 | raise ValueError("Could not find __NEXT_DATA__ script tag in HTML")
19 |
20 | return json.loads(script_tag.string)
21 |
22 | def get_month_number(month_text: str) -> int:
23 | # Hebrew month names to numbers mapping
24 | month_mapping = {
25 | 'ינואר': 1, 'פברואר': 2, 'מרץ': 3, 'אפריל': 4,
26 | 'מאי': 5, 'יוני': 6, 'יולי': 7, 'אוגוסט': 8,
27 | 'ספטמבר': 9, 'אוקטובר': 10, 'נובמבר': 11, 'דצמבר': 12
28 | }
29 | return month_mapping.get(month_text, 1) # Default to 1 if month not found
30 |
31 | def format_date(date_str: str) -> str:
32 | # Parse ISO format and return YYYY-MM-DD
33 | return datetime.fromisoformat(date_str).strftime('%Y-%m-%d')
34 |
35 | def calculate_years_since_production(production_year: int, production_month: int) -> float:
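36 |     # 365.25 days per year averages out leap years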
36 | production_date = datetime(production_year, production_month, 1)
37 | current_date = datetime.now()
38 | years = (current_date - production_date).days / 365.25
39 | return years
40 |
41 | def process_vehicle_data(json_list: List[Dict], listing_type: str, output_file: str, mode: str = 'w') -> None:
42 | """Process vehicle data and write to CSV"""
43 | # Define the headers we want to extract
44 | headers = ['adNumber', 'price', 'city', 'adType', 'model', 'subModel',
45 | 'productionDate', 'km', 'hand', 'createdAt', 'updatedAt',
46 | 'rebouncedAt', 'listingType', 'number_of_years', 'km_per_year', 'description', 'link', 'make', 'hp']
47 |
48 | # Open the CSV file for writing
49 | with open(output_file, mode, newline='', encoding='utf-8') as csvfile:
50 | writer = csv.DictWriter(csvfile, fieldnames=headers)
51 | if mode == 'w': # Only write header if we're creating a new file
52 | writer.writeheader()
53 |
54 | # Process each JSON object
55 | for item in json_list:
56 | try:
57 | # Create date string in YYYY-MM-DD format for production date
58 | year = item['vehicleDates']['yearOfProduction']
59 |                 month = get_month_number((item['vehicleDates'].get('monthOfProduction') or {}).get('text', 'ינואר'))  # default to January when the month is absent
60 | production_date = f"{year}-{month:02d}-01" # Format: YYYY-MM-DD
61 |
62 | # Calculate years since production
63 | years_since_production = calculate_years_since_production(year, month)
64 |
65 | # Calculate km per year
66 | km = item['km']
67 | km_per_year = round(km / years_since_production if years_since_production > 0 else km, 2)
68 |
69 |                 hp_match = re.search(r'(\d+)\s*כ״ס', item['subModel']['text'])  # horsepower embedded in the sub-model text
70 |                 row = {
71 |                     'adNumber': item['adNumber'],
72 |                     'price': item['price'],
73 |                     'city': (item['address'].get('city') or {}).get('text', ''),
74 |                     'adType': item['adType'],
75 |                     'model': item['model']['text'],
76 |                     'subModel': item['subModel']['text'],
77 |                     'hp': int(hp_match.group(1)) if hp_match else 0,
77 | 'make': item['manufacturer']['text'],
78 | 'productionDate': production_date,
79 | 'km': item['km'],
80 | 'hand': item['hand']["id"],
81 | 'createdAt': format_date(item['dates']['createdAt']),
82 | 'updatedAt': format_date(item['dates']['updatedAt']),
83 | 'rebouncedAt': format_date(item['dates']['rebouncedAt']),
84 | 'listingType': listing_type,
85 | 'number_of_years': years_since_production,
86 | 'km_per_year': km_per_year,
87 | 'description': item["metaData"]["description"],
88 | 'link': f'https://www.yad2.co.il/vehicles/item/{item["token"]}',
89 | }
90 | writer.writerow(row)
91 |             except KeyError as e:
92 |                 print(f"Skipping item due to missing key: {e}")
93 |                 print(item)
94 |                 continue
95 | except Exception as e:
96 | print(f"Error processing item: {e}")
97 |
98 | def process_directory(directory_path: str) -> None:
99 | """Process all HTML files in a directory and combine the data"""
100 | # Get directory name for the output file
101 | dir_name = Path(directory_path).name
102 | output_file = f"{dir_name}_summary.csv"
103 | output_path = os.path.join(directory_path, output_file)
104 |
105 | # Process each HTML file in the directory
106 | for filename in os.listdir(directory_path):
107 | if filename.endswith('.html') and today in filename:
108 | file_path = os.path.join(directory_path, filename)
109 | try:
110 | with open(file_path, 'r', encoding='utf-8') as file:
111 | print(f"Processing {filename}...")
112 | html_content = file.read()
113 | data = extract_json_from_html(html_content)
114 | listings_data = data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']
115 |
116 |                     # Process each listing bucket returned in the page payload
117 |                     for bucket in ('commercial', 'private', 'solo', 'platinum'):
118 |                         bucket_list = listings_data.get(bucket, [])
119 |                         if bucket_list:
120 |                             mode = 'a' if os.path.exists(output_path) else 'w'
121 |                             process_vehicle_data(bucket_list, bucket, output_path, mode)
122 |                             print(f"Processed {len(bucket_list)} {bucket} listings")
142 |
143 | except Exception as e:
144 | print(f"Error processing {filename}: {e}")
145 |
146 | print(f"Output saved to: {output_path}")
147 |
148 | if __name__ == "__main__":
149 | directory_path = "scraped_vehicles"
150 | process_directory(directory_path)
151 |
152 |     # Optionally upload the summary to Google Drive. The upload_drive helper
153 |     # module is not part of this repository, so guard the import.
154 |     try:
155 |         import upload_drive
156 |         output_file = f"{Path(directory_path).name}_summary.csv"
157 |         output_path = os.path.join(directory_path, output_file)
158 |         upload_drive.upload_to_sheet(output_path)
159 |         os.unlink(output_path)
160 |     except ImportError:
161 |         print("upload_drive module not available; keeping the local CSV")
--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 | import os
4 | from datetime import datetime, timedelta
5 | from pathlib import Path
6 | import logging
7 | import yad2_parser
8 |
9 | class VehicleScraper:
10 | def __init__(self, output_dir, manufacturer=32, model=10449):
11 | """
12 | Initialize the scraper with output directory and vehicle parameters
13 |
14 | Args:
15 | output_dir (str): Directory to save the scraped files
16 | manufacturer (int): Manufacturer ID
17 | model (int): Model ID
18 | """
19 | self.output_dir = Path(output_dir)
20 | self.manufacturer = manufacturer
21 | self.model = model
22 | self.session = requests.Session()
23 |
24 |         # Browser-like request headers, copied from a real browser session
25 | self.headers = {
26 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
27 | 'Accept-Language': 'en-US,en;q=0.9,he;q=0.8',
28 | 'Cache-Control': 'max-age=0',
29 | 'Connection': 'keep-alive',
30 | 'DNT': '1',
31 | 'Referer': 'https://www.yad2.co.il/',
32 | 'Sec-Fetch-Dest': 'document',
33 | 'Sec-Fetch-Mode': 'navigate',
34 | 'Sec-Fetch-Site': 'same-origin',
35 | 'Sec-Fetch-User': '?1',
36 | 'Upgrade-Insecure-Requests': '1',
37 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
38 | 'sec-ch-ua': '"Chromium";v="131", "Not_A Brand";v="24"',
39 | 'sec-ch-ua-mobile': '?0',
40 | 'sec-ch-ua-platform': '"macOS"'
41 | }
42 |
43 | # Set up cookies
44 | self.cookies = {
45 | '__ssds': '3',
46 | 'y2018-2-cohort': '88',
47 | 'use_elastic_search': '1',
48 | 'abTestKey': '2',
49 | 'cohortGroup': 'D'
50 | # Note: Added only essential cookies. Add more if needed.
51 | }
52 |
53 | # Create output directory if it doesn't exist
54 | self.output_dir.mkdir(parents=True, exist_ok=True)
55 |
56 | # Setup logging
57 | logging.basicConfig(
58 | level=logging.INFO,
59 | format='%(asctime)s - %(levelname)s - %(message)s'
60 | )
61 | self.logger = logging.getLogger(__name__)
62 |
63 | def build_url(self, page_num):
64 | """Build the URL for a specific page number"""
65 | base_url = "https://www.yad2.co.il/vehicles/cars"
66 | params = {
67 | "manufacturer": self.manufacturer,
68 | "model": self.model,
69 |             # Optional filters, left commented out as examples:
70 |             # "carFamilyType": "5,10",
71 |             # "year": "2022--1",
72 |             # "price": "110000-190000",
73 |             # "km": "-1-60000",
74 |             "hand": "0-2",  # restrict results to first- or second-hand cars
75 |             # "imgOnly": "1",
76 | "page": page_num
77 | }
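78 |         # Example result: https://www.yad2.co.il/vehicles/cars?manufacturer=19&model=12894&hand=0-2&page=1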
78 | return f"{base_url}?{'&'.join(f'{k}={v}' for k, v in params.items())}"
79 |
80 |     def get_output_filename(self, page_num):
81 |         """Generate output filename based on date, manufacturer and model"""
82 |         today = datetime.now().date().strftime("%y_%m_%d")
83 |         return self.output_dir / f"{today}_manufacturer{self.manufacturer}_model{self.model}_page{page_num}.html"
84 |
85 | def should_skip_file(self, filepath):
86 | """Check if file exists and was modified in the last 24 hours"""
87 | if not filepath.exists():
88 | return False
89 |
90 | file_mtime = datetime.fromtimestamp(filepath.stat().st_mtime)
91 | return datetime.now() - file_mtime < timedelta(days=1)
92 |
93 | def fetch_page(self, page_num):
94 | """
95 | Fetch a single page and save it to file
96 |
97 | Args:
98 | page_num (int): Page number to fetch
99 |
100 |         Returns:
101 |             int | None: total number of result pages, or None if the request failed
102 |         """
103 | output_file = self.get_output_filename(page_num)
104 |
105 | if self.should_skip_file(output_file):
106 | self.logger.info(f"Skipping page {page_num} - recent file exists")
107 | with open(output_file, 'r', encoding='utf-8') as file:
108 | print(f"Processing {output_file}...")
109 | html_content = file.read()
110 | data = yad2_parser.extract_json_from_html(html_content)
111 | listings_data = data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']
112 | return listings_data["pagination"]["pages"]
113 |
114 | try:
115 | url = self.build_url(page_num)
116 | self.logger.info(f"Fetching page {page_num}")
117 |
118 | time.sleep(5) # Rate limiting
119 | response = self.session.get(
120 | url,
121 | headers=self.headers,
122 | cookies=self.cookies,
123 | allow_redirects=True
124 | )
125 | response.raise_for_status()
126 |
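127 |             # Sanity check: a full listings page is large and embeds __NEXT_DATA__ (likely guards against blocked or empty responses)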
127 | assert len(response.content) > 50000 and b'__NEXT_DATA__' in response.content, len(response.content)
128 |
129 | data = yad2_parser.extract_json_from_html(response.content.decode("utf-8"))
130 | listings_data = data['props']['pageProps']['dehydratedState']['queries'][0]['state']['data']
131 | with open(output_file, 'wb') as f:
132 | f.write(response.content)
133 |
134 | self.logger.info(f"Successfully saved page {page_num}")
135 | return listings_data["pagination"]["pages"]
136 |
137 | except requests.exceptions.RequestException as e:
138 | self.logger.error(f"Error fetching page {page_num}: {str(e)}")
139 | return
140 |
141 | def scrape_pages(self, max_page=100):
142 | """
143 | Fetch multiple pages with rate limiting
144 |
145 |         Args:
146 |             max_page (int): Maximum number of pages to fetch
147 |         """
148 | page = 1
149 | while True:
150 | pages = self.fetch_page(page)
151 |             print(f"Page {page}/{pages}")
152 | # Only wait between requests if we actually made a request
153 | if pages and page < pages and page < max_page:
154 | page += 1
155 | else:
156 | return
157 |
158 | def main():
159 | # Example usage
160 | output_dir = "scraped_vehicles" # Replace with your desired output directory
161 | # VehicleScraper(output_dir, manufacturer=32, model=1337).scrape_pages() # Nissan
162 | # return
163 | VehicleScraper(output_dir, manufacturer=19, model=12894).scrape_pages(max_page=20) # bz4x
164 | # VehicleScraper(output_dir, manufacturer=32, model=10449).scrape_pages(max_page=20) # Nissan
165 | # VehicleScraper(output_dir, manufacturer=21, model=10283).scrape_pages(max_page=1)
166 | # VehicleScraper(output_dir, manufacturer=41, model=11579).scrape_pages(max_page=5) # ID.4
167 | # VehicleScraper(output_dir, manufacturer=41, model=12928).scrape_pages(max_page=5) # ID.5
168 | # VehicleScraper(output_dir, manufacturer=40, model=10545).scrape_pages(max_page=5)
169 | # VehicleScraper(output_dir, manufacturer=21, model=11239).scrape_pages(max_page=10) # Ioniq 5
170 | # VehicleScraper(output_dir, manufacturer=92, model=12134).scrape_pages(max_page=10) # Cupra Formentor
171 | # VehicleScraper(output_dir, manufacturer=41, model=10574).scrape_pages(max_page=10) # Tiguan
172 | # VehicleScraper(output_dir, manufacturer=40, model=11568).scrape_pages(max_page=10) # Enyaq
173 | # manufacturer=19&model=10226&subModel=104254,104255,104253
174 | # VehicleScraper(output_dir, manufacturer=19, model=10226).scrape_pages(max_page=20)
179 |
180 | if __name__ == "__main__":
181 | main()
--------------------------------------------------------------------------------
/vehicle_analyzer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | from pathlib import Path
5 | import pandas as pd
6 | from datetime import datetime
7 |
8 | # Import the scraper modules
9 | from scraper import VehicleScraper
10 | import yad2_parser
11 |
12 | # For web visualization
13 | import dash
14 | from dash import dcc, html, Input, Output, State
15 | import plotly.express as px
16 | import plotly.graph_objects as go
17 | import numpy as np
19 |
20 | def parse_arguments():
21 | """Parse command line arguments"""
22 | parser = argparse.ArgumentParser(description='Vehicle Price Analyzer')
23 | parser.add_argument('--output-dir', type=str, default='scraped_vehicles',
24 | help='Directory to save scraped data')
25 | parser.add_argument('--manufacturer', type=int, default=19,
26 | help='Manufacturer ID to scrape')
27 | parser.add_argument('--model', type=int, default=12894,
28 | help='Model ID to scrape')
29 | parser.add_argument('--max-pages', type=int, default=25,
30 | help='Maximum number of pages to scrape')
31 | parser.add_argument('--skip-scrape', action='store_true',
32 | help='Skip scraping and use existing data')
33 | parser.add_argument('--port', type=int, default=8050,
34 | help='Port to run the web server on')
35 | return parser.parse_args()
36 |
37 | def scrape_data(output_dir, manufacturer, model, max_pages):
38 | """Run the scraper to collect vehicle data"""
39 | print(f"Scraping data for manufacturer={manufacturer}, model={model}...")
40 | scraper = VehicleScraper(output_dir, manufacturer, model)
41 | scraper.scrape_pages(max_page=max_pages)
42 |
43 | def process_data(output_dir):
44 | """Process the scraped HTML files into a CSV"""
45 | print("Processing scraped HTML files...")
46 | dir_name = Path(output_dir).name
47 | yad2_parser.process_directory(output_dir)
48 | output_file = f"{dir_name}_summary.csv"
49 | output_path = os.path.join(output_dir, output_file)
50 |
51 | # Check if the CSV file exists
52 | if not os.path.exists(output_path):
53 | print(f"Error: Could not find processed data at {output_path}")
54 | sys.exit(1)
55 |
56 | return output_path
57 |
58 | def load_data(csv_path):
59 | """Load and prepare the CSV data for visualization"""
60 | try:
61 | df = pd.read_csv(csv_path)
62 |
63 | # Filter out cars with no price or price = 0
64 | df = df[df['price'] > 0]
65 |
66 | # Convert date strings to datetime objects
67 | df['productionDate'] = pd.to_datetime(df['productionDate'])
68 |
69 | # Extract year from production date for easier filtering
70 | df['productionYear'] = df['productionDate'].dt.year
71 |
72 | return df
73 | except Exception as e:
74 | print(f"Error loading data: {str(e)}")
75 | sys.exit(1)
76 |
77 | def create_dashboard(df, port=8050):
78 | """Create and run an interactive Dash app for visualizing the data"""
79 | # Create a custom stylesheet
80 | external_stylesheets = [
81 | {
82 | 'href': 'https://fonts.googleapis.com/css2?family=Roboto:wght@400;700&display=swap',
83 | 'rel': 'stylesheet'
84 | }
85 | ]
86 | # Create the app
87 | app = dash.Dash(
88 | __name__,
89 | title="Vehicle Price Analyzer",
90 | external_stylesheets=external_stylesheets,
91 | suppress_callback_exceptions=True # Needed for clientside callbacks
92 | )
93 |
94 | # Get unique values for filters
95 | km_ranges = [
96 | {'label': 'All', 'value': 'all'},
97 | {'label': '≤ 10,000 km/year', 'value': '0-10000'},
98 | {'label': '≤ 15,000 km/year', 'value': '0-15000'},
99 | {'label': '≤ 20,000 km/year', 'value': '0-20000'},
100 | {'label': '≤ 25,000 km/year', 'value': '0-25000'},
101 | {'label': '> 25,000 km/year', 'value': '25000-999999'}
102 | ]
103 |
104 | hands = [{'label': 'All Hands', 'value': 'all'}] + [
105 | {'label': f'Hand ≤ {h}', 'value': f'0-{h}'} for h in sorted(df['hand'].unique()) if h > 0
106 | ]
107 |
108 | sub_models = [{'label': 'All Sub-models', 'value': 'all'}] + [
109 | {'label': sm, 'value': sm} for sm in sorted(df['subModel'].unique())
110 | ]
111 |
112 | # Create model filter options
113 | models = [{'label': m, 'value': m} for m in sorted(df['model'].unique())]
114 |
115 | ad_types = [{'label': 'All', 'value': 'all'}] + [
116 | {'label': at, 'value': at} for at in sorted(df['listingType'].unique())
117 | ]
118 |
119 | # Define CSS styles
120 | styles = {
121 | 'container': {
122 | 'font-family': 'Roboto, sans-serif',
123 | 'max-width': '1200px',
124 | 'margin': '0 auto',
125 | 'padding': '20px',
126 | 'background-color': '#f9f9f9',
127 | 'border-radius': '8px',
128 | 'box-shadow': '0 4px 8px rgba(0,0,0,0.1)'
129 | },
130 | 'header': {
131 | 'background-color': '#2c3e50',
132 | 'color': 'white',
133 | 'padding': '15px 20px',
134 | 'margin-bottom': '20px',
135 | 'border-radius': '5px',
136 | 'text-align': 'center'
137 | },
138 | 'filter_container': {
139 | 'display': 'flex',
140 | 'flex-wrap': 'wrap',
141 | 'gap': '15px',
142 | 'background-color': 'white',
143 | 'padding': '15px',
144 | 'border-radius': '5px',
145 | 'box-shadow': '0 2px 4px rgba(0,0,0,0.05)',
146 | 'margin-bottom': '20px'
147 | },
148 | 'filter': {
149 | 'width': '23%',
150 | 'min-width': '200px',
151 | 'padding': '10px'
152 | },
153 | 'label': {
154 | 'font-weight': 'bold',
155 | 'margin-bottom': '5px',
156 | 'color': '#2c3e50'
157 | },
158 | 'graph': {
159 | 'background-color': 'white',
160 | 'padding': '15px',
161 | 'border-radius': '5px',
162 | 'box-shadow': '0 2px 4px rgba(0,0,0,0.05)',
163 | 'margin-bottom': '20px'
164 | },
165 | 'summary': {
166 | 'background-color': 'white',
167 | 'padding': '15px',
168 | 'border-radius': '5px',
169 | 'box-shadow': '0 2px 4px rgba(0,0,0,0.05)'
170 | },
171 | 'summary_header': {
172 | 'color': '#2c3e50',
173 | 'border-bottom': '2px solid #3498db',
174 | 'padding-bottom': '10px',
175 | 'margin-bottom': '15px'
176 | },
177 | 'button': {
178 | 'background-color': '#2c3e50',
179 | 'color': 'white',
180 | 'border': 'none',
181 | 'padding': '10px 20px',
182 | 'border-radius': '5px',
183 | 'cursor': 'pointer',
184 | 'font-weight': 'bold',
185 | 'margin-top': '10px',
186 | 'width': '100%'
187 | },
188 | 'clear_button': {
189 | 'background-color': '#e74c3c',
190 | 'color': 'white',
191 | 'border': 'none',
192 | 'padding': '10px 20px',
193 | 'border-radius': '5px',
194 | 'cursor': 'pointer',
195 | 'font-weight': 'bold',
196 | 'margin-top': '10px',
197 | 'width': '100%'
198 | },
199 | 'click_instruction': {
200 | 'text-align': 'center',
201 | 'font-style': 'italic',
202 | 'color': '#3498db',
203 | 'margin': '10px 0',
204 | 'padding': '8px',
205 | 'background-color': '#f0f7ff',
206 | 'border-radius': '5px',
207 | 'border-left': '3px solid #3498db'
208 | }
209 | }
210 |
211 | # Create the app layout
212 | app.layout = html.Div([
213 | # Header
214 | html.Div([
215 | html.H1("Vehicle Price Analysis Dashboard", style={'margin': '0'})
216 | ], style=styles['header']),
217 |
218 | # Filter section
219 | html.Div([
220 | html.Div([
221 | html.Label("Filter by km/year:", style=styles['label']),
222 | dcc.Dropdown(
223 | id='km-filter',
224 | options=km_ranges,
225 | value='all',
226 | clearable=False
227 | ),
228 | ], style=styles['filter']),
229 |
230 | html.Div([
231 | html.Label("Filter by owner hand:", style=styles['label']),
232 | dcc.Dropdown(
233 | id='hand-filter',
234 | options=hands,
235 | value='all',
236 | clearable=False
237 | ),
238 | ], style=styles['filter']),
239 |
240 | # New model multi-select dropdown
241 | html.Div([
242 | html.Label("Filter by model:", style=styles['label']),
243 | dcc.Dropdown(
244 | id='model-filter',
245 | options=models,
246 | value=[],
247 | multi=True,
248 | placeholder="Select model(s)"
249 | ),
250 | ], style=styles['filter']),
251 |
252 | html.Div([
253 | html.Label("Filter by listing type:", style=styles['label']),
254 | dcc.Dropdown(
255 | id='adtype-filter',
256 | options=ad_types,
257 | value='all',
258 | clearable=False
259 | ),
260 | ], style=styles['filter']),
261 |
262 | html.Div([
263 | html.Label("Filter by sub-model:", style=styles['label']),
264 | html.Div([
265 | dcc.Checklist(
266 | id='submodel-checklist',
267 | options=[], # Will be populated dynamically based on model selection
268 | value=[],
269 | labelStyle={'display': 'block', 'margin-bottom': '8px', 'cursor': 'pointer'},
270 | style={'max-height': '200px', 'overflow-y': 'auto', 'padding': '10px', 'background-color': '#f5f9ff', 'border-radius': '5px'}
271 | ),
272 | ]),
273 | html.Div([
274 | html.Button(
275 | 'Apply Filters',
276 | id='apply-submodel-button',
277 | style=styles['button']
278 | ),
279 | html.Button(
280 | 'Clear Selection',
281 | id='clear-submodel-button',
282 | style=styles['clear_button']
283 | ),
284 | ], style={'display': 'flex', 'gap': '10px'}),
285 | ], style={'width': '23%', 'min-width': '200px', 'padding': '10px', 'flex-grow': '1'}),
286 |
287 | ], style=styles['filter_container']),
288 |
289 | # Click instruction
290 | html.Div([
291 | html.P("👆 Click on any point in the graph to open the vehicle ad in a new tab")
292 | ], style=styles['click_instruction']),
293 |
294 | # Graph section
295 | html.Div([
296 | dcc.Graph(id='price-date-scatter')
297 | ], style=styles['graph']),
298 |
299 | # Summary section
300 | html.Div([
301 | html.H3("Data Summary", style=styles['summary_header']),
302 | html.Div(id='summary-stats')
303 | ], style=styles['summary']),
304 |
305 |         # Store for clicked links, used by the clientside click callback below
306 | dcc.Store(id='clicked-link', storage_type='memory'),
307 | ], style=styles['container'])
308 |
309 |     # Clientside callback: opens the clicked point's ad link in a new tab
310 | app.clientside_callback(
311 | """
312 | function(clickData) {
313 | console.log(clickData);
314 | if(clickData && clickData.points && clickData.points.length > 0) {
315 | const link = clickData.points[0].customdata[6];
316 | if(link && link.length > 0) {
317 | window.open(link, '_blank');
318 | }
319 | }
320 | return window.dash_clientside.no_update;
321 | }
322 | """,
323 | Output('clicked-link', 'data'),
324 | Input('price-date-scatter', 'clickData'),
325 | prevent_initial_call=True
326 | )
327 |
328 | # Callback to update submodel options based on selected models
329 | @app.callback(
330 | Output('submodel-checklist', 'options'),
331 | Input('model-filter', 'value'),
332 | )
333 | def update_submodel_options(selected_models):
334 | if not selected_models or len(selected_models) == 0:
335 | # If no models selected, show all submodels
336 | # For each submodel, add the model name in brackets
337 | submodel_options = []
338 | for sm in sorted(df['subModel'].unique()):
339 | # Find models for this submodel
340 | models_for_submodel = df[df['subModel'] == sm]['model'].unique()
341 | if len(models_for_submodel) == 1:
342 | # If there's only one model for this submodel
343 | label = f"[{models_for_submodel[0]}] {sm}"
344 | else:
345 | # If there are multiple models, show first one with "+"
346 | label = f"[{models_for_submodel[0]}+] {sm}"
347 | submodel_options.append({'label': label, 'value': sm})
348 | else:
349 | # Filter submodels based on selected models
350 | filtered_df = df[df['model'].isin(selected_models)]
351 | submodel_options = []
352 | for sm in sorted(filtered_df['subModel'].unique()):
353 | # Find models for this submodel (limited to selected models)
354 | models_for_submodel = filtered_df[filtered_df['subModel'] == sm]['model'].unique()
355 | if len(models_for_submodel) == 1:
356 | # If there's only one model for this submodel
357 | label = f" {sm} [{models_for_submodel[0]}]"
358 | else:
359 | # Join all models (should be less since we're filtering)
360 | models_str = '+'.join(models_for_submodel)
361 | label = f" {sm} [{models_str}]"
362 | submodel_options.append({'label': label, 'value': sm})
363 |
364 |         return sorted(submodel_options, key=lambda x: x['label'])
365 |
366 | # Callback to clear submodel selection
367 | @app.callback(
368 | Output('submodel-checklist', 'value'),
369 | Input('clear-submodel-button', 'n_clicks'),
370 | prevent_initial_call=True
371 | )
372 | def clear_submodel_selection(n_clicks):
373 | return []
374 |
375 | @app.callback(
376 | [Output('price-date-scatter', 'figure'),
377 | Output('summary-stats', 'children')],
378 | [Input('km-filter', 'value'),
379 | Input('hand-filter', 'value'),
380 | Input('model-filter', 'value'),
381 | Input('apply-submodel-button', 'n_clicks'),
382 | Input('adtype-filter', 'value')],
383 | [State('submodel-checklist', 'value')]
384 | )
385 | def update_graph(km_range, hand, models, submodel_btn_clicks, adtype, submodel_list):
386 | # Apply filters
387 | filtered_df = df.copy()
388 |
389 | if km_range != 'all':
390 | min_km, max_km = map(int, km_range.split('-'))
391 | filtered_df = filtered_df[filtered_df['km_per_year'] <= max_km]
392 | if min_km > 0: # For the "> 25,000" filter
393 | filtered_df = filtered_df[filtered_df['km_per_year'] > min_km]
394 |
395 | if hand != 'all':
396 | # Parse the hand range format (e.g., "0-2" means hand ≤ 2)
397 | min_hand, max_hand = map(int, hand.split('-'))
398 | filtered_df = filtered_df[filtered_df['hand'] <= max_hand]
399 |
400 | # Handle model multiselect filter
401 | if models and len(models) > 0:
402 | filtered_df = filtered_df[filtered_df['model'].isin(models)]
403 |
404 | # Handle checkbox list for submodels
405 | if submodel_list and len(submodel_list) > 0:
406 | # If checkboxes are selected, filter to only those submodels
407 | filtered_df = filtered_df[filtered_df['subModel'].isin(submodel_list)]
408 | # When no checkboxes are selected, show all submodels
409 |
410 | if adtype != 'all':
411 | filtered_df = filtered_df[filtered_df['listingType'] == adtype]
412 |
413 | # For car price analysis, we want newest cars on the left and oldest on the right
414 | # First, calculate "days since newest car" for each point
415 | newest_date = filtered_df['productionDate'].max()
416 | filtered_df['days_since_newest'] = (newest_date - filtered_df['productionDate']).dt.days
417 |
418 | # Calculate actual dates instead of days since newest
419 | today = pd.Timestamp.today().normalize() # Get today's date (without time)
420 | filtered_df['display_date'] = today - pd.to_timedelta(filtered_df['days_since_newest'], unit='D')
421 |
422 | # Create scatter plot with actual dates on x-axis
423 | fig = px.scatter(
424 | filtered_df,
425 | x='display_date',
426 | y='price',
427 | color='km_per_year',
428 |             # Marker size is set uniformly in update_traces below
430 | color_continuous_scale='viridis', # Smooth color gradient
431 | range_color=[0, filtered_df['km_per_year'].quantile(0.95)], # Cap color scale for better differentiation
432 | hover_data=['model', 'subModel', 'hand', 'km', 'city', 'productionDate', 'link'],
433 | labels={'display_date': 'Date',
434 | 'price': 'Price (₪)',
435 | 'km_per_year': 'Kilometers per Year'},
436 | title=f'Vehicle Prices by Age ({len(filtered_df)} vehicles)'
437 | )
438 |
439 | # Create custom data array for hover and click functionality
440 | custom_data = np.column_stack((
441 | filtered_df['model'],
442 | filtered_df['subModel'],
443 | filtered_df['hand'],
444 | filtered_df['km'],
445 | filtered_df['city'],
446 | filtered_df['productionDate'],
447 | filtered_df['link']
448 | ))
449 |
450 |         # Make points clickable through to their ad links, with improved styling
451 | fig.update_traces(
452 | marker=dict(
453 | size=8, # Larger points for easier clicking
454 | opacity=0.8,
455 | line=dict(width=1, color='DarkSlateGrey') # Add outline for better visibility
456 | ),
457 | customdata=custom_data,
458 |             hovertemplate='%{customdata[0]} %{customdata[1]}<br>' +
459 |                           'Price: ₪%{y:,.0f}<br>' +
460 |                           'Production Date: %{customdata[5]}<br>' +
461 |                           'Hand: %{customdata[2]}<br>' +
462 |                           'KM: %{customdata[3]:,.0f}<br>' +
463 |                           'City: %{customdata[4]}<br>' +
464 |                           '👆 Click to view ad'  # Clear instruction in hover
465 | )
466 |
467 |         # Configure click handling and polish the overall layout
468 | fig.update_layout(
469 | clickmode='event+select', # Enable clicking on points
470 | hoverdistance=100, # Increase hover detection distance
471 | hovermode='closest', # Show hover info for closest point
472 | # Improve interactivity
473 | dragmode='zoom',
474 | # Enhance appearance
475 | plot_bgcolor='rgba(240,240,240,0.2)',
476 | paper_bgcolor='rgba(0,0,0,0)',
477 | font=dict(family="Roboto, sans-serif"),
478 | xaxis=dict(
479 | title_font=dict(size=14),
480 | tickfont=dict(size=12),
481 | gridcolor='#eee',
482 |                 # Reverse the x-axis so newer production dates appear on the left
483 | autorange="reversed"
484 | ),
485 | yaxis=dict(
486 | title_font=dict(size=14),
487 | tickfont=dict(size=12),
488 | gridcolor='#eee'
489 | ),
490 | title=dict(
491 | font=dict(size=16)
492 | ),
493 | legend=dict(
494 | title_font=dict(size=13),
495 | font=dict(size=11)
496 | ),
497 | coloraxis_colorbar=dict(
498 | title="Km/Year",
499 | title_font=dict(size=13),
500 | tickfont=dict(size=11)
501 | ),
502 | margin=dict(l=40, r=40, t=60, b=40)
503 | )
504 |
505 | # Always add exponential trendline
506 | # For car price depreciation, we'll use days since newest car as x-axis
507 | if len(filtered_df) > 1:
508 | # Sort by days_since_newest for proper fitting
509 | sorted_df = filtered_df.sort_values('days_since_newest')
510 |
511 | x = sorted_df['days_since_newest'].values
512 | y = sorted_df['price'].values
513 |
514 | # Ensure we have numeric data for curve fitting
515 | valid_indices = ~np.isnan(x) & ~np.isnan(y)
516 | x = x[valid_indices]
517 | y = y[valid_indices]
518 |
519 | if len(x) > 1: # Need at least 2 points for curve fitting
520 | try:
521 | # For better exponential fit, try more robust approaches
522 | from scipy import optimize
523 |
524 | # For car price depreciation, an exponential decay function:
525 | # Price(t) = Base_Price * exp(-decay_rate * t) + Residual_Value
526 | def exp_decay_with_offset(x, a, b, c):
527 | return a * np.exp(-b * x) + c
528 |
529 | # Initial parameter guesses with bounds
530 | max_price = np.max(y)
531 | mean_price = np.mean(y)
532 | min_price = np.min(y)
533 |
534 | # Initial guess: start at max price, decay to around min price
535 | p0 = [max_price - min_price, 0.001, min_price]
536 |
537 | # Set bounds to ensure reasonable parameters
538 | # a: positive value up to 2x max observed price
539 | # b: positive decay rate, not too small or large
540 | # c: residual value, could be 0 or positive value
541 | bounds = ([0, 0.0001, 0], [2 * max_price, 0.1, mean_price])
542 |
543 | # Try different fitting methods and functions
544 | try:
545 | # First try the 3-parameter model (with residual value)
546 | params, _ = optimize.curve_fit(
547 | exp_decay_with_offset, x, y,
548 | p0=p0, bounds=bounds,
549 | method='trf', maxfev=10000
550 | )
551 | a, b, c = params
552 |
553 | # Generate curve points with more granularity
554 | x_curve = np.linspace(0, x.max(), 200)
555 | y_curve = exp_decay_with_offset(x_curve, a, b, c)
556 |
557 | except RuntimeError:
558 | # If that fails, try simpler 2-parameter model without offset
559 | def exp_decay(x, a, b):
560 | return a * np.exp(-b * x)
561 |
562 | # Adjust bounds for simpler model
563 | p0_simple = [max_price, 0.001]
564 | bounds_simple = ([0, 0.0001], [2 * max_price, 0.1])
565 |
566 | params, _ = optimize.curve_fit(
567 | exp_decay, x, y,
568 | p0=p0_simple, bounds=bounds_simple,
569 | method='trf', maxfev=10000
570 | )
571 | a, b = params
572 | c = 0 # No offset
573 |
574 | # Generate curve points
575 | x_curve = np.linspace(0, x.max(), 200)
576 | y_curve = exp_decay(x_curve, a, b)
577 |
578 |                     # Add the exponential trendline with higher visibility
579 |                     fig.add_trace(go.Scatter(
580 |                         x=today - pd.to_timedelta(x_curve, unit='D'),  # convert fit days to display dates
590 | y=y_curve,
591 | mode='lines',
592 | name='Exponential Trend',
593 | line=dict(color='red', width=3, dash='solid'),
594 | hoverinfo='none' # Disable hover for the trendline to keep it clean
595 | ))
596 |
597 | except Exception as e:
598 | # Log the error for debugging
599 | print(f"Error fitting exponential curve: {str(e)}")
600 |
601 | # Fallback to simple exponential fit using numpy
602 | try:
603 | # Take log of y values for linear fit
604 | log_y = np.log(y)
605 | # Filter out any -inf values from log(0)
606 | valid = np.isfinite(log_y)
607 | x_valid = x[valid]
608 | log_y_valid = log_y[valid]
609 |
610 | if len(x_valid) > 1:
611 | # Linear fit on log-transformed data
612 | z = np.polyfit(x_valid, log_y_valid, 1)
613 | # Convert back to exponential form
614 | a = np.exp(z[1])
615 | b = -z[0] # Negative because our formula is exp(-bx)
616 |
617 | # Generate curve points
618 | x_curve = np.linspace(0, x.max(), 200)
619 | y_curve = a * np.exp(-b * x_curve)
620 |
621 | # Add the exponential trendline
622 | fig.add_trace(go.Scatter(
623 | x=today - pd.to_timedelta(x_curve, unit='D'), # Convert to actual dates
624 | y=y_curve,
625 | mode='lines',
626 | name='Exponential Trend (Simplified)',
627 | line=dict(color='red', width=3, dash='solid'),
628 | hoverinfo='none'
629 | ))
630 | else:
631 | # Not enough valid data for simplified exponential fit
632 | # Fall back to linear as last resort
633 | z = np.polyfit(x, y, 1)
634 | p = np.poly1d(z)
635 | x_curve = np.linspace(0, x.max(), 200)
636 |
637 | fig.add_trace(go.Scatter(
638 | x=today - pd.to_timedelta(x_curve, unit='D'), # Convert to actual dates
639 | y=p(x_curve),
640 | mode='lines',
641 | name='Linear Trend (Fallback)',
642 | line=dict(color='orange', width=3, dash='dash'),
643 | hoverinfo='none'
644 | ))
645 |
646 | except Exception as e2:
647 | print(f"Error with simplified exponential fit: {str(e2)}")
648 | # Final fallback to linear trend if all else fails
649 | try:
650 | z = np.polyfit(x, y, 1)
651 | p = np.poly1d(z)
652 | x_curve = np.linspace(0, x.max(), 200)
653 |
654 | fig.add_trace(go.Scatter(
655 | x=today - pd.to_timedelta(x_curve, unit='D'), # Convert to actual dates
656 | y=p(x_curve),
657 | mode='lines',
658 | name='Linear Trend (Fallback)',
659 | line=dict(color='orange', width=3, dash='dash'),
660 | hoverinfo='none'
661 | ))
662 |                         except Exception:
663 | print("All trendline methods failed")
664 |
665 | # Enhanced summary statistics with better styling
666 | summary_style = {
667 | 'container': {
668 | 'display': 'flex',
669 | 'flex-wrap': 'wrap',
670 | 'gap': '20px'
671 | },
672 | 'card': {
673 | 'flex': '1',
674 | 'min-width': '180px',
675 | 'padding': '15px',
676 | 'border-radius': '5px',
677 | 'background-color': '#f5f9ff',
678 | 'box-shadow': '0 2px 4px rgba(0,0,0,0.05)',
679 | 'text-align': 'center'
680 | },
681 | 'value': {
682 | 'font-size': '20px',
683 | 'font-weight': 'bold',
684 | 'color': '#2c3e50',
685 | 'margin': '10px 0'
686 | },
687 | 'label': {
688 | 'font-size': '14px',
689 | 'color': '#7f8c8d',
690 | 'margin': '0'
691 | }
692 | }
693 |
694 | # Create styled summary stats cards
695 | summary = html.Div([
696 | html.Div([
697 | html.P("Number of Vehicles", style=summary_style['label']),
698 | html.P(f"{len(filtered_df)}", style=summary_style['value'])
699 | ], style=summary_style['card']),
700 |
701 | html.Div([
702 | html.P("Average Price", style=summary_style['label']),
703 | html.P(f"₪{filtered_df['price'].mean():,.0f}", style=summary_style['value'])
704 | ], style=summary_style['card']),
705 |
706 | html.Div([
707 | html.P("Price Range", style=summary_style['label']),
708 | html.P(f"₪{filtered_df['price'].min():,.0f} - ₪{filtered_df['price'].max():,.0f}", style=summary_style['value'])
709 | ], style=summary_style['card']),
710 |
711 | html.Div([
712 | html.P("Average km/year", style=summary_style['label']),
713 | html.P(f"{filtered_df['km_per_year'].mean():,.0f}", style=summary_style['value'])
714 | ], style=summary_style['card']),
715 |
716 | html.Div([
717 | html.P("Average Vehicle Age", style=summary_style['label']),
718 | html.P(f"{filtered_df['number_of_years'].mean():.1f} years", style=summary_style['value'])
719 | ], style=summary_style['card']),
720 | ], style=summary_style['container'])
721 |
722 | return fig, summary
723 |
724 | # Run the app
725 | print(f"Starting dashboard on http://127.0.0.1:{port}/")
726 |     app.run(debug=False, port=port)  # Dash >= 2.7; replaces the deprecated run_server
727 |
728 | def main():
729 | args = parse_arguments()
730 |
731 | # Create output directory if it doesn't exist
732 | output_dir = Path(args.output_dir)
733 | output_dir.mkdir(parents=True, exist_ok=True)
734 |
735 | # Step 1: Scrape the data if not skipped
736 | if not args.skip_scrape:
737 | scrape_data(args.output_dir, args.manufacturer, args.model, args.max_pages)
738 |
739 | # Step 2: Process the scraped data
740 | csv_path = process_data(args.output_dir)
741 |
742 | # Step 3: Load the data
743 | df = load_data(csv_path)
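744 |     # Remove the intermediate CSV so repeated runs don't append duplicate rows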
744 | os.unlink(csv_path)
745 |
746 | # Step 4: Create and run the dashboard
747 | create_dashboard(df, args.port)
748 |
749 |
750 | if __name__ == "__main__":
751 | main()
--------------------------------------------------------------------------------