├── product_info.py
├── main_data_scraper.py
└── README.md

/product_info.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import lxml
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import random
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import re
import csv

url = "xxxxx"

# Output rows; the first entry is the CSV header.
clean_xxx_pl = [
    ["Nazwa produktu", "Cena hurtowa", "№ art", "Dostępność", "Image", "Pasuje do pojazdów", "Grupa produktu"]
]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Content-Type": "text/html",
}
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]

chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # launch the browser maximized
chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # hide the automation flag
chrome_options.add_argument("--headless")  # run the browser in the background

# Install and start the driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)


driver.get(url)
time.sleep(3)
try:
    accept_all_button = WebDriverWait(driver, 4).until(
        ec.element_to_be_clickable((By.ID, "dv-t3-consent-management-button-accept-all"))
    )
    accept_all_button.click()
    print("'Akceptuj wszystkie cookies' clicked - success.")
except Exception as e:
    print("ERROR!", e)
time.sleep(3)

url_wo_login = driver.page_source

with open("html/url_wo_login.html", "w", encoding="utf8") as file:
    file.write(url_wo_login)

try:
    wait = WebDriverWait(driver, 4)
    button = wait.until(ec.element_to_be_clickable((By.CLASS_NAME, "anonum_btn")))
    button.click()
    print("'Wejdź bez logowania' clicked - success.")
except Exception as e:
    print("ERROR", e)

with open("csv/xxx_pl.csv", newline="", encoding="utf-8") as file:
    reader = csv.reader(file)
    next(reader)  # skip the header row
    data = list(reader)

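# For each product row read from csv/xxx_pl.csv: open its product page,
# save a snapshot of the HTML, parse the main image and the
# "Pasuje do pojazdów" (fits-these-vehicles) table, and collect a cleaned
# row for the final CSV.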
i = 0  # progress counter
for row in data:
    url = row[6]
    driver.get(url)
    item_html = driver.page_source
    with open("html/item_html.html", "w", encoding="utf8") as file:
        file.write(item_html)
    with open("html/item_html.html", encoding="utf8") as file:
        src = file.read()
    soup = BeautifulSoup(src, features="lxml")

    # Main product image (if any).
    try:
        current_img = soup.find("a", class_="detail_img current_image").get("href")
        img = f"https:{current_img}"
    except Exception:
        img = "No Image"

    # Vehicle compatibility table: one "model year" pair per row, with the
    # model name emitted only the first time it appears.
    try:
        all_models = soup.find_all("tr", class_="tda")
        models_list = []
        for item in all_models:
            model_info = item.find_all("a")
            items_list = [cell.text for cell in model_info]
            model = items_list[0]
            year = items_list[5]
            if model.strip() in models_list:
                models_list.append(year.strip())
            else:
                models_list.append(model.strip())
                models_list.append(year.strip())
        models = " ".join(models_list)
    except Exception:
        models = "No models info"

    clean_xxx_pl.append([row[0], row[1], row[4], row[2], img, models, row[3]])
    i += 1
    print(f"Done {i} of {len(data)}")

with open("csv/clean_xxx_pl.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(clean_xxx_pl)

driver.quit()
--------------------------------------------------------------------------------
/main_data_scraper.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import lxml
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import random
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import re
import csv


login_url = "xxxxxxxx"
main_shop_url = "xxxxxxxx"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Content-Type": "text/html",
}
payload = {
    "username": "xxxxx",
    "password": "xxxxx"
}

chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # launch the browser maximized
chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # hide the automation flag
chrome_options.add_argument("--headless")  # run the browser in the background

# Install and start the driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

username = "xxxxx"
password = "xxxxx"

# Open the login page
driver.get(login_url)

# Wait for the page to load
time.sleep(2)

# Locate the username and password fields
username_field = driver.find_element(By.ID, "username")
password_field = driver.find_element(By.ID, "password")

# Enter the credentials
username_field.send_keys(username)
password_field.send_keys(password)

# Find and click the login button
login_button = driver.find_element(By.CSS_SELECTOR, "input[type='submit']")
login_button.click()

# Wait for the page to load
time.sleep(2)
driver.get(main_shop_url)
time.sleep(3)
try:
    accept_all_button = WebDriverWait(driver, 4).until(
        ec.element_to_be_clickable((By.ID, "dv-t3-consent-management-button-accept-all"))
    )
    accept_all_button.click()
    print("'Akceptuj wszystkie cookies' clicked - success.")
except Exception as e:
    print("ERROR!", e)
time.sleep(5)

main_shop_url_html = driver.page_source
with open("html/main_shop_url_html.html", "w", encoding="utf8") as file:
    file.write(main_shop_url_html)
with open("html/main_shop_url_html.html", encoding="utf8") as file:
    src = file.read()
soup = BeautifulSoup(src, features="lxml")

# Category tiles on the main shop page.
main_data = re.compile("cat-item")
find_main_info = soup.find_all("div", class_=main_data)

list_of_links = []
list_of_categories = []
final_list = [
    ["Nazwa produktu",
     "Cena hurtowa",
     "Dostępność",
     "Grupa produktu",
     "№ art.",
     "Mini IMG",
     "Link do produktu"
     ]
]

# Create/clear the output file; rows are appended to it at the end of the run.
with open("csv/xxxx.csv", "w", newline='', encoding="utf8") as file:
    writer = csv.writer(file)

for item in find_main_info:
    category_in_item = item.text.split(". ")
    category = category_in_item[1].replace("\xa0", "")
    link_in_item = item.find("a")
    link = link_in_item.get("href")
    if category == "NOWOŚCI":
        continue
    list_of_categories.append(category)
    list_of_links.append(link)


def find_all_pages(src):
    """Return the number of gallery pages listed in the pagination block."""
    soup = BeautifulSoup(src, features="lxml")
    try:
        last_page_find = soup.find("div", class_="navi_seiten").text
        last_page_count = last_page_find.split()
        last_page = int(last_page_count[-2])
    except Exception:
        last_page = 1
        print("*****WARNING! FOUND ONLY 1 PAGE!!*****")
    return last_page


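# lets_pars() takes one gallery page apart: for every product tile it reads
# the name, wholesale price, availability count, breadcrumb category,
# article number, thumbnail URL and product link, and appends them as one
# row to final_list.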
def lets_pars(src, final_list):
    soup = BeautifulSoup(src, features="lxml")
    main_block = soup.find("div", class_="gallery")
    all_items = main_block.find_all("div", class_="gal_elem kombi")
    for item in all_items:
        avi = item.find("div", class_="avail_wrap").text
        res = re.findall(r'\d+', avi)
        item_avi = res[0] if res else "0"
        item_name = item.find("h3").find("a").text
        item_link = item.find("h3").find("a").get("href")
        item_price_l = item.find("td", class_="gal_price_color larsson_size").text
        item_price = item_price_l.replace("\xa0", " ")
        item_image = item.find("img").get("src")
        item_category = item.find_previous("div", class_="breadcrump-liste").text.strip()
        category = " > ".join(
            re.sub(r'^\s*»?\s*\d*\.\s*', '', line.strip()) for line in item_category.strip().splitlines() if line.strip()
        )
        number = item.find("div", class_="gal_artnr").text.split(": ")
        item_num = number[1].replace("\xa0", " ")

        final_list.append([item_name, item_price, item_avi, category, item_num, "https:" + item_image, item_link])
    return final_list


for i in range(len(list_of_categories)):
    driver.get(list_of_links[i] + "?show=gal_kombi")

    time.sleep(random.uniform(0, 2))

    with open(f"html/{list_of_categories[i]}.html", "w", encoding="utf8") as file:
        file.write(driver.page_source)
    with open(f"html/{list_of_categories[i]}.html", encoding="utf8") as file:
        src = file.read()

    last_page = find_all_pages(src)

    time.sleep(random.uniform(0, 2))

    # 30 items per gallery page; &sr= is the offset of the first item on the page.
    for pars_page in range(last_page):
        time.sleep(random.uniform(0, 2))
        driver.get(list_of_links[i] + f"?show=gal_kombi&sr={pars_page * 30}")
        src = driver.page_source
        lets_pars(src, final_list)
        print(f"Parsing {list_of_categories[i]}... page {pars_page + 1} of {last_page}")


with open("csv/xxxx.csv", "a", newline='', encoding="utf8") as file:
    writer = csv.writer(file)
    writer.writerows(final_list)


driver.quit()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Dropshipping Product Scraping Tool

A comprehensive web scraping solution for automated eCommerce product data extraction and processing.

## Overview

This project provides a robust, enterprise-grade web scraping framework designed to extract product information from eCommerce websites. Built with Python and Selenium, it handles dynamic content, bypasses anti-bot protections, and delivers clean, structured data for dropshipping businesses.
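In outline, both scripts follow the same pattern: drive a headless Chrome instance with Selenium, hand the rendered HTML to BeautifulSoup, and write the extracted fields to CSV. The sketch below is a minimal illustration of that loop; the URL and the `.product-name` selector are placeholders, not the real target site's markup:

```python
import csv
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")                                     # no visible browser window
options.add_argument("--disable-blink-features=AutomationControlled")  # hide the automation flag

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
try:
    driver.get("https://example.com/catalog")  # placeholder URL
    time.sleep(2)                              # give dynamic content time to render
    soup = BeautifulSoup(driver.page_source, "lxml")
    rows = [[tag.get_text(strip=True)] for tag in soup.select(".product-name")]  # placeholder selector
    with open("products.csv", "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows([["Product Name"], *rows])
finally:
    driver.quit()
```

The real scripts layer cookie-consent handling, login, pagination, and per-product detail parsing on top of this skeleton.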
### Key Capabilities

- **Automated Product Discovery**: Scrapes product listings across multiple pages
- **Detailed Product Information**: Extracts specifications, images, compatibility data, and pricing
- **Anti-Bot Evasion**: Implements sophisticated techniques to bypass detection systems
- **Dynamic Content Handling**: Processes JavaScript-rendered content and interactive elements
- **Data Export**: Outputs clean, structured data in CSV format

## Architecture

### Core Components

#### `main_data_scraper.py`
Primary scraping engine responsible for:
- Multi-page product catalog traversal
- Product URL collection and categorization
- Initial product attribute extraction
- Session management and request orchestration

#### `product_info.py`
Detailed product processor that handles:
- Individual product page analysis
- Image extraction and validation
- Model and compatibility data parsing
- Data normalization and CSV export

## Features

### 🤖 Advanced Browser Automation
- **Selenium WebDriver**: Full browser automation for JavaScript-heavy sites
- **Headless Operation**: Optimized performance without GUI overhead
- **Element Interaction**: Handles clicks, form submissions, and dynamic loading
- **Smart Waiting**: WebDriverWait implementation for reliable element detection

### 🛡️ Anti-Detection Technology
- **User-Agent Rotation**: Randomized browser fingerprints
- **Request Throttling**: Intelligent delays to mimic human behavior
- **Chrome Options Optimization**: Stealth mode configuration
- **Session Persistence**: Maintains realistic browsing patterns

### 📊 Data Processing Pipeline
- **Dynamic Content Extraction**: Handles AJAX-loaded product information
- **Image Processing**: Automated image discovery and validation
- **Data Cleaning**: Removes duplicates and normalizes formats
- **CSV Export**: Structured output with customizable fields

### 🔧 Error Handling & Reliability
- **Graceful Degradation**: Continues operation when individual products fail
- **Retry Mechanisms**: Automatic retry for transient failures
- **Comprehensive Logging**: Detailed operation tracking
- **Resource Management**: Proper cleanup of browser instances

## Technical Specifications

### System Requirements
- **Python**: 3.8 or higher (required by current Selenium 4 releases)
- **Memory**: Minimum 4GB RAM recommended
- **Storage**: 1GB free space for data and browser cache
- **Network**: Stable internet connection

### Dependencies

```python
selenium>=4.0.0
webdriver-manager>=3.8.0
beautifulsoup4>=4.11.0
lxml>=4.9.0
requests>=2.28.0
pandas>=1.5.0  # Optional: for advanced data manipulation
```

## Installation

### Quick Start

```bash
git clone https://github.com/danieladdisonorg/Dropshipping-Product-Scraping.git
```

```bash
cd Dropshipping-Product-Scraping
```

```bash
pip install -r requirements.txt
```

### Chrome WebDriver Setup

The project uses WebDriver Manager for automatic Chrome driver management. No manual driver installation required.
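Both scripts resolve the driver at runtime with the same few lines (shown here trimmed to the essentials), so a matching `chromedriver` is downloaded automatically on the first run:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = Options()
chrome_options.add_argument("--headless")

# webdriver-manager fetches a chromedriver that matches the installed Chrome
# and returns its path; Selenium is then pointed at that binary.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
```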
## Usage

### Basic Operation

```bash
python main_data_scraper.py
```

```bash
python product_info.py
```

### Configuration Options

The scripts support various configuration parameters:
- **Target URLs**: Modify source websites in the configuration section
- **Output Format**: Customize CSV field structure
- **Scraping Intervals**: Adjust delay timing for different sites
- **User-Agent Lists**: Update browser fingerprint rotation

## Output Format

### CSV Structure

```
# csv/xxxx.csv (written by main_data_scraper.py)
Nazwa produktu, Cena hurtowa, Dostępność, Grupa produktu, № art., Mini IMG, Link do produktu

# csv/clean_xxx_pl.csv (written by product_info.py)
Nazwa produktu, Cena hurtowa, № art, Dostępność, Image, Pasuje do pojazdów, Grupa produktu
```

### Data Quality Features
- **Duplicate Removal**: Automatic deduplication based on product identifiers
- **Data Validation**: Ensures required fields are populated
- **Image Verification**: Validates image URLs and accessibility
- **Format Standardization**: Consistent data formatting across all records

## Best Practices

### Ethical Scraping Guidelines
- **Rate Limiting**: Respects server resources with appropriate delays
- **robots.txt Compliance**: Honors website scraping policies
- **Terms of Service**: Ensure compliance with target site terms
- **Data Usage**: Use scraped data responsibly and legally

### Performance Optimization
- **Batch Processing**: Groups requests for efficiency
- **Memory Management**: Proper cleanup of browser resources
- **Concurrent Processing**: Multi-threading support for large datasets
- **Caching**: Reduces redundant requests

## Troubleshooting

### Common Issues
- **Chrome Driver Errors**: Ensure Chrome browser is installed and updated
- **Timeout Issues**: Increase wait times for slow-loading sites
- **Memory Usage**: Monitor RAM usage during large scraping operations
- **IP Blocking**: Implement proxy rotation if needed

### Debug Mode
Enable verbose logging by modifying the logging configuration in the scripts.

## Contributing

We welcome contributions! Please read our contributing guidelines and submit pull requests for any improvements.

### Development Setup

```bash
pip install -r requirements-dev.txt
```

```bash
python -m pytest tests/
```

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Disclaimer

This tool is intended for educational and legitimate business purposes only. Users are responsible for ensuring compliance with applicable laws, website terms of service, and ethical scraping practices. The authors are not responsible for any misuse of this software.

## Support

For issues, feature requests, or questions:
- **GitHub Issues**: [Create an issue](https://github.com/danieladdisonorg/Dropshipping-Product-Scraping/issues)
- **Documentation**: Check the wiki for detailed guides
- **Community**: Join our discussions for tips and best practices

---

**Version**: 2.0.0
**Last Updated**: 2024
**Maintained by**: Daniel Addison
--------------------------------------------------------------------------------