├── product_info.py
├── main_data_scraper.py
└── README.md

/product_info.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import lxml
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import random
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import re
import csv

url = "xxxxx"

# Output rows; the first entry is the CSV header.
clean_xxx_pl = [
    ["Nazwa produktu", "Cena hurtowa", "№ art", "Dostępność", "Image", "Pasuje do pojazdów", "Grupa produktu"]
]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Content-Type": "text/html",
}
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
]

chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # launch the browser maximized
chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # hide the automation flag
chrome_options.add_argument("--headless")  # run the browser in the background

# Install and start the driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)


driver.get(url)
time.sleep(3)
try:
    accept_all_button = WebDriverWait(driver, 4).until(
        ec.element_to_be_clickable((By.ID, "dv-t3-consent-management-button-accept-all"))
    )
    accept_all_button.click()
    print("'Akceptuj wszystkie cookies' clicked - success.")
except Exception as e:
    print("ERROR!", e)
time.sleep(3)

url_wo_login = driver.page_source

with open("html/url_wo_login.html", "w", encoding="utf8") as file:
    file.write(url_wo_login)

try:
    wait = WebDriverWait(driver, 4)
    button = wait.until(ec.element_to_be_clickable((By.CLASS_NAME, "anonum_btn")))
    button.click()
    print("'Wejdź bez logowania' clicked - success.")
except Exception as e:
    print("ERROR", e)

with open("csv/xxx_pl.csv", newline="", encoding="utf-8") as file:
    reader = csv.reader(file)
    next(reader)  # skip the header row
    data = list(reader)

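# For each product row read from csv/xxx_pl.csv: open its product page,
# save a snapshot of the HTML, parse the main image and the
# "Pasuje do pojazdów" (fits-these-vehicles) table, and collect a cleaned
# row for the final CSV.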
i = 0  # progress counter
for row in data:
    url = row[6]
    driver.get(url)
    item_html = driver.page_source
    with open("html/item_html.html", "w", encoding="utf8") as file:
        file.write(item_html)
    with open("html/item_html.html", encoding="utf8") as file:
        src = file.read()
    soup = BeautifulSoup(src, features="lxml")

    # Main product image (if any).
    try:
        current_img = soup.find("a", class_="detail_img current_image").get("href")
        img = f"https:{current_img}"
    except Exception:
        img = "No Image"

    # Vehicle compatibility table: one "model year" pair per row, with the
    # model name emitted only the first time it appears.
    try:
        all_models = soup.find_all("tr", class_="tda")
        models_list = []
        for item in all_models:
            model_info = item.find_all("a")
            items_list = [cell.text for cell in model_info]
            model = items_list[0]
            year = items_list[5]
            if model.strip() in models_list:
                models_list.append(year.strip())
            else:
                models_list.append(model.strip())
                models_list.append(year.strip())
        models = " ".join(models_list)
    except Exception:
        models = "No models info"

    clean_xxx_pl.append([row[0], row[1], row[4], row[2], img, models, row[3]])
    i += 1
    print(f"Done {i} of {len(data)}")

with open("csv/clean_xxx_pl.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(clean_xxx_pl)

driver.quit()
--------------------------------------------------------------------------------
/main_data_scraper.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import lxml
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import random
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait
import re
import csv


login_url = "xxxxxxxx"
main_shop_url = "xxxxxxxx"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Content-Type": "text/html",
}
payload = {
    "username": "xxxxx",
    "password": "xxxxx"
}

chrome_options = Options()
chrome_options.add_argument("--start-maximized")  # launch the browser maximized
chrome_options.add_argument("--disable-blink-features=AutomationControlled")  # hide the automation flag
chrome_options.add_argument("--headless")  # run the browser in the background

# Install and start the driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

username = "xxxxx"
password = "xxxxx"

# Open the login page
driver.get(login_url)

# Wait for the page to load
time.sleep(2)

# Locate the username and password fields
username_field = driver.find_element(By.ID, "username")
password_field = driver.find_element(By.ID, "password")

# Enter the credentials
username_field.send_keys(username)
password_field.send_keys(password)

# Find and click the login button
login_button = driver.find_element(By.CSS_SELECTOR, "input[type='submit']")
login_button.click()

# Wait for the page to load
time.sleep(2)
driver.get(main_shop_url)
time.sleep(3)
try:
    accept_all_button = WebDriverWait(driver, 4).until(
        ec.element_to_be_clickable((By.ID, "dv-t3-consent-management-button-accept-all"))
    )
    accept_all_button.click()
    print("'Akceptuj wszystkie cookies' clicked - success.")
except Exception as e:
    print("ERROR!", e)
time.sleep(5)

main_shop_url_html = driver.page_source
with open("html/main_shop_url_html.html", "w", encoding="utf8") as file:
    file.write(main_shop_url_html)
with open("html/main_shop_url_html.html", encoding="utf8") as file:
    src = file.read()
soup = BeautifulSoup(src, features="lxml")

# Category tiles on the main shop page.
main_data = re.compile("cat-item")
find_main_info = soup.find_all("div", class_=main_data)

list_of_links = []
list_of_categories = []
final_list = [
    ["Nazwa produktu",
     "Cena hurtowa",
     "Dostępność",
     "Grupa produktu",
     "№ art.",
     "Mini IMG",
     "Link do produktu"
     ]
]

# Create/clear the output file; rows are appended to it at the end of the run.
with open("csv/xxxx.csv", "w", newline='', encoding="utf8") as file:
    writer = csv.writer(file)

for item in find_main_info:
    category_in_item = item.text.split(". ")
    category = category_in_item[1].replace("\xa0", "")
    link_in_item = item.find("a")
    link = link_in_item.get("href")
    if category == "NOWOŚCI":
        continue
    list_of_categories.append(category)
    list_of_links.append(link)


def find_all_pages(src):
    """Return the number of gallery pages listed in the pagination block."""
    soup = BeautifulSoup(src, features="lxml")
    try:
        last_page_find = soup.find("div", class_="navi_seiten").text
        last_page_count = last_page_find.split()
        last_page = int(last_page_count[-2])
    except Exception:
        last_page = 1
        print("*****WARNING! FOUND ONLY 1 PAGE!!*****")
    return last_page


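# lets_pars() takes one gallery page apart: for every product tile it reads
# the name, wholesale price, availability count, breadcrumb category,
# article number, thumbnail URL and product link, and appends them as one
# row to final_list.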
def lets_pars(src, final_list):
    soup = BeautifulSoup(src, features="lxml")
    main_block = soup.find("div", class_="gallery")
    all_items = main_block.find_all("div", class_="gal_elem kombi")
    for item in all_items:
        avi = item.find("div", class_="avail_wrap").text
        res = re.findall(r'\d+', avi)
        item_avi = res[0] if res else "0"
        item_name = item.find("h3").find("a").text
        item_link = item.find("h3").find("a").get("href")
        item_price_l = item.find("td", class_="gal_price_color larsson_size").text
        item_price = item_price_l.replace("\xa0", " ")
        item_image = item.find("img").get("src")
        item_category = item.find_previous("div", class_="breadcrump-liste").text.strip()
        category = " > ".join(
            re.sub(r'^\s*»?\s*\d*\.\s*', '', line.strip()) for line in item_category.strip().splitlines() if line.strip()
        )
        number = item.find("div", class_="gal_artnr").text.split(": ")
        item_num = number[1].replace("\xa0", " ")

        final_list.append([item_name, item_price, item_avi, category, item_num, "https:" + item_image, item_link])
    return final_list


for i in range(len(list_of_categories)):
    driver.get(list_of_links[i] + "?show=gal_kombi")

    time.sleep(random.uniform(0, 2))

    with open(f"html/{list_of_categories[i]}.html", "w", encoding="utf8") as file:
        file.write(driver.page_source)
    with open(f"html/{list_of_categories[i]}.html", encoding="utf8") as file:
        src = file.read()

    last_page = find_all_pages(src)

    time.sleep(random.uniform(0, 2))

    # 30 items per gallery page; &sr= is the offset of the first item on the page.
    for pars_page in range(last_page):
        time.sleep(random.uniform(0, 2))
        driver.get(list_of_links[i] + f"?show=gal_kombi&sr={pars_page * 30}")
        src = driver.page_source
        lets_pars(src, final_list)
        print(f"Parsing {list_of_categories[i]}... page {pars_page + 1} of {last_page}")


with open("csv/xxxx.csv", "a", newline='', encoding="utf8") as file:
    writer = csv.writer(file)
    writer.writerows(final_list)


driver.quit()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Dropshipping Product Scraping Tool

A comprehensive web scraping solution for automated eCommerce product data extraction and processing.

## Overview

This project provides a robust, enterprise-grade web scraping framework designed to extract product information from eCommerce websites. Built with Python and Selenium, it handles dynamic content, bypasses anti-bot protections, and delivers clean, structured data for dropshipping businesses.
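In outline, both scripts follow the same pattern: drive a headless Chrome instance with Selenium, hand the rendered HTML to BeautifulSoup, and write the extracted fields to CSV. The sketch below is a minimal illustration of that loop; the URL and the `.product-name` selector are placeholders, not the real target site's markup:

```python
import csv
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = Options()
options.add_argument("--headless")                                     # no visible browser window
options.add_argument("--disable-blink-features=AutomationControlled")  # hide the automation flag

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
try:
    driver.get("https://example.com/catalog")  # placeholder URL
    time.sleep(2)                              # give dynamic content time to render
    soup = BeautifulSoup(driver.page_source, "lxml")
    rows = [[tag.get_text(strip=True)] for tag in soup.select(".product-name")]  # placeholder selector
    with open("products.csv", "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerows([["Product Name"], *rows])
finally:
    driver.quit()
```

The real scripts layer cookie-consent handling, login, pagination, and per-product detail parsing on top of this skeleton.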
### Key Capabilities

- **Automated Product Discovery**: Scrapes product listings across multiple pages
- **Detailed Product Information**: Extracts specifications, images, compatibility data, and pricing
- **Anti-Bot Evasion**: Implements sophisticated techniques to bypass detection systems
- **Dynamic Content Handling**: Processes JavaScript-rendered content and interactive elements
- **Data Export**: Outputs clean, structured data in CSV format

## Architecture

### Core Components

#### `main_data_scraper.py`
Primary scraping engine responsible for:
- Multi-page product catalog traversal
- Product URL collection and categorization
- Initial product attribute extraction
- Session management and request orchestration

#### `product_info.py`
Detailed product processor that handles:
- Individual product page analysis
- Image extraction and validation
- Model and compatibility data parsing
- Data normalization and CSV export

## Features

### 🤖 Advanced Browser Automation
- **Selenium WebDriver**: Full browser automation for JavaScript-heavy sites
- **Headless Operation**: Optimized performance without GUI overhead
- **Element Interaction**: Handles clicks, form submissions, and dynamic loading
- **Smart Waiting**: WebDriverWait implementation for reliable element detection

### 🛡️ Anti-Detection Technology
- **User-Agent Rotation**: Randomized browser fingerprints
- **Request Throttling**: Intelligent delays to mimic human behavior
- **Chrome Options Optimization**: Stealth mode configuration
- **Session Persistence**: Maintains realistic browsing patterns

### 📊 Data Processing Pipeline
- **Dynamic Content Extraction**: Handles AJAX-loaded product information
- **Image Processing**: Automated image discovery and validation
- **Data Cleaning**: Removes duplicates and normalizes formats
- **CSV Export**: Structured output with customizable fields

### 🔧 Error Handling & Reliability
- **Graceful Degradation**: Continues operation when individual products fail
- **Retry Mechanisms**: Automatic retry for transient failures
- **Comprehensive Logging**: Detailed operation tracking
- **Resource Management**: Proper cleanup of browser instances

## Technical Specifications

### System Requirements
- **Python**: 3.8 or higher (required by current Selenium 4 releases)
- **Memory**: Minimum 4GB RAM recommended
- **Storage**: 1GB free space for data and browser cache
- **Network**: Stable internet connection

### Dependencies

```python
selenium>=4.0.0
webdriver-manager>=3.8.0
beautifulsoup4>=4.11.0
lxml>=4.9.0
requests>=2.28.0
pandas>=1.5.0  # Optional: for advanced data manipulation
```

## Installation

### Quick Start

```bash
git clone https://github.com/danieladdisonorg/Dropshipping-Product-Scraping.git
```

```bash
cd Dropshipping-Product-Scraping
```

```bash
pip install -r requirements.txt
```

### Chrome WebDriver Setup

The project uses WebDriver Manager for automatic Chrome driver management. No manual driver installation required.
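Both scripts resolve the driver at runtime with the same few lines (shown here trimmed to the essentials), so a matching `chromedriver` is downloaded automatically on the first run:

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

chrome_options = Options()
chrome_options.add_argument("--headless")

# webdriver-manager fetches a chromedriver that matches the installed Chrome
# and returns its path; Selenium is then pointed at that binary.
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)
```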
## Usage

### Basic Operation

```bash
python main_data_scraper.py
```

```bash
python product_info.py
```

### Configuration Options

The scripts support various configuration parameters:
- **Target URLs**: Modify source websites in the configuration section
- **Output Format**: Customize CSV field structure
- **Scraping Intervals**: Adjust delay timing for different sites
- **User-Agent Lists**: Update browser fingerprint rotation

## Output Format

### CSV Structure

```
# csv/xxxx.csv (written by main_data_scraper.py)
Nazwa produktu, Cena hurtowa, Dostępność, Grupa produktu, № art., Mini IMG, Link do produktu

# csv/clean_xxx_pl.csv (written by product_info.py)
Nazwa produktu, Cena hurtowa, № art, Dostępność, Image, Pasuje do pojazdów, Grupa produktu
```

### Data Quality Features
- **Duplicate Removal**: Automatic deduplication based on product identifiers
- **Data Validation**: Ensures required fields are populated
- **Image Verification**: Validates image URLs and accessibility
- **Format Standardization**: Consistent data formatting across all records

## Best Practices

### Ethical Scraping Guidelines
- **Rate Limiting**: Respects server resources with appropriate delays
- **robots.txt Compliance**: Honors website scraping policies
- **Terms of Service**: Ensure compliance with target site terms
- **Data Usage**: Use scraped data responsibly and legally

### Performance Optimization
- **Batch Processing**: Groups requests for efficiency
- **Memory Management**: Proper cleanup of browser resources
- **Concurrent Processing**: Multi-threading support for large datasets
- **Caching**: Reduces redundant requests

## Troubleshooting

### Common Issues
- **Chrome Driver Errors**: Ensure Chrome browser is installed and updated
- **Timeout Issues**: Increase wait times for slow-loading sites
- **Memory Usage**: Monitor RAM usage during large scraping operations
- **IP Blocking**: Implement proxy rotation if needed

### Debug Mode
Enable verbose logging by modifying the logging configuration in the scripts.

## Contributing

We welcome contributions! Please read our contributing guidelines and submit pull requests for any improvements.

### Development Setup

```bash
pip install -r requirements-dev.txt
```

```bash
python -m pytest tests/
```

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Disclaimer

This tool is intended for educational and legitimate business purposes only. Users are responsible for ensuring compliance with applicable laws, website terms of service, and ethical scraping practices. The authors are not responsible for any misuse of this software.

## Support

For issues, feature requests, or questions:
- **GitHub Issues**: [Create an issue](https://github.com/danieladdisonorg/Dropshipping-Product-Scraping/issues)
- **Documentation**: Check the wiki for detailed guides
- **Community**: Join our discussions for tips and best practices

---

**Version**: 2.0.0
**Last Updated**: 2024
**Maintained by**: Daniel Addison
--------------------------------------------------------------------------------