├── functions.py
├── main.py
└── requirements.txt

/functions.py:
--------------------------------------------------------------------------------
import time as tm

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By


def page_down(driver):
    """Smoothly scroll to the bottom of the page so lazy-loaded content appears."""
    driver.execute_script('''
        const scrollStep = 200;      // scroll step size (pixels)
        const scrollInterval = 100;  // delay between steps (milliseconds)

        const scrollHeight = document.documentElement.scrollHeight;
        let currentPosition = 0;
        const interval = setInterval(() => {
            window.scrollBy(0, scrollStep);
            currentPosition += scrollStep;

            if (currentPosition >= scrollHeight) {
                clearInterval(interval);
            }
        }, scrollInterval);
    ''')


def collect_product_info(driver, url=''):
    """Open the product page in a new tab, parse its data and return it as a dict."""
    driver.switch_to.new_window('tab')

    tm.sleep(3)
    driver.get(url=url)
    tm.sleep(3)

    # product_id ("Артикул" is the SKU label shown on the Ozon product page)
    product_id = driver.find_element(
        By.XPATH, '//div[contains(text(), "Артикул: ")]'
    ).text.split('Артикул: ')[1]

    page_source = str(driver.page_source)
    soup = BeautifulSoup(page_source, 'lxml')

    # keep a local copy of the page for debugging
    with open(f'product_{product_id}.html', 'w', encoding='utf-8') as file:
        file.write(page_source)

    product_name = soup.find('div', attrs={'data-widget': 'webProductHeading'}).find(
        'h1').text.strip().replace('\t', '').replace('\n', ' ')

    # product statistics: the widget text looks like "<stars> • <reviews>"
    product_statistic = None
    product_stars = None
    product_reviews = None
    try:
        product_statistic = soup.find(
            'div', attrs={'data-widget': 'webSingleProductScore'}).text.strip()

        if ' • ' in product_statistic:
            stars_part, reviews_part = product_statistic.split(' • ', 1)
            product_stars = stars_part.strip()
            product_reviews = reviews_part.strip()
    except AttributeError:
        pass

    # product prices: primary layout with the Ozon Card price and the regular prices
    try:
        ozon_card_price_element = soup.find(
            'span', string='c Ozon Картой').parent.find('div').find('span')
        product_ozon_card_price = ozon_card_price_element.text.strip(
        ) if ozon_card_price_element else ''

        price_element = soup.find(
            'span', string='без Ozon Карты').parent.parent.find('div').findAll('span')

        product_discount_price = price_element[0].text.strip(
        ) if price_element[0] else ''
        product_base_price = price_element[1].text.strip(
        ) if price_element[1] is not None else ''
    except (AttributeError, IndexError):
        # fallback layout: prices live inside the "webPrice" widget
        product_ozon_card_price = None
        product_discount_price = None
        product_base_price = None
        try:
            card_price_div = soup.find(
                'div', attrs={'data-widget': 'webPrice'}).findAll('span')

            product_base_price = card_price_div[0].text.strip()
            product_discount_price = card_price_div[1].text.strip()
        except (AttributeError, IndexError):
            pass
    product_data = {
        'product_id': product_id,
        'product_name': product_name,
        'product_ozon_card_price': product_ozon_card_price,
        'product_discount_price': product_discount_price,
        'product_base_price': product_base_price,
        'product_statistic': product_statistic,
        'product_stars': product_stars,
        'product_reviews': product_reviews,
    }

    # close the product tab and return to the search results tab
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

    return product_data
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import json
import time

import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from functions import page_down, collect_product_info


def get_products_links(item_name='наушники hyperx'):
    driver = uc.Chrome()
    driver.implicitly_wait(5)

    driver.get(url='https://ozon.ru')
    time.sleep(2)

    # type the search query into the site search box
    find_input = driver.find_element(By.NAME, 'text')
    find_input.clear()
    find_input.send_keys(item_name)
    time.sleep(2)

    find_input.send_keys(Keys.ENTER)
    time.sleep(2)

    # re-open the search results sorted by rating
    current_url = f'{driver.current_url}&sorting=rating'
    driver.get(url=current_url)
    time.sleep(2)

    # page_down(driver=driver)  # uncomment to scroll and lazy-load more results
    time.sleep(2)

    products_urls = []
    try:
        find_links = driver.find_elements(By.CLASS_NAME, 'tile-hover-target')
        products_urls = list({link.get_attribute('href') for link in find_links})

        print('[+] Product links collected!')
    except Exception:
        print('[!] Something went wrong while collecting product links!')

    products_urls_dict = dict(enumerate(products_urls))

    with open('products_urls_dict.json', 'w', encoding='utf-8') as file:
        json.dump(products_urls_dict, file, indent=4, ensure_ascii=False)

    time.sleep(2)

    products_data = []

    for url in products_urls:
        data = collect_product_info(driver=driver, url=url)
        print(f'[+] Collected data for product id: {data.get("product_id")}')
        time.sleep(2)
        products_data.append(data)

    with open('PRODUCTS_DATA.json', 'w', encoding='utf-8') as file:
        json.dump(products_data, file, indent=4, ensure_ascii=False)

    driver.quit()


def main():
    print('[INFO] Data collection started. Please wait...')
    get_products_links(item_name='наушники hyperx')
    print('[INFO] Job finished successfully!')


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
bs4==0.0.2
lxml==5.3.0
undetected-chromedriver==3.5.5
--------------------------------------------------------------------------------
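
Usage sketch (not a file from the repository): a minimal example of calling collect_product_info from functions.py for a single product page, assuming an already-started undetected_chromedriver session. The product URL below is a hypothetical placeholder, not taken from the source.

import undetected_chromedriver as uc
from functions import collect_product_info

driver = uc.Chrome()
driver.implicitly_wait(5)
driver.get('https://ozon.ru')  # keep the first tab open: collect_product_info switches back to it

# hypothetical product URL, used only for illustration
product_url = 'https://www.ozon.ru/product/example-product-123456/'
data = collect_product_info(driver=driver, url=product_url)
print(data)

driver.quit()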