├── .gitignore
├── README.md
├── avito_parsing.py
├── cian_parsing.py
├── database.py
├── irr_parsing.py
├── kvadrat64_parsing.py
├── main.py
├── requirements.txt
├── ya_realty_parsing.py
└── youla_parsing.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | .idea/
107 | *.csv
108 | chromedriver.exe
109 | phone.gif
110 | phone_number.png
111 | logs.txt
112 | # breakpoints/
113 | total_data.txt
114 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # real_estate_parsing
--------------------------------------------------------------------------------
/avito_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | import datetime
8 | from fake_useragent import UserAgent
9 | from selenium import webdriver
10 | from selenium.webdriver.chrome.options import Options
11 | from xvfbwrapper import Xvfb
12 | from PIL import Image
13 | from pytesseract import image_to_string
14 | import sys
15 | from database import DataBase
16 |
# Breakpoint records ("title--price" pairs) mark where the previous run
# stopped; crawling halts when one of them is met again (see crawl_page).
with open("breakpoints/avito.txt", "r", encoding="utf8") as file:
    breakpoints = file.readlines()


def _read_breakpoint(index):
    """Return the (title, price) tuple stored on breakpoint line *index*, or None if that line is absent."""
    try:
        return tuple(breakpoints[index].strip().split("--"))
    except IndexError:  # fewer saved breakpoint lines than categories
        return None


# one breakpoint per category, in the order the categories are crawled
break_apartment = _read_breakpoint(0)
break_cottage = _read_breakpoint(1)
break_land = _read_breakpoint(2)
break_commercial = _read_breakpoint(3)


# defining chrome options for selenium
options = Options()
options.add_argument('--no-sandbox')

db = DataBase()
visited_urls = []  # offer URLs already processed within the current category
44 |
45 |
def get_html(url):
    """Fetch *url* with a random desktop Chrome User-Agent and return the body as bytes."""
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    # req.encoding is None when the server omits the charset header;
    # fall back to utf-8 instead of letting str.encode(None) raise TypeError
    return req.text.encode(req.encoding or "utf-8")
49 |
50 |
def get_total_pages(html):
    """Extract the result-page count from a category listing page.

    Terminates the process when the pagination block cannot be parsed,
    since crawling cannot proceed without it.
    """
    soup = BeautifulSoup(html, "lxml")
    try:
        pagination = soup.find("div", class_="pagination-pages clearfix")
        last_href = pagination.find_all("a", class_="pagination-page")[-1].get("href")
        return int(last_href.split("=")[1].split("&")[0])
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_total_pages\n")
        sys.exit(0)
61 |
62 |
def get_title(soup):
    """Return the offer title text, or "Не указано" when it cannot be found."""
    try:
        return soup.find("span", class_="title-info-title-text").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_title\n")
        return "Не указано"
71 |
72 |
def _looks_like_street(part):
    """True when an address fragment looks like a street/lane/avenue name."""
    low = part.lower()
    return any(marker in low for marker in
               ("ул ", "ул.", "улица", " пер", "проспект", "проезд"))


def get_address(soup):
    """Split the offer address into (city, district, street, block_number).

    Components that cannot be recognized are reported as "Не указано".
    """
    try:
        address = "{}, {}".format(soup.find("meta", itemprop="addressLocality").get("content").strip(),
                                  soup.find("span", itemprop="streetAddress").text.strip())
        # separating data from the address string
        district, street = "Не указано", "Не указано"
        city = address.split(",")[0]
        block_number = address.split(",")[-1].strip()
        # the last component is usually the house number, but some offers end
        # with the street name instead
        if _looks_like_street(block_number):
            street = block_number
            block_number = "Не указано"

        for param in address.split(",")[1:-1]:
            if _looks_like_street(param):
                street = param.strip()
            elif "район" in param.lower() or "р-н" in param.lower():
                district = param.strip()

        # "Улица Название 12" — a trailing number is the house number
        if street.split()[-1].strip().isdigit():
            block_number = street.split()[-1].strip()
            street = " ".join(street.split()[:-1]).strip()

        return city, district, street, block_number
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_address\n")
        return ["Не указано"] * 4
102 |
103 |
def get_selling_info(soup):
    """Classify the offer as rent/sale and normalize the price string.

    Returns (sell_type, price, rent_info); all three are "Не указано" on failure.
    """
    try:
        per_meter = False  # True when the rent price is quoted per square meter
        price = soup.find("span", class_="price-value-string js-price-value-string").text.strip()
        if "за сутки" in price:
            sell_type = "Аренда"
            rent_info = "посуточно"
        elif "в месяц" in price:
            sell_type = "Аренда"
            rent_info = "длительный срок"
            # NOTE(review): assumes "за " inside a monthly price label means
            # per-square-meter pricing — confirm against live markup
            if "за " in price:
                per_meter = True
        else:
            sell_type = "Продажа"
            rent_info = "Не аренда"
        # re-read the bare numeric price; the suffix is rebuilt by hand below
        # because writing the raw label hit an encoding error
        price = soup.find("span", class_="js-item-price").text.strip()
        if rent_info == "посуточно":
            price = "от " + price + " за сутки"
        elif rent_info == "длительный срок":
            if per_meter:
                price = price + " в месяц за м2"
            else:
                price = price + " в месяц"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_selling info\n")
        sell_type, price, rent_info = ["Не указано"] * 3
    return sell_type, price, rent_info
133 |
134 |
def get_deposit(soup):
    """Return the deposit line shown under the price (rent offers only)."""
    try:
        return soup.find("div", class_="item-price-sub-price").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_deposit\n")
        return "Не указано"
143 |
144 |
def get_seller_type(soup):
    """Guess whether the offer comes from an agency or the owner.

    The agency info block is only rendered for intermediaries, so its mere
    presence decides the answer.
    """
    try:
        badge = soup.find("div", class_="seller-info-prop seller-info-prop_short_margin")
        return "Посредник" if badge is not None else "Собственник"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_seller_type\n")
        return "Не указано"
157 |
158 |
def get_seller_name(soup):
    """Return the seller's display name, or "Не указано" when missing."""
    try:
        return soup.find("div", class_="seller-info-name").find("a").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_seller_name\n")
        return "Не указано"
167 |
168 |
def get_photos(soup):
    """Collect gallery photo URLs as a newline-joined string.

    Falls back to the single cover image when there is no gallery, and to
    "Не указано" when that is missing too.
    """
    try:
        gallery = soup.find("ul", class_="gallery-list js-gallery-list")
        items = gallery.find_all("li", class_="gallery-list-item js-gallery-list-item")
        # each item's image URL is buried in an inline style attribute
        links = [item.find("span").get("style").split(":")[1].strip()[4:-2] for item in items]
        return "\n".join(links)
    except:
        try:
            return soup.find("span", class_="gallery-img-cover").get("style").split(":")[1].strip()[4:-2]
        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " avito get_photos\n")
            return "Не указано"
186 |
187 |
def get_description(soup):
    """Return the free-text description paragraph, or "Не указано"."""
    try:
        return soup.find("div", class_="item-description-text").find("p").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_description\n")
        return "Не указано"
196 |
197 |
def get_date(soup):
    """Resolve the publication date to ISO "YYYY-MM-DD".

    Only today's and yesterday's offers matter; anything older is returned
    as "too old" so callers can stop crawling.
    """
    try:
        raw = soup.find("div", class_="title-info-metadata-item").text.split(",")[1].strip()
        today = datetime.datetime.today()
        if "сегодня" in raw:
            return str(today).split()[0]
        if "вчера" in raw:
            return str(today - datetime.timedelta(days=1)).split()[0]
        return "too old"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_date\n")
        return "Не указано"
212 |
213 |
def get_seller_phone(url):
    """OCR the seller's phone number (avito renders it as an image).

    Opens the page in Chrome under Xvfb, clicks the "show phone" button,
    screenshots the revealed number and runs pytesseract on the cropped
    region. Returns "Не указано" on any failure.
    """
    vdisplay = Xvfb()
    vdisplay.start()
    phone_text = "Не указано"
    try:
        driver = webdriver.Chrome(options=options)
        try:
            driver.set_window_size(1920, 1080)
            driver.get(url)

            button = driver.find_element_by_xpath('//a[@class="button item-phone-button js-item-phone-button '
                                                  'button-origin button-origin-blue button-origin_full-width '
                                                  'button-origin_large-extra item-phone-button_hide-phone '
                                                  'item-phone-button_card js-item-phone-button_card"]')
            button.click()
            time.sleep(2)
            driver.save_screenshot("phone_number.png")

            image = driver.find_element_by_xpath('//div[@class="item-phone-big-number js-item-phone-big-number"]//*')

            # crop the full-page screenshot down to the phone-number image
            cropped = Image.open("phone_number.png")
            x, y = image.location["x"], image.location["y"]
            width, height = image.size["width"], image.size["height"]
            cropped.crop((x, y, x + width, y + height)).save("phone.gif")

            phone_text = image_to_string(Image.open("phone.gif"))
        finally:
            # quit() must run even on failure so chromedriver doesn't leak
            driver.quit()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_seller_phone\n")
    finally:
        vdisplay.stop()

    return phone_text
249 |
250 |
def get_apartment_params(soup):
    """Read apartment attributes from the offer's parameter list.

    Returns (rooms_number, floor_number, total_floors, material, total_area,
    kitchen_area, living_area, block_type). Unknown fields remain
    "Не указано"; block_type defaults to "Вторичка".
    """
    rooms_number = floor_number = total_floors = material = "Не указано"
    total_area = kitchen_area = living_area = "Не указано"
    block_type = "Вторичка"
    try:
        for item in soup.find_all("li", class_="item-params-list-item"):
            info = item.text.strip()
            # "Этажей в доме" must be tested before "Этаж" (substring overlap)
            if "Количество комнат" in info:
                rooms_number = info.split(":")[1].strip()
            elif "Этажей в доме" in info:
                total_floors = info.split(":")[1].strip()
            elif "Этаж" in info:
                floor_number = info.split(":")[1].strip()
            elif "Тип дома" in info:
                material = info.split(":")[1].strip()
            elif "Общая площадь" in info:
                total_area = info.split(":")[1].split("м²")[0].strip()
            elif "Площадь кухни" in info:
                kitchen_area = info.split(":")[1].split("м²")[0].strip()
            elif "Жилая площадь" in info:
                living_area = info.split(":")[1].split("м²")[0].strip()
            elif "Официальный застройщик" in info or "Название объекта недвижимости" in info:
                # these rows only appear on new-building offers
                block_type = "Новостройка"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_apartment_params\n")
    return rooms_number, floor_number, total_floors, material, total_area, kitchen_area, living_area, block_type
278 |
279 |
def get_cottage_params(soup):
    """Read house/cottage attributes from the offer's parameter list.

    Returns (house_type, total_floors, distance, material, total_area,
    land_area); unknown fields remain "Не указано".
    """
    house_type = total_floors = distance = material = "Не указано"
    total_area = land_area = "Не указано"
    try:
        for item in soup.find_all("li", class_="item-params-list-item"):
            info = item.text.strip()
            if "Вид объекта" in info:
                house_type = info.split(":")[1].strip()
            elif "Этажей в доме" in info:
                total_floors = info.split(":")[1].strip()
            elif "Расстояние до города" in info:
                distance = info.split(":")[1].split("км")[0].strip() + " км"
            elif "Материал стен" in info:
                material = info.split(":")[1].strip()
            elif "Площадь дома" in info:
                total_area = info.split(":")[1].split("м²")[0].strip()
            elif "Площадь участка" in info:
                land_area = info.split(":")[1].split("сот")[0].strip() + " сот"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_cottage_params\n")
    return house_type, total_floors, distance, material, total_area, land_area
302 |
303 |
def get_land_params(soup):
    """Read distance-to-city and plot area for a land offer.

    Returns (distance, area); both default to "Не указано". The units
    ("км", "сот") are stripped and re-appended to normalize formatting.
    """
    distance, area = "Не указано", "Не указано"
    try:
        labels = soup.find_all("span", class_="item-params-label")
        params = soup.find("div", class_="item-params").find_all("span")
        # spans appear to alternate label/value, hence the i * 2 stride
        # NOTE(review): assumes exactly two spans per labelled row — confirm markup
        for i in range(len(labels)):
            info = params[i * 2].text.strip()
            label = labels[i].text.strip()
            if "Расстояние до города" in label:
                distance = info.split(":")[1].split("км")[0].strip() + " км"
            elif "Площадь" in label:
                area = info.split(":")[1].split("сот")[0].strip() + " сот"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_land_params\n")
    return distance, area
320 |
321 |
def get_commercial_params(soup):
    """Read office class and floor area for a commercial offer.

    Returns (office_class, area); both default to "Не указано".
    """
    office_class, area = "Не указано", "Не указано"
    try:
        labels = soup.find_all("span", class_="item-params-label")
        params = soup.find("div", class_="item-params").find_all("span")
        # spans appear to alternate label/value, hence the i * 2 stride
        # NOTE(review): assumes exactly two spans per labelled row — confirm markup
        for i in range(len(labels)):
            info = params[i * 2].text.strip()
            label = labels[i].text.strip()
            if "Площадь" in label:
                area = info.split(":")[1].split("м²")[0].strip()
            elif "Класс здания" in label:
                office_class = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_commercial_params\n")
    return office_class, area
338 |
339 |
def get_apartment_data(url, html):
    """Assemble the DB row for an apartment offer.

    Returns None for "looking for"/"buying" requests (titles containing
    "сниму"/"куплю"), which are not real listings.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" in title.lower() or "куплю" in title.lower():
        return None

    city, district, street, block_number = get_address(soup)
    sell_type, price, rent_info = get_selling_info(soup)
    (rooms_number, floor_number, total_floors, material,
     total_area, kitchen_area, living_area, block_type) = get_apartment_params(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(url)
    date = get_date(soup)
    selling_detail = "Не указано"  # avito does not publish this field

    return [city, district, street, block_number, sell_type, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, date, phone, kitchen_area, living_area, floor_number]
361 |
362 |
def get_cottage_data(url, html):
    """Assemble the DB row for a house/cottage offer.

    Returns None for "сниму"/"куплю" request titles.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" in title.lower() or "куплю" in title.lower():
        return None

    city, district, street, block_number = get_address(soup)
    sell_type, price, rent_info = get_selling_info(soup)
    house_type, total_floors, distance, material, total_area, land_area = get_cottage_params(soup)
    seller_name = get_seller_name(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(url)
    date = get_date(soup)
    # avito does not publish these fields
    selling_detail, comforts, land_status = ["Не указано"] * 3

    return [city, district, street, block_number, sell_type, rent_info, price, house_type,
            total_area, comforts, selling_detail, images, description, date, phone, material,
            total_floors, land_area, land_status, seller_name]
383 |
384 |
def get_land_data(url, html):
    """Assemble the DB row for a land-plot offer.

    Returns None for "сниму"/"куплю" request titles.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" in title.lower() or "куплю" in title.lower():
        return None

    # the land category is written in parentheses inside the title
    land_type = title[title.find("(") + 1:].split(")")[0] if "(" in title else "Не указано"

    city, district, street, _ = get_address(soup)
    sell_type, price, _ = get_selling_info(soup)
    deposit = get_deposit(soup) if "Аренда" in sell_type else "Не аренда"

    distance, area = get_land_params(soup)
    seller_type = get_seller_type(soup)
    seller_name = get_seller_name(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(url)
    date = get_date(soup)

    return [city, district, street, sell_type, deposit, land_type, distance, area, price, seller_type, images,
            description, seller_name, phone, date]
415 |
416 |
# keyword → commercial object type, checked in order against the lowercased title
_COMMERCIAL_TYPES = (
    ("офис", "Офисное помещение"),
    ("торг", "Торговое помещение"),
    ("гостиница", "Гостиница"),
    ("свобод", "Помещение свободного назначения"),
    ("производ", "Производственное помещение"),
    ("склад", "Складское помещение"),
)


def get_commercial_data(url, html):
    """Assemble the DB row for a commercial-property offer.

    Returns None for "сниму"/"куплю" request titles. The object kind is
    deduced from title keywords (see _COMMERCIAL_TYPES).
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" in title.lower() or "куплю" in title.lower():
        return None

    object_type = "Не указано"
    for keyword, kind in _COMMERCIAL_TYPES:
        if keyword in title.lower():
            object_type = kind
            break

    city, district, street, block_number = get_address(soup)
    sell_type, price, _ = get_selling_info(soup)

    # office_class is only meaningful for offices
    if object_type == "Офисное помещение":
        office_class, area = get_commercial_params(soup)
    else:
        _, area = get_commercial_params(soup)
        office_class = "Не офис"

    seller_name = get_seller_name(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(url)
    date = get_date(soup)
    furniture, entrance = "Не указано", "Не указано"  # not published on avito

    return [city, district, street, block_number, sell_type, price, object_type, office_class,
            furniture, entrance, area, date, phone, images, description, seller_name]
464 |
465 |
def crawl_page(first_offer, html, category):
    """Parse one listing page; return True when crawling should stop.

    Stop conditions: empty listing, an offer dated "2 дня назад" or older,
    or hitting the breakpoint record saved by the previous run.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    try:
        offers = soup.find("div", class_="catalog-list").find_all("div", class_="item_table")
    except Exception:
        offers = []
    if not offers:
        print("Парсинг завершен avito")
        return True
    for offer in offers:
        try:
            if first_offer:
                # save the newest record as the next run's exit point;
                # apartments run first and truncate the file, the rest append
                modifier = "w" if category == "Квартиры" else "a"
                with open("breakpoints/avito.txt", modifier, encoding="utf8") as file:
                    file.write("%s--%s\n" % (offer.find("a", class_="item-description-title-link").get("title"),
                                             offer.find("span", {"class": "price", "itemprop": "price"}).get("content")))
                first_offer = False

            if offer.find("div", class_="js-item-date c-2").text.strip() == "2 дня назад":
                print("Парсинг завершен avito")
                return True

            key_info = (offer.find("a", class_="item-description-title-link").get("title"),
                        offer.find("span", {"class": "price", "itemprop": "price"}).get("content"))
            if any(x == key_info for x in [break_apartment, break_cottage, break_land, break_commercial]):
                print("Парсинг завершен avito")
                return True

            url = "https://avito.ru" + offer.find("div", class_="description").find("h3").find("a").get("href")
            if url in visited_urls:
                print("avito not unique")
                time.sleep(random.uniform(5, 8))
                continue
            visited_urls.append(url)

            # data stays None for request-type offers ("сниму"/"куплю");
            # guarding the total_data.txt writes fixes the former crash on None
            data = None
            if category == "Квартиры":
                data = get_apartment_data(url, get_html(url))
                if data is not None:
                    # key info is recorded so duplicates can be found later
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(url, get_html(url))
                if data is not None:
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Участки":
                data = get_land_data(url, get_html(url))
                if data is not None:
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s\n" % (data[2], data[5], data[7], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(url, get_html(url))
                if data is not None:
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            # fixed: the None check must run before indexing data[0]
            if data is not None and data[0] != "Не указано":
                try:
                    db.insert_data(category, data)
                except Exception:
                    # the connection may have gone stale — reopen and retry once
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page avito")

        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " avito crawl_page\n")

        time.sleep(random.uniform(5, 8))
540 |
541 |
def parse(category_url, base_url, category_name):
    """Walk every result page of one category until crawl_page signals a stop."""
    total_pages = get_total_pages(get_html(category_url))

    for page in range(1, total_pages + 1):
        # same sort/filter parameters as the seed URL, only the page varies
        page_url = base_url + "p=" + str(page) + "&s=104&s_trg=3&bt=1"
        # only the very first page records a new breakpoint
        done = crawl_page(page == 1, get_html(page_url), category_name)
        if done:
            break
556 |
557 |
def main():
    """Crawl all four avito categories, resetting URL de-duplication between them."""
    global visited_urls

    categories = (
        ("https://www.avito.ru/saratovskaya_oblast/kvartiry?p=1&s=104&s_trg=3&bt=1",
         "https://www.avito.ru/saratovskaya_oblast/kvartiry?", "Квартиры"),
        ("https://www.avito.ru/saratovskaya_oblast/doma_dachi_kottedzhi?s=104&s_trg=3&bt=1",
         "https://www.avito.ru/saratovskaya_oblast/doma_dachi_kottedzhi?", "Дома"),
        ("https://www.avito.ru/saratovskaya_oblast/zemelnye_uchastki?s=104&s_trg=3&bt=1",
         "https://www.avito.ru/saratovskaya_oblast/zemelnye_uchastki?", "Участки"),
        ("https://www.avito.ru/saratovskaya_oblast/kommercheskaya_nedvizhimost?s=104&s_trg=3&bt=1",
         "https://www.avito.ru/saratovskaya_oblast/kommercheskaya_nedvizhimost?", "Коммерческая_недвижимость"),
    )
    for start_url, base_url, name in categories:
        visited_urls = []
        parse(start_url, base_url, name)
578 |
579 |
if __name__ == "__main__":
    main()
    # close the shared DB connection opened at import time
    db.close()
583 |
--------------------------------------------------------------------------------
/cian_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | from fake_useragent import UserAgent
8 | import datetime
9 | from selenium import webdriver
10 | from xvfbwrapper import Xvfb
11 | from selenium.webdriver.chrome.options import Options
12 | from database import DataBase
13 |
14 |
db = DataBase()
visited_urls = []  # offer URLs already processed in this run

# defining chrome options for selenium
options = Options()
options.add_argument("--no-sandbox")
21 |
22 |
def get_html(url):
    """Fetch *url* with a random desktop Chrome User-Agent and return the body as bytes."""
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    # req.encoding is None when the server omits the charset header;
    # fall back to utf-8 instead of letting str.encode(None) raise TypeError
    return req.text.encode(req.encoding or "utf-8")
26 |
27 |
def get_title(soup):
    """Return the offer title, or "Не указано" when the page has no <h1>."""
    try:
        title = soup.find("h1").text.strip()
    except Exception as e:
        # log to logs.txt like every other getter in this module,
        # instead of silently dropping the exception
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_title\n")
        title = "Не указано"
    return title
35 |
36 |
def get_address(soup):
    """Split the offer address into (city, district, street, block_number).

    Components that cannot be recognized are reported as "Не указано".
    """
    try:
        address = soup.find("address").text.strip()
        if "На карте" in address:
            # drop the trailing "На карте" map-link caption
            address = address[:address.rfind("На карте")]
        # separating data from the address string
        district, street = "Не указано", "Не указано"
        # cian addresses start with the region, so the city is component #1
        city = address.split(",")[1].strip()
        block_number = address.split(",")[-1].strip()
        if "ул " in block_number.lower() or "ул." in block_number.lower() or "улица" in block_number.lower() \
                or " пер" in block_number.lower() or "проезд" in block_number.lower() or "проспект" in block_number.lower():
            street = block_number
            block_number = "Не указано"

        for param in address.split(",")[1:-1]:
            if "ул " in param.lower() or "ул." in param.lower() or "улица" in param.lower() or " пер" in param.lower() \
                    or "проезд" in param.lower() or "проспект" in param.lower():
                street = param.strip()
            elif "район" in param.lower() or "р-н" in param.lower():
                district = param.strip()

        if street.split()[-1].strip().isdigit():
            block_number = street.split()[-1].strip()
            street = " ".join(street.split()[:-1]).strip()

        return city, district, street, block_number
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            # fixed log tag: this entry was mislabeled "cian get_title"
            file.write(str(e) + " cian get_address\n")
        return ["Не указано"] * 4
67 |
68 |
def get_price(soup):
    """Return the price text; range-priced rentals become "от X до Y/мес."."""
    try:
        tag = soup.find("span", {"itemprop": "price"})
        if tag is None:
            # no fixed price — build a low/high range string instead
            low = soup.find("span", {"itemprop": "lowPrice"}).text.strip()
            high = soup.find("span", {"itemprop": "highPrice"}).text.strip()
            return "от " + low + " до " + high + "/мес."
        return tag.text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_price\n")
        return "Не указано"
82 |
83 |
def get_selling_type(soup):
    """Return the first paragraph whose single CSS class starts with "description--"."""
    try:
        for p in soup.find_all("p"):
            classes = p.get("class")
            if classes is not None and len(classes) == 1 and "description--" in classes[0]:
                return p.text.strip()
        return "Не указано"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_selling_type\n")
        return "Не указано"
97 |
98 |
def get_seller_type(soup):
    """Classify the seller as owner/intermediary from the "honest-container" badge."""
    try:
        badges = [d for d in soup.find_all("div") if d.get("class") is not None
                  and len(d.get("class")) == 1 and "honest-container" in d.get("class")[0]]
        if not badges:
            return "Не указано"
        text = badges[0].text.strip()
        if text is not None and text.lower() == "собственник":
            return "Собственник"
        return "Посредник"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_seller_type\n")
        return "Не указано"
116 |
117 |
def get_seller_name(soup):
    """Return the seller's display name (first <h2> with a "title--*" class).

    Returns "Не указано" when no such heading exists.
    """
    try:
        headings = [x for x in soup.find_all("h2") if x.get("class") is not None and len(x.get("class")) == 1
                    and "title--" in x.get("class")[0]]
        # fixed: an empty match list used to be returned as-is ([]),
        # leaking a list where callers expect a string
        name = headings[0].text.strip() if headings else "Не указано"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_seller_name\n")
        name = "Не указано"
    return name
129 |
130 |
def get_photos(url):
    """Collect full-size photo URLs via selenium.

    Thumbnail URLs end in "-2."; replacing that with "-1." yields the
    original-size image. Returns a newline-joined string or "Не указано".
    """
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.get(url)

        thumbs = driver.find_elements_by_class_name("fotorama__img")
        links = [t.get_attribute("src").replace("-2.", "-1.")
                 for t in thumbs if "-2." in t.get_attribute("src")]
        images = "\n".join(links)
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_photos\n")
        images = "Не указано"
    finally:
        # fixed: the driver was never quit, leaking a chromedriver process per call
        if driver is not None:
            driver.quit()
    return images
148 |
149 |
def get_description(soup):
    """Return the long description paragraph (single CSS class "description-text--*")."""
    try:
        matches = [p for p in soup.find_all("p") if p.get("class") is not None
                   and len(p.get("class")) == 1 and "description-text--" in p.get("class")[0]]
        description = matches[0].text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_description\n")
        description = "Не указано"
    return description
160 |
161 |
def get_date(soup):
    """Resolve the publication date to ISO "YYYY-MM-DD"; older offers become "too old"."""
    try:
        raw = soup.find("div", id="frontend-offer-card").find("main").find_all("div")[4].text.strip()
        today = datetime.datetime.today()
        if "вчера" in raw:
            return str(today - datetime.timedelta(days=1)).split()[0]
        if "сегодня" in raw:
            return str(today).split()[0]
        return "too old"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_date\n")
        return "Не указано"
176 |
177 |
def driver_get_phone_and_images(url):
    """Open the offer in Chrome under Xvfb and return (images, phone).

    Thumbnail URLs end in "-2.", originals in "-1.". The phone number is
    revealed by clicking the "Показать телефон" button. Either value falls
    back to "Не указано" independently.
    """
    vdisplay = Xvfb()
    vdisplay.start()
    try:
        driver = webdriver.Chrome(options=options)
        try:
            driver.set_window_size(1920, 1080)
            driver.get(url)

            try:
                thumbs = driver.find_elements_by_class_name("fotorama__img")
                links = [t.get_attribute("src").replace("-2.", "-1.")
                         for t in thumbs if "-2." in t.get_attribute("src")]
                images = "\n".join(links)
                if not images:
                    # no gallery — fall back to the cover image
                    images = driver.find_element_by_class_name("fotorama__img").get_attribute("src")
            except Exception as e:
                with open("logs.txt", "a", encoding="utf8") as file:
                    file.write(str(e) + " cian get_images\n")
                images = "Не указано"

            try:
                button = [x for x in driver.find_elements_by_tag_name("button")
                          if x.text.strip() == "Показать телефон"][-1]
                button.click()
                phone = "\n".join([x.text.strip() for x in driver.find_elements_by_tag_name("a")
                                   if x.get_attribute("class") is not None
                                   and "phone--" in x.get_attribute("class")])
            except Exception as e:
                phone = "Не указано"
                with open("logs.txt", "a", encoding="utf8") as file:
                    file.write(str(e) + " cian get_phone\n")
        finally:
            # fixed: quit the driver even when setup or page load raises,
            # so chromedriver processes don't leak
            driver.quit()
    finally:
        vdisplay.stop()
    return images, phone
213 |
214 |
def get_apartment_params(soup):
    """Extract apartment attributes from a cian offer page; every field
    defaults to "Не указано" when absent."""
    def texts(tag, marker):
        # texts of elements whose single CSS class contains the marker prefix
        return [el.text.strip() for el in soup.find_all(tag)
                if el.get("class") is not None and len(el.get("class")) == 1
                and marker in el.get("class")[0]]

    (block_type, rooms_number, total_floors, total_area, material,
     year, kitchen_area, living_area, floor) = ["Не указано"] * 9
    try:
        main_params = texts("div", "info-title--")
        main_values = texts("div", "info-text--")
        for i, name in enumerate(main_params):
            if "Общая" in name:
                total_area = main_values[i]
            elif "Построен" in name:
                year = main_values[i]
            elif "Кухня" in name:
                kitchen_area = main_values[i]
            elif "Жилая" in name:
                living_area = main_values[i]

        desc_params = texts("span", "name--")
        desc_values = texts("span", "value--")
        for i, name in enumerate(desc_params):
            # NOTE(review): "Этаж" also substring-matches "Этажей в доме",
            # so that label lands in `floor` and the total_floors branch
            # below can never fire -- kept as-is to preserve behavior
            if "Тип жилья" in name:
                block_type = desc_values[i]
            elif "Количество комнат" in name:
                rooms_number = desc_values[i]
            elif "Этаж" in name:
                floor = desc_values[i]
            elif "Этажей в доме" in name:
                total_floors = desc_values[i]
            elif "Тип дома" in name:
                material = desc_values[i]

        if year == "Не указано":
            # the build year sometimes sits in the building-info section
            building_params = texts("div", "name--")
            building_values = texts("div", "value--")
            for i, name in enumerate(building_params):
                if "Год постройки" in name:
                    year = building_values[i]
                    break
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_apartment_params\n")
    return block_type, rooms_number, total_floors, total_area, material, year, kitchen_area, living_area, floor
261 |
262 |
def get_cottage_params(soup):
    """Extract house/land attributes from a cian offer page.

    Returns (total_area, material, land_area, status, comforts,
    total_floors); each field defaults to "Не указано" when absent.
    """
    total_area, material, land_area, status, comforts, total_floors = ["Не указано"] * 6
    try:
        main_params = [x.text.strip() for x in soup.find_all("div") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "info-title--" in x.get("class")[0]]
        main_values = [x.text.strip() for x in soup.find_all("div") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "info-text--" in x.get("class")[0]]
        for i in range(len(main_params)):
            if "Общая" in main_params[i]:
                total_area = main_values[i]
            elif "Участок" in main_params[i]:
                land_area = main_values[i]
            elif "Тип дома" in main_params[i]:
                material = main_values[i]
            elif "Этажей в доме" in main_params[i]:
                total_floors = main_values[i]

        comforts_list = [x.text.strip() for x in soup.find_all("li") if x.get("class") is not None
                         and len(x.get("class")) == 2 and "item--" in x.get("class")[0]]
        # bug fix: was `if comforts:`, which is always true ("Не указано"
        # is a non-empty string), so a page without comfort items
        # overwrote the default with an empty string
        if comforts_list:
            comforts = "; ".join(comforts_list)

        desc_params = [x.text.strip() for x in soup.find_all("span") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "name--" in x.get("class")[0]]
        desc_values = [x.text.strip() for x in soup.find_all("span") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "value--" in x.get("class")[0]]
        for i in range(len(desc_params)):
            if "Статус участка" in desc_params[i]:
                status = desc_values[i]
            elif land_area == "Не указано" and "Площадь участка" in desc_params[i]:
                land_area = desc_values[i]
            elif material == "Не указано" and "Тип дома" in desc_params[i]:
                material = desc_values[i]
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_cottage_params\n")
    return total_area, material, land_area, status, comforts, total_floors
300 |
301 |
def get_commercial_params(soup):
    """Extract commercial-property attributes from a cian offer page;
    every field defaults to "Не указано" when absent."""
    def texts(tag, marker):
        # texts of elements whose single CSS class contains the marker prefix
        return [el.text.strip() for el in soup.find_all(tag)
                if el.get("class") is not None and len(el.get("class")) == 1
                and marker in el.get("class")[0]]

    area, office_class, floor, furniture, entrance = ["Не указано"] * 5
    try:
        main_params = texts("div", "info-title--")
        main_values = texts("div", "info-text--")
        for i, name in enumerate(main_params):
            if "Класс" in name:
                office_class = main_values[i]
            elif "Этаж" in name:
                floor = main_values[i]
            elif "Площадь" in name:
                area = main_values[i]

        desc_params = texts("span", "name--")
        desc_values = texts("span", "value--")
        for i, name in enumerate(desc_params):
            if "Вход" in name:
                entrance = desc_values[i]
            elif "Мебель" in name:
                furniture = desc_values[i]
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_commercial_params\n")
    return area, office_class, floor, furniture, entrance
330 |
331 |
def get_apartment_data(html, url):
    """Collect all apartment fields for one cian offer page, in the
    column order of the Квартиры table (minus sell_type, which the
    caller inserts)."""
    soup = BeautifulSoup(html, "lxml")

    city, district, street, block_number = get_address(soup)
    price = get_price(soup)
    (block_type, rooms_number, total_floors, total_area, material,
     year, kitchen_area, living_area, floor) = get_apartment_params(soup)

    selling_detail = get_selling_type(soup)
    lowered = selling_detail.lower()
    if "продажа" in lowered or "ипотека" in lowered:
        rent_info = "Не аренда"
    else:
        # a rent offer: the "selling type" field actually holds rent terms
        rent_info, selling_detail = selling_detail, "Не указано"

    description = get_description(soup)
    date = get_date(soup)
    images, phone = driver_get_phone_and_images(url)

    return [city, district, street, block_number, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, date, phone, kitchen_area, living_area, floor]
353 |
354 |
def get_cottage_data(html, url):
    """Collect all house/cottage fields for one cian offer page, in the
    column order of the Дома table (minus sell_type)."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    city, district, street, block_number = get_address(soup)
    price = get_price(soup)
    # the object kind is the first comma-separated part of the title
    cottage_type = title.split(",")[0]
    total_area, material, land_area, status, comforts, total_floors = get_cottage_params(soup)

    selling_detail = get_selling_type(soup)
    lowered = selling_detail.lower()
    if "продажа" in lowered or "ипотека" in lowered:
        rent_info = "Не аренда"
    else:
        # a rent offer: the "selling type" field actually holds rent terms
        rent_info, selling_detail = selling_detail, "Не указано"

    description = get_description(soup)
    date = get_date(soup)
    images, phone = driver_get_phone_and_images(url)
    seller_name = get_seller_name(soup)

    return [city, district, street, block_number, rent_info, price, cottage_type,
            total_area, comforts, selling_detail, images, description, date, phone, material,
            total_floors, land_area, status, seller_name]
377 |
378 |
def get_commercial_data(html, url):
    """Collect all commercial-property fields for one cian offer page,
    in the column order of the Коммерческая_недвижимость table."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    city, district, street, block_number = get_address(soup)
    price = get_price(soup)

    # classify the premises by title keywords; first match wins
    lowered = title.lower()
    keyword_types = (
        ("офис", "Офисное помещение"),
        ("торговая площадь", "Торговая площадь"),
        ("склад", "Склад"),
        ("своб. назнач.", "Свободного назначения"),
        ("свободное назначение", "Свободного назначения"),
        ("гараж", "Гараж"),
        ("автосервис", "Автосервис"),
        ("производство", "Производство"),
        ("готовый бизнес", "Готовый бизнес"),
    )
    object_type = "Не указано"
    for keyword, label in keyword_types:
        if keyword in lowered:
            object_type = label
            break

    area, office_class, floor, furniture, entrance = get_commercial_params(soup)
    if object_type != "Офисное помещение":
        office_class = "Не офис"
    description = get_description(soup)
    date = get_date(soup)
    images, phone = driver_get_phone_and_images(url)
    seller_name = get_seller_name(soup)

    return [city, district, street, block_number, price, object_type, office_class,
            furniture, entrance, area, date, phone, images, description, seller_name]
415 |
416 |
def crawl_page(page, html, category, sell_type):
    """Parse one search-result page and store each fresh offer in the DB.

    Returns True when the crawl should stop (pagination wrapped back to
    page 1, no offers on the page, or a too-old offer was reached),
    otherwise None.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    # cian redirects past-the-end pages back to page 1: if we are beyond
    # page 1 but the active paginator item reads "1", we are done.
    # (bug fix: guard against <li> tags without a class attribute, which
    # previously made len(None) raise TypeError outside any try block)
    if page != 1 and "".join([x.text.strip() for x in soup.find_all("li")
                              if x.get("class") is not None and len(x.get("class")) == 2
                              and "list-item--active" in "".join(x.get("class"))]) == "1":
        print("Парсинг завершен cian")
        return True
    # pagination is dynamic, so the page count is unknown up front;
    # instead check whether the page still contains offers
    try:
        offers = [x for x in soup.find("div", id="frontend-serp").find("div").find_all("div")
                  if x.get("class") is not None and "offer-container" in x.get("class")[0]]
    except Exception:
        offers = []
    if offers is None or not offers:
        print("Парсинг завершен cian")
        return True
    for offer in offers:
        try:
            url = offer.find("a").get("href")
            if url in visited_urls:
                print("cian not unique")
                time.sleep(random.uniform(5, 8))
                continue
            else:
                visited_urls.append(url)

            data = []
            if category == "Квартиры":
                data = get_apartment_data(get_html(url), url)
                # record the key fields so duplicates can be found later
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            # position of the offer date within the data list
            index_of_date = -1
            if category == "Квартиры" or category == "Коммерческая_недвижимость":
                index_of_date = -5
            elif category == "Дома":
                index_of_date = -7
            elif category == "Участки":
                index_of_date = -1
            if data[index_of_date] == "too old":
                print("Парсинг завершен cian")
                return True

            data.insert(4, sell_type)
            if data[0] != "Не указано":
                try:
                    db.insert_data(category, data)
                except Exception:
                    # the connection may have timed out -- reconnect once
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page cian")

        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " cian crawl_page\n")

        time.sleep(random.uniform(5, 8))
489 |
490 |
def parse(category_url, category_name, sell_type):
    """Walk the result pages of one category until crawl_page signals
    completion by returning a truthy value."""
    page = 1
    done = False
    while not done:
        # swap the page number after the last '=' in the category url
        paged_url = category_url[:category_url.rfind("=") + 1] + str(page)
        done = crawl_page(page, get_html(paged_url), category_name, sell_type)
        page += 1
498 |
499 |
def main():
    """Crawl every cian category/deal-type combination for the Saratov region.

    bug fix: the original URLs contained "®ion=4609" -- an HTML-entity
    mojibake of "&region=4609" ("&reg" rendered as the (R) sign) -- which
    produced invalid query strings; repaired below.
    """
    global visited_urls
    jobs = (
        ("https://saratov.cian.ru/cat.php?deal_type=sale&engine_version=2&object_type%5B0%5D=1&offer_type=suburban&region=4609&totime=86400&page=1",
         "Дома", "Продажа"),
        ("https://saratov.cian.ru/cat.php?deal_type=rent&engine_version=2&object_type%5B0%5D=1&offer_type=suburban&region=4609&totime=86400&page=1",
         "Дома", "Аренда"),
        ("https://saratov.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=offices&office_type%5B0%5D=1&office_type%5B10%5D=12&office_type%5B1%5D=2&office_type%5B2%5D=3&office_type%5B3%5D=4&office_type%5B4%5D=5&office_type%5B5%5D=6&office_type%5B6%5D=7&office_type%5B7%5D=9&office_type%5B8%5D=10&office_type%5B9%5D=11&region=4609&totime=86400&page=1",
         "Коммерческая_недвижимость", "Продажа"),
        ("https://saratov.cian.ru/cat.php?deal_type=rent&engine_version=2&offer_type=offices&office_type%5B0%5D=1&office_type%5B10%5D=12&office_type%5B1%5D=2&office_type%5B2%5D=3&office_type%5B3%5D=4&office_type%5B4%5D=5&office_type%5B5%5D=6&office_type%5B6%5D=7&office_type%5B7%5D=9&office_type%5B8%5D=10&office_type%5B9%5D=11&region=4609&totime=86400&page=1",
         "Коммерческая_недвижимость", "Аренда"),
        ("https://saratov.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=4609&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1&room7=1&room9=1&totime=86400&page=1",
         "Квартиры", "Продажа"),
        ("https://saratov.cian.ru/cat.php?deal_type=rent&engine_version=2&offer_type=flat&region=4609&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1&room7=1&room9=1&totime=86400&page=1",
         "Квартиры", "Аренда"),
    )
    for number, (url, category, sell_type) in enumerate(jobs):
        if number:
            # uniqueness tracking is per category/deal type; the first run
            # keeps whatever the module initialized (same as the original)
            visited_urls = []
        parse(url, category, sell_type)
524 |
525 |
# script entry point: run the full crawl, then close the module-level
# database connection opened at import time
if __name__ == "__main__":
    main()
    db.close()
529 |
--------------------------------------------------------------------------------
/database.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import mysql.connector
4 | from mysql.connector import Error
5 |
# the credentials below are placeholders; replace them with the real connection settings
7 | host = "host"
8 | database = "db"
9 | user = "user"
10 | password = "pass"
11 |
12 |
class DataBase:
    """Thin wrapper around a MySQL connection used by the parser scripts."""

    def __init__(self):
        """Connect to MySQL; on failure report to stdout and leave the
        handles as None so close() remains safe to call."""
        # bug fix: previously a failed connect left `conn`/`cursor` unset,
        # so any later attribute access raised AttributeError
        self.conn = None
        self.cursor = None
        try:
            self.conn = mysql.connector.connect(host=host, database=database, user=user, password=password)
            self.cursor = self.conn.cursor()
        except Error as error:
            print("Error while connecting to database", error)

    def close(self):
        """Close the cursor and the connection; idempotent and safe even
        after a failed connect."""
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()

    def create_table(self, category):
        """Create the table backing *category* if it does not exist yet."""
        if category == "Квартиры":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Квартиры"
                                "(Город TEXT, Район TEXT, Улица TEXT, Номер_дома TEXT, "
                                "Тип_сделки TEXT, Срок_аренды TEXT, Цена TEXT, Тип_дома TEXT, Количество_комнат TEXT, "
                                "Общая_площадь TEXT, Количество_этажей TEXT, Материал_стен TEXT, Тип_продажи TEXT, "
                                "Фото TEXT, Описание TEXT, Дата TEXT, Телефон TEXT, Площадь_кухни TEXT, Жилая_площадь TEXT, "
                                "Этаж TEXT);")

        elif category == "Дома":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Дома"
                                "(Город TEXT, Район TEXT, Улица TEXT, Номер_дома TEXT, Тип_сделки TEXT, Срок_аренды TEXT, "
                                "Цена TEXT, Тип_дома TEXT, Площадь_дома TEXT, Удобства TEXT, Тип_продажи TEXT, "
                                "Фото TEXT, Описание TEXT, Дата TEXT, Телефон TEXT, Материал_стен TEXT, "
                                "Количество_этажей TEXT, Площадь_участка TEXT, Статус_участка TEXT, Имя_продавца TEXT);")

        elif category == "Коммерческая_недвижимость":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Коммерческая_недвижимость"
                                "(Город TEXT, Район TEXT, Улица TEXT, Номер_дома TEXT, Тип_сделки TEXT, Цена TEXT, "
                                "Тип_недвижимости TEXT, Класс_здания TEXT, Мебель TEXT, Вход TEXT, Общая_площадь TEXT, "
                                "Дата TEXT, Телефон TEXT, Фото TEXT, Описание TEXT, Имя_продавца TEXT);")

        elif category == "Участки":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Участки"
                                "(Город TEXT, Район TEXT, Улица TEXT, Тип_сделки TEXT, Залог TEXT, Статус_участка TEXT, "
                                "Расстояние_до_города TEXT, Площадь_участка TEXT, Цена TEXT, Право_собственности TEXT, "
                                "Фото TEXT, Описание TEXT, Имя_продавца TEXT, Телефон TEXT, Дата TEXT);")

        elif category == "Дубликаты":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Дубликаты (Заголовок TEXT, URLs TEXT);")

    def insert_data(self, table_name, data):
        """Insert one row into *table_name* and commit.

        NOTE: the table name is string-interpolated because MySQL
        placeholders cannot bind identifiers; callers pass only the
        fixed category names above, never user input.
        """
        data_string = ', '.join(['%s'] * len(data))
        query = "INSERT INTO %s VALUES (%s);" % (table_name, data_string)
        self.cursor.execute(query, data)
        self.conn.commit()
61 |
--------------------------------------------------------------------------------
/irr_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | from fake_useragent import UserAgent
8 | import datetime
9 | import base64
10 | from database import DataBase
11 |
12 |
# breakpoints: the records at which each category's crawl should stop
with open("breakpoints/irr.txt", "r", encoding="utf8") as file:
    breakpoints = file.readlines()


def _read_breakpoint(index):
    # one "--"-separated record per line; None when the line is missing
    # (replaces six copy-pasted bare try/except blocks)
    try:
        return tuple(breakpoints[index].strip().split("--"))
    except Exception:
        return None


break_apartment_sell = _read_breakpoint(0)
break_apartment_rent = _read_breakpoint(1)
break_commercial_sell = _read_breakpoint(2)
break_commercial_rent = _read_breakpoint(3)
break_cottage_sell = _read_breakpoint(4)
break_cottage_rent = _read_breakpoint(5)

# build the stop date in "<day> <month name>" form (no leading zeros)
# NOTE(review): the original comment said "yesterday" but the offset is
# 2 days; get_date() below uses the same offset, so it looks deliberate
# -- confirm before changing
today = datetime.datetime.today()
yesterday = str(today - datetime.timedelta(days=2)).split()[0].split("-")
if yesterday[1][0] == "0":
    yesterday[1] = yesterday[1][1:]
if yesterday[2][0] == "0":
    yesterday[2] = yesterday[2][1:]
months = {
    "1": "января",
    "2": "февраля",
    "3": "марта",
    "4": "апреля",
    "5": "мая",
    "6": "июня",
    "7": "июля",
    "8": "августа",
    "9": "сентября",
    "10": "октября",
    "11": "ноября",
    "12": "декабря"
}
date_break_point = yesterday[2] + " " + months[yesterday[1]]

db = DataBase()
visited_urls = []
66 |
67 |
def get_html(url):
    """Download *url* with a random Chrome user agent and return the
    response body as bytes."""
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    if req.encoding is None:
        # bug fix: str.encode(None) raises TypeError when the server
        # declares no charset; fall back to the raw response bytes
        return req.content
    return req.text.encode(req.encoding)
71 |
72 |
def get_total_pages(html):
    """Return the number of result pages shown by the paginator,
    or 1 when there is no paginator on the page."""
    soup = BeautifulSoup(html, "lxml")
    pagination = soup.find("div", class_="pagination__pages")
    if pagination is None:
        return 1
    last_link = pagination.find_all("a", class_="pagination__pagesLink")[-1]
    return int(last_link.text.strip())
81 |
82 |
def get_title(soup):
    """Return the offer headline, or "Не указано" when missing."""
    try:
        return soup.find("h1", class_="productPage__title").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_title\n")
        return "Не указано"
91 |
92 |
def get_address(soup):
    """Return the city -- the first comma-separated component of the
    address line -- or "Не указано" on failure."""
    try:
        full_address = soup.find("div", class_="productPage__infoTextBold js-scrollToMap").text.strip()
        return full_address.split(",")[0]
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_address\n")
        return "Не указано"
102 |
103 |
def get_material(soup):
    """Return the wall material from the last building-info column,
    or "Не указано" when absent."""
    material = "Не указано"
    try:
        last_column = soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock")[-1]
        for item in last_column.find_all("li", class_="productPage__infoColumnBlockText"):
            line = item.text.strip()
            if "Материал стен" in line:
                material = line.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_material\n")
        material = "Не указано"
    return material
117 |
118 |
def get_price(soup):
    """Return (price, rent_info): the price text with non-breaking spaces
    normalized and any fee appended, plus whether the offer is a
    monthly/daily rent. ("Не указано", "Не указано") on failure."""
    try:
        price = " ".join(soup.find("div", class_="productPage__price").text.strip().split("\xa0"))
        fee = soup.find("div", class_="productPage__fee")
        if fee is not None:
            price += " (" + fee.text.strip() + ")"

        # rent offers advertise a per-month or per-day price
        rent_info = "Не аренда"
        for marker, label in (("в месяц", "длительный срок"), ("за сутки", "посуточно")):
            if marker in price:
                rent_info = label
                break
        return price, rent_info
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_price\n")
        return "Не указано", "Не указано"
138 |
139 |
def get_block_type(soup):
    """Return "Новостройка" when the page links to a developer site,
    otherwise "Вторичка"."""
    try:
        is_new_building = soup.find("a", class_="js-sellerSiteLink") is not None
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_block_type\n")
        is_new_building = False
    return "Новостройка" if is_new_building else "Вторичка"
150 |
151 |
def get_seller_info(soup):
    """Return (seller_type, seller_name): a company when the seller block
    contains a link, otherwise a private person; both "Не указано" on failure."""
    try:
        seller_block = soup.find("div", class_="productPage__infoTextBold productPage__infoTextBold_inline")
        company_link = seller_block.find("a")
        if company_link is not None:
            return "Компания", company_link.text.strip()
        private_name = soup.find("div", class_="productPage__infoTextBold productPage__infoTextBold_inline").text.strip()
        return "Частное лицо", private_name
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_seller_info\n")
        return "Не указано", "Не указано"
166 |
167 |
def get_photos(soup):
    """Return gallery image URLs joined by newlines, or "Не указано"."""
    try:
        gallery = soup.find("div", class_="lineGallery js-lineProductGallery")
        links = [meta.get("content") for meta in gallery.find_all("meta")]
        return "\n".join(links)
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_photos\n")
        return "Не указано"
181 |
182 |
def get_description(soup):
    """Return the offer description with whitespace collapsed to single
    spaces, or "Не указано" on failure."""
    try:
        raw = soup.find("p", class_="productPage__descriptionText js-productPageDescription").text
        return " ".join(raw.strip().split())
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_description\n")
        return "Не указано"
191 |
192 |
def get_date(soup):
    """Return the publication date; relative stamps ("сегодня, 12:30")
    become absolute dates, absolute stamps pass through unchanged,
    "Не указано" on failure."""
    try:
        stamp = soup.find("div", class_="productPage__createDate").find("span").text.strip()
        if "," not in stamp:
            # already an absolute date
            return stamp
        time_part = stamp.split(",")[1]
        if stamp.split(",")[0] == "сегодня":
            base = datetime.datetime.today()
        else:
            # NOTE(review): "not today" maps two days back, matching the
            # module-level break-point offset -- confirm intentional
            base = datetime.datetime.today() - datetime.timedelta(days=2)
        return str(base).split()[0] + time_part
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_date\n")
        return "Не указано"
208 |
209 |
def get_seller_phone(soup):
    """Return the seller's phone number, decoded from the base64 value
    embedded in the page, or "Не указано" on failure.

    bug fix: the original assigned "Не указано" to the ciphered value on
    error and then unconditionally base64-decoded it, which raised on the
    non-ASCII fallback text; return the sentinel directly instead.
    """
    try:
        ciphered_phone = soup.find("input", {"class": "js-backendVar", "name": "phoneBase64"}).get("value")
        return base64.b64decode(ciphered_phone).decode("utf-8")
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_seller-phone\n")
        return "Не указано"
218 |
219 |
def get_apartment_params(soup):
    """Extract apartment attributes from the irr info columns; every
    field defaults to "Не указано" when absent."""
    (rooms_number, floor, total_floors, total_area, kitchen_area,
     living_area, furnish, district, street, block_number) = ["Не указано"] * 10
    try:
        items = []
        for column in soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock"):
            items.extend(column.find_all("li", class_="productPage__infoColumnBlockText"))

        for item in items:
            info = item.text.strip()
            if "Этаж:" in info:
                floor = info.split(":")[1].strip()
            elif "Этажей в здании" in info:
                total_floors = info.split(":")[1].strip()
            elif "Комнат в квартире" in info:
                rooms_number = info.split(":")[1].strip()
            elif "Общая площадь" in info:
                total_area = info.split(":")[1].strip()
            elif "Жилая площадь" in info:
                living_area = info.split(":")[1].strip()
            elif "Площадь кухни" in info:
                kitchen_area = info.split(":")[1].strip()
            elif "Ремонт" in info:
                furnish = info.split(":")[1].strip()
                if furnish == "1":
                    # the site sometimes reports a bare "1" even though
                    # the field is empty on the page
                    furnish = "Не указано"
            elif "Улица" in info:
                street = info.split(":")[1].strip()
            elif "Район города" in info:
                district = info.split(":")[1].strip()
            elif "Дом" in info:
                block_number = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_apartment_params\n")
    return rooms_number, floor, total_floors, total_area, kitchen_area, living_area, furnish, district, street, block_number
257 |
258 |
def get_commercial_params(soup):
    """Extract commercial-property attributes from the irr info columns;
    every field defaults to "Не указано" when absent."""
    (building_type, parking, ceilings, area, entrance,
     district, street, block_number) = ["Не указано"] * 8
    try:
        items = []
        for column in soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock"):
            items.extend(column.find_all("li", class_="productPage__infoColumnBlockText"))

        for item in items:
            info = item.text.strip()
            if "Тип здания" in info:
                building_type = info.split(":")[1].strip()
            elif "Общая площадь" in info:
                area = info.split(":")[1].strip()
            elif "Парковка" in info:
                # presence of the line is all the site communicates
                parking = "Парковка есть"
            elif "Высота потолков" in info:
                ceilings = info.split(":")[1].strip()
            elif "Вход" in info:
                entrance = info.strip()
            elif "Улица" in info:
                street = info.split(":")[1].strip()
            elif "Район города" in info:
                district = info.split(":")[1].strip()
            elif "Дом" in info:
                block_number = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_commercial_params\n")
    return building_type, parking, ceilings, area, entrance, district, street, block_number
289 |
290 |
def get_cottage_params(soup):
    """Extract house/land attributes from the irr info columns; every
    field defaults to "Не указано" when absent."""
    (house_area, material, total_floors, land_area, status,
     comforts, district, street, block_number) = ["Не указано"] * 9
    comfort_markers = ("отапливаемый", "отопление", "водопровод", "канализация",
                       "свет", "газ", "вода", "интернет", "телефон")
    try:
        items = []
        for column in soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock"):
            items.extend(column.find_all("li", class_="productPage__infoColumnBlockText"))
        for item in items:
            info = item.text.strip()
            if "Площадь участка" in info:
                land_area = info.split(":")[1].strip()
            elif "Площадь строения" in info:
                house_area = info.split(":")[1].strip()
            elif "Материал стен" in info:
                material = info.split(":")[1].strip()
            elif "Количество этажей" in info:
                total_floors = info.split(":")[1].strip()
            elif "Вид разрешенного использования" in info:
                status = info.split(":")[1].strip()
            elif any(marker in info.lower() for marker in comfort_markers):
                # collect utility/comfort lines into one "; "-joined field
                comforts = info if comforts == "Не указано" else comforts + "; " + info
            elif "Улица" in info:
                street = info.split(":")[1].strip()
            elif "Район города" in info:
                district = info.split(":")[1].strip()
            elif "Дом" in info:
                block_number = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_cottage_params\n")
    return house_area, material, total_floors, land_area, status, comforts, district, street, block_number
326 |
327 |
def get_apartment_data(html):
    """Collect all apartment fields from one irr offer page, in the
    column order of the Квартиры table (minus sell_type)."""
    soup = BeautifulSoup(html, "lxml")

    city = get_address(soup)
    material = get_material(soup)
    (rooms_number, floor, total_floors, total_area, kitchen_area,
     living_area, furnish, district, street, block_number) = get_apartment_params(soup)
    price, rent_info = get_price(soup)
    block_type = get_block_type(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(soup)
    date = get_date(soup)
    selling_detail = "Не указано"  # irr does not publish this field

    return [city, district, street, block_number, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, date, phone, kitchen_area, living_area, floor]
347 |
348 |
def get_commercial_data(html):
    """Collect all commercial-property fields from one irr offer page,
    in the column order of the Коммерческая_недвижимость table."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    # classify the premises by title keywords; first match wins
    lowered = title.lower()
    keyword_types = (
        ("офис", "Офисное помещение"),
        ("торг", "Торговое помещение"),
        ("гостиница", "Гостиница"),
        ("производ", "Производственное помещение"),
        ("склад", "Складское помещение"),
        ("помещение", "Помещение свободного назначения"),
    )
    object_type = "Не указано"
    for keyword, label in keyword_types:
        if keyword in lowered:
            object_type = label
            break

    city = get_address(soup)
    building_type, parking, ceilings, area, entrance, district, street, block_number = get_commercial_params(soup)
    price, rent_info = get_price(soup)
    seller_type, seller_name = get_seller_info(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(soup)
    date = get_date(soup)
    office_class, furniture = "Не указано", "Не указано"  # irr does not publish these

    return [city, district, street, block_number, price, object_type, office_class,
            furniture, entrance, area, date, phone, images, description, seller_name]
381 |
382 |
def get_cottage_data(html):
    """Collect country-house fields from an irr offer page into a list."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)

    # Derive the object kind from the first matching keyword in the title.
    lowered = title.lower()
    object_type = "Не указано"
    for keyword, kind in (("дом", "Дом"), ("участок", "Участок"), ("таунхаус", "Таунхаус")):
        if keyword in lowered:
            object_type = kind
            break

    city = get_address(soup)
    price, rent_info = get_price(soup)
    house_area, material, total_floors, land_area, status, comforts, district, street, block_number = get_cottage_params(soup)
    _, seller_name = get_seller_info(soup)
    date = get_date(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(soup)
    # irr does not publish the sale terms.
    selling_detail = "Не указано"

    return [city, district, street, block_number, rent_info, price, object_type,
            house_area, comforts, selling_detail, images, description, date, phone, material,
            total_floors, land_area, status, seller_name]
411 |
412 |
def crawl_page(first_offer, html, category, sell_type):
    """Parse one irr listing page and store each offer in the database.

    Returns True when crawling of this category should stop (no offers left,
    the date breakpoint is reached, or a previously saved record reappears);
    returns None otherwise.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    try:
        offers = soup.find("div", class_="listing js-productGrid ").find_all("div", class_="listing__item")
    except:
        # NOTE(review): bare except also hides programming errors; the intent
        # here is only to cover a missing listing container.
        offers = []
    if offers is None or not offers:
        print("Парсинг завершен irr")
        return True
    for offer in offers:
        try:
            date = offer.find("span", class_="listing__itemDate").find("div", class_="updateProduct").text.strip()
            # date_break_point is a module-level value defined above this chunk.
            if date == date_break_point:
                print("Парсинг завершен irr")
                return True

            url = offer.find("div", class_="listing__itemTitleWrapper").find("a", class_="listing__itemTitle").get("href")
            if url in visited_urls:
                print("irr not unique")
                time.sleep(random.uniform(5, 8))
                continue
            else:
                visited_urls.append(url)
            #print(url)

            data = []
            if category == "Квартиры":
                data = get_apartment_data(get_html(url))
                # record key fields so duplicates can be found later
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(get_html(url))
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(get_html(url))
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            if first_offer:
                # save the very first record as the exit point for the next run
                modifier = "w" if (category == "Квартиры" and sell_type == "Продажа") else "a"
                with open("breakpoints/irr.txt", modifier, encoding="utf8") as file:
                    file.write("%s--%s\n" % (data[2], data[5]))
                first_offer = False

            key_info = (data[2], data[5])

            if any(x == key_info for x in [break_apartment_sell, break_apartment_rent, break_commercial_sell,
                                           break_commercial_rent, break_cottage_sell, break_cottage_rent]):
                print("Парсинг завершен irr")
                return True

            data.insert(4, sell_type)
            if data[0] != "Не указано":
                try:
                    db.insert_data(category, data)
                except:
                    # reconnect once and retry on any DB failure
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page irr")
            #print(data)

        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " irr crawl_page\n")

        time.sleep(random.uniform(5, 8))
484 |
485 |
def parse(category_url, category_name, sell_type):
    """Walk every listing page of an irr category until crawl_page signals completion."""
    total_pages = get_total_pages(get_html(category_url))

    for page in range(1, total_pages + 1):
        # Page URLs are the category URL with "page<N>" appended.
        page_url = category_url + "page" + str(page)
        # The first page records the breakpoint; later pages do not.
        if crawl_page(page == 1, get_html(page_url), category_name, sell_type):
            break
499 |
500 |
def main():
    """Parse every irr.ru category, resetting the duplicate-URL cache between runs."""
    global visited_urls
    # The site separates sale and rent listings, so each category/deal-type
    # pair has its own date-sorted listing URL.
    jobs = (
        ("https://saratovskaya-obl.irr.ru/real-estate/apartments-sale/sort/date_sort:desc/", "Квартиры", "Продажа"),
        ("https://saratovskaya-obl.irr.ru/real-estate/rent/sort/date_sort:desc/", "Квартиры", "Аренда"),
        ("https://saratovskaya-obl.irr.ru/real-estate/commercial-sale/sort/date_sort:desc/", "Коммерческая_недвижимость", "Продажа"),
        ("https://saratovskaya-obl.irr.ru/real-estate/commercial/sort/date_sort:desc/", "Коммерческая_недвижимость", "Аренда"),
        ("https://saratovskaya-obl.irr.ru/real-estate/out-of-town/sort/date_sort:desc/", "Дома", "Продажа"),
        ("https://saratovskaya-obl.irr.ru/real-estate/out-of-town-rent/sort/date_sort:desc/", "Дома", "Аренда"),
    )
    for index, (url, category, sell_type) in enumerate(jobs):
        if index:
            visited_urls = []
        parse(url, category, sell_type)
527 |
528 |
if __name__ == "__main__":
    # Run a full crawl, then release the module-level DB connection.
    main()
    db.close()
532 |
--------------------------------------------------------------------------------
/kvadrat64_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | from fake_useragent import UserAgent
8 | import datetime
9 | from selenium import webdriver
10 | from xvfbwrapper import Xvfb
11 | from selenium.webdriver.chrome.options import Options
12 | from database import DataBase
13 |
14 |
# Breakpoint records saved by a previous run: parsing of each category stops
# when its saved (street, price)-style key tuple is seen again.
with open("breakpoints/kvadrat.txt", "r", encoding="utf8") as file:
    breakpoints = file.readlines()


def _read_breakpoint(index):
    """Return breakpoint *index* as a tuple, or None when the line is absent.

    Replaces nine copy-pasted try/bare-except blocks; only a missing line
    (IndexError) is an expected failure here.
    """
    try:
        return tuple(breakpoints[index].strip().split("--"))
    except IndexError:
        return None


break_apartment_sell = _read_breakpoint(0)
break_apartment_rent = _read_breakpoint(1)
break_cottage_sell = _read_breakpoint(2)
break_cottage_rent = _read_breakpoint(3)
break_commercial_sell = _read_breakpoint(4)
break_commercial_rent = _read_breakpoint(5)
break_dacha_sell = _read_breakpoint(6)
break_saratov_land_sell = _read_breakpoint(7)
break_region_land_sell = _read_breakpoint(8)

# Chrome options for the Selenium fallback in get_seller_phone.
options = Options()
options.add_argument("--no-sandbox")

db = DataBase()
visited_urls = []  # offer URLs already processed in the current category
61 |
62 |
def transform_date(date_str):
    """Convert a "DD-MM-YYYY" string into a datetime for comparisons.

    int() already ignores leading zeros, so the previous manual stripping of
    "0" prefixes from day and month was redundant and has been removed.
    """
    day, month, year = date_str.split("-")
    return datetime.datetime(int(year), int(month), int(day))
75 |
76 |
def get_html(url):
    """Fetch *url* and return the body as bytes for BeautifulSoup.

    The site serves windows-1251; re-encoding the decoded text with the
    detected response encoding hands raw bytes to BeautifulSoup, which then
    performs its own charset detection.
    """
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    return req.text.encode(req.encoding)
81 |
82 |
def get_total_pages(html):
    """Return the number of listing pages, or 0 when pagination is missing."""
    soup = BeautifulSoup(html, "lxml")
    pages = 0
    try:
        pager = soup.find("div", class_="a t100")
        if pager is not None:
            # The last "phase" link of the pager carries the page count.
            pages = pager.find_all("a", class_="phase")[-1].text.strip()
    except Exception as e:
        pages = 0
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_total_pages\n")
    return int(pages)
96 |
97 |
def get_title(soup):
    """Return the offer headline text, or "Не указано" when absent."""
    try:
        return soup.find("td", class_="hh").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_title\n")
        return "Не указано"
106 |
107 |
def get_price(soup):
    """Return the offer price string, or "Не указано" when absent."""
    try:
        return soup.find("td", class_="thprice").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_price\n")
        return "Не указано"
116 |
117 |
def get_commercial_price(soup):
    """Return the per-square-metre price string, or "Не указано"."""
    price = "Не указано"
    try:
        # Every <span class="d"> inside every <td class="tddec2">, flattened.
        spans = [span
                 for cell in soup.find_all("td", class_="tddec2")
                 for span in cell.find_all("span", class_="d")]
        for text in (span.text.strip() for span in spans):
            if "за м²" in text:
                # Normalise the unit spelling: "м²" -> "м2".
                price = "м2".join(text.split("м²"))
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_price\n")
    return price
131 |
132 |
def get_selling_type(soup):
    """Return (selling_type, rent_info) strings describing the deal terms.

    selling_type: the sale conditions joined with "; ", or "Не продажа".
    rent_info: the lease-term entry, or "Не аренда" when none is present.
    Both fall back to "Не указано" when the markup cannot be parsed.
    """
    try:
        # For sales: the deal conditions live in the first tddec2 cell.
        selling_type = "; ".join(x.text.strip() for x in soup.find("td", class_="tddec2").find_all("span", class_="d"))
        if not selling_type:
            selling_type = "Не продажа"
        # For rentals: look for the lease-term entry near the end of the page.
        rent_entries = [x.text.strip() for x in soup.find_all("td", class_="tddec2")[-2].find_all("span", class_="d")]
        for entry in rent_entries:
            if "аренда" in entry:
                rent_info = entry
                break
        else:
            # BUG FIX: previously a non-empty list with no "аренда" entry was
            # returned as-is (a list object); now it maps to "Не аренда" too.
            rent_info = "Не аренда"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_selling_type\n")
        selling_type = "Не указано"
        rent_info = "Не указано"
    return selling_type, rent_info
153 |
154 |
def get_photos(soup):
    """Collect full-size photo URLs joined with newlines, or "Не указано".

    Each gallery thumbnail links to its own page, which is fetched and parsed
    to extract the full-size image; this makes one HTTP request per photo.
    """
    try:
        images = []
        # links to the per-photo gallery pages
        td_images = soup.find("td", class_="tdimg").find_all("a")
        for image_item in td_images:
            link = "https://kvadrat64.ru/" + image_item.get("href")
            html_gallery = BeautifulSoup(get_html(link), "lxml")
            image = html_gallery.find("img", {"style": "cursor:pointer;"})
            if image is not None:
                images.append("https://kvadrat64.ru/" + image.get("src"))
        images = "\n".join(images)
        # if the gallery is empty, fall back to the cover image
        if not images:
            images = "https://kvadrat64.ru/" + soup.find("div", id="mainfotoid").find("img").get("src")
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_photos\n")
        images = "Не указано"
    return images
175 |
176 |
def get_description(soup):
    """Return the free-text offer description, or "Не указано"."""
    try:
        return soup.find("p", class_="dinfo").text.strip().replace("\r", "")
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_description\n")
        return "Не указано"
185 |
186 |
def get_date(soup):
    """Return the offer creation date as a datetime, or "Не указано".

    The date sits inside a free-text status line such as
    "..., создано 01-02-2020 сделать VIP ...", so it is cut out between the
    "создано" marker and whatever label follows it.
    """
    try:
        date = soup.find("div", class_="tdate").text.strip().split(",")[1]
        if "сделать" in date:
            date = date.split("сделать")[0].split("создано")[1].strip()
        else:
            date = date.split("VIP")[0].split("создано")[1].strip()
        date = transform_date(date)
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_date\n")
        date = "Не указано"
    return date
200 |
201 |
def get_seller_name(soup):
    """Return the seller's display name, or "Не указано"."""
    try:
        return soup.find_all("td", class_="tddec2")[-1].find("span").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_seller_name\n")
        return "Не указано"
210 |
211 |
def get_seller_phone(url, soup):
    """Return the seller's phone number, or "Не указано".

    First tries to read the phone from the contact block's text; when it is
    not there, falls back to Selenium because the number is revealed
    dynamically by clicking the "show phone" button.
    """
    phone = "Не указано"
    try:
        # Intermediaries sometimes put the phone in the text itself; check that.
        tddec = soup.find_all("td", class_="tddec2")[-1].find_all(text=True)
        found = False
        for i in range(len(tddec)):
            if "Персона для контактов" in tddec[i]:
                phone = tddec[i + 1].split(",")[-1].strip()
                found = True
            elif "Контактный телефон" in tddec[i]:
                # NOTE(review): this resets the flag whenever a phone label is
                # encountered afterwards, forcing the Selenium path — confirm
                # this is the intended precedence.
                found = False

        if "".join(phone.split()).isalpha():
            # the extracted "phone" was purely letters (a name) — discard it
            phone = "Не указано"

        if not found:
            # headless X display so Chrome can run on a server
            vdisplay = Xvfb()
            vdisplay.start()
            driver = webdriver.Chrome(options=options)
            driver.set_window_size(1920, 1080)
            driver.get(url)

            button = driver.find_element_by_xpath('//span[@class="showphone"]')
            button.click()
            time.sleep(3)
            seller_info = driver.find_elements_by_xpath('//td[@class="tddec2"]')[-1].text
            for info in seller_info.split("\n"):
                if "Контактный телефон" in info:
                    phone = info.split(":")[1].strip()
            driver.quit()
            vdisplay.stop()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_seller_phone\n")
        phone = "Не указано"
    return phone
250 |
251 |
def get_apartment_params(soup):
    """Extract apartment attributes from the irregular key-facts cell.

    Returns (block_type, total_area, kitchen_area, living_area, floor,
    total_floors, material); any field missing on the page stays "Не указано".
    """
    block_type, total_area, kitchen_area, living_area, floor, total_floors, material = ["Не указано"] * 7
    try:
        # The markup is irregular: split the raw cell HTML on newlines and
        # re-parse every fragment to plain text.
        # FIX: the split() argument previously contained a raw unescaped
        # newline inside the string literal, which is a SyntaxError in Python.
        params_raw = str(soup.find("td", class_="tddec")).split("\n")
        params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0")
        for param in params_raw[1:]:
            params.append(BeautifulSoup(param, "lxml").text.strip())

        new_block = False  # whether the flat is in a new development
        add_info = ""  # developer / hand-over date, kept in a single field
        for param in params:
            if "Площадь общая" in param:
                total_area = param.split(":")[1].split("м²")[0].strip() + " м2"
            elif "Кухня" in param:
                kitchen_area = param.split(":")[1].split("м²")[0].strip() + " м2"
            elif "Жилая" in param:
                living_area = param.split(":")[1].split("м²")[0].strip() + " м2"
            elif "этажей в доме" in param:
                total_floors = param.split(":")[1].split("/")[1]
                floor = param.split(":")[1].split("/")[0].split()[1]
            elif "cтроение" in param:  # NOTE: leading Latin "c" — presumably matches the site's own text; confirm
                material = param.split(":")[1].strip()
            elif "Застройщик" in param or "Дата сдачи" in param or "Стадия строительства" in param:
                new_block = True
                add_info += param.split(":")[1] + ";"

        if new_block:
            block_type = "Новостройка " + add_info
        else:
            block_type = "Вторичка"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_apartment_params\n")
    return block_type, total_area, kitchen_area, living_area, floor, total_floors, material
288 |
289 |
def get_cottage_params(soup):
    """Extract house attributes from the irregular key-facts cell.

    Returns (total_area, material, comforts, total_floors, land_area); any
    field missing on the page stays "Не указано".
    """
    total_area, material, comforts, total_floors, land_area = ["Не указано"] * 5
    try:
        # The markup is irregular: split the raw cell HTML on newlines and
        # re-parse every fragment to plain text.
        # FIX: the split() argument previously contained a raw unescaped
        # newline inside the string literal, which is a SyntaxError in Python.
        params_raw = str(soup.find("td", class_="tddec")).split("\n")
        params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0")
        for param in params_raw[1:]:
            params.append(BeautifulSoup(param, "lxml").text.strip())

        for param in params:
            if "Площадь общая" in param:
                total_area = param.split(":")[1].split("м²")[0].strip() + " м2"
            elif "cтроение" in param:  # NOTE: leading Latin "c" — presumably matches the site's own text; confirm
                material = param.split(":")[1].strip()
            elif "Площадь участка" in param:
                land_area = param.split(":")[1].strip()
            elif "Этажей" in param:
                total_floors = param.split(":")[1].strip()
            elif "Коммуникации" in param:
                comforts = param.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_cottage_params\n")
    return total_area, material, comforts, total_floors, land_area
315 |
316 |
def get_commercial_params(soup):
    """Extract (object_type, area) from the irregular key-facts cell.

    Either value stays "Не указано" when missing on the page.
    """
    object_type, area = ["Не указано"] * 2
    try:
        # The markup is irregular: split the raw cell HTML on newlines and
        # re-parse every fragment to plain text.
        # FIX: the split() argument previously contained a raw unescaped
        # newline inside the string literal, which is a SyntaxError in Python.
        params_raw = str(soup.find("td", class_="tddec")).split("\n")
        params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0")
        for param in params_raw[1:]:
            params.append(BeautifulSoup(param, "lxml").text.strip())

        for param in params:
            if "Объект" in param:
                object_type = param.split(":")[1].strip()
            elif "площадь" in param:
                area = param.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_commercial_params\n")
    return object_type, area
336 |
337 |
def get_dacha_params(soup):
    """Return the dacha's house area, or "Не указано" when missing."""
    total_area = "Не указано"
    try:
        # The markup is irregular: split the raw cell HTML on newlines and
        # re-parse every fragment to plain text.
        # FIX: the split() argument previously contained a raw unescaped
        # newline inside the string literal, which is a SyntaxError in Python.
        params_raw = str(soup.find("td", class_="tddec")).split("\n")
        params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0")
        for param in params_raw[1:]:
            params.append(BeautifulSoup(param, "lxml").text.strip())

        for param in params:
            if "Площадь дома" in param:
                total_area = param.split(":")[1].strip()
                break
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_dacha_params\n")
    return total_area
356 |
357 |
def get_land_params(soup):
    """Extract (total_area, land_type) from the irregular key-facts cell.

    Either value stays "Не указано" when missing on the page.
    """
    total_area, land_type = ["Не указано"] * 2
    try:
        # The markup is irregular: split the raw cell HTML on newlines and
        # re-parse every fragment to plain text.
        # FIX: the split() argument previously contained a raw unescaped
        # newline inside the string literal, which is a SyntaxError in Python.
        params_raw = str(soup.find("td", class_="tddec")).split("\n")
        params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0")
        for param in params_raw[1:]:
            params.append(BeautifulSoup(param, "lxml").text.strip())

        for param in params:
            if "Площадь участка" in param:
                total_area = param.split(":")[1].strip()
            elif "Тип земли" in param:
                # BUG FIX: previously tested `in params` (the whole list), so
                # the land type was never extracted.
                land_type = param.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_land_params\n")
    return total_area, land_type
377 |
378 |
def get_apartment_data(html, url):
    """Assemble the apartment record for the DB, or None for "сниму" ads.

    The element order is positional and must match crawl_page's indexing and
    the Квартиры table schema.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" not in title.lower():
        # The address is everything after the first comma, minus the trailing
        # " на карте" map link and any parenthesised tail.
        address = ",".join(title.split(",")[1:]).strip()
        address = address[:address.rfind(" на карте")]
        if "сдам" in address.lower():
            address = " ".join(address.split()[1:])
        if "(" in address:
            address = address[:address.rfind("(")]

        # Assumes the address tail reads "..., street, number, district, city"
        # — TODO confirm against real titles.
        city = address.split(",")[-1].strip()
        district = address.split(",")[-2].strip()
        block_number = address.split(",")[-3].strip()
        street = address.split(",")[-4].strip()

        rooms_number = title.split(",")[0]
        block_type, total_area, kitchen_area, living_area, floor, total_floors, material = get_apartment_params(soup)
        price = get_price(soup)
        selling_detail, rent_info = get_selling_type(soup)  # sale terms; lease length when renting
        if not selling_detail:
            selling_detail = "Не продажа"
        images = get_photos(soup)
        description = get_description(soup)
        phone = get_seller_phone(url, soup)
        date = get_date(soup)

        return [city, district, street, block_number, rent_info, price, block_type,
                rooms_number, total_area, total_floors, material, selling_detail, images,
                description, date, phone, kitchen_area, living_area, floor]
    return None
411 |
412 |
def get_cottage_data(html, url):
    """Assemble the country-house record for the DB, or None for "сниму" ads.

    The element order is positional and must match crawl_page's indexing and
    the Дома table schema.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" not in title.lower():
        address = ",".join(title.split(",")[1:]).strip()
        address = address[:address.rfind(" на карте")]
        if "(" in address:
            address = address[:address.rfind("(")]

        # An all-uppercase address presumably means a short, settlement-only
        # form without a district — TODO confirm.
        if address == address.upper():
            city, street, block_number = address.split(",") + (["Не указано"] * (3 - len(address.split(","))))
            district = "Не указано"
        else:
            city = address.split(",")[-1].strip()
            district = address.split(",")[-2].strip()
            block_number = address.split(",")[-3].strip()
            street = address.split(",")[-4].strip()

        cottage_type = title.split(",")[0]
        if "сдам" in cottage_type.lower():
            cottage_type = " ".join(cottage_type.split()[1:])
        price = get_price(soup)
        total_area, material, comforts, total_floors, land_area = get_cottage_params(soup)
        selling_detail, rent_info = get_selling_type(soup)  # sale terms; lease length when renting
        if not selling_detail:
            selling_detail = "Не продажа"
        images = get_photos(soup)
        description = get_description(soup)
        phone = get_seller_phone(url, soup)
        seller_name = get_seller_name(soup)
        date = get_date(soup)
        status = "Не указано"  # the site does not publish this

        return [city, district, street, block_number, rent_info, price, cottage_type,
                total_area, comforts, selling_detail, images, description, date, phone, material,
                total_floors, land_area, status, seller_name]
    return None
451 |
452 |
def get_commercial_data(html, url):
    """Assemble the commercial-property record for the DB, or None for "сниму" ads.

    The element order is positional and must match crawl_page's indexing and
    the Коммерческая_недвижимость table schema.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" not in title.lower():
        address = ",".join(title.split(",")[1:]).strip()
        address = address[:address.rfind(" на карте")]
        if "(" in address:
            address = address[:address.rfind("(")]

        # Assumes the address tail reads "..., street, number, district, city"
        # — TODO confirm against real titles.
        city = address.split(",")[-1].strip()
        district = address.split(",")[-2].strip()
        block_number = address.split(",")[-3].strip()
        street = address.split(",")[-4].strip()

        object_type, area = get_commercial_params(soup)
        price = get_commercial_price(soup)
        images = get_photos(soup)
        description = get_description(soup)
        phone = get_seller_phone(url, soup)
        date = get_date(soup)
        seller_name = get_seller_name(soup)
        office_class, furniture, entrance = ["Не указано"] * 3  # not published by the site

        return [city, district, street, block_number, price, object_type, office_class,
                furniture, entrance, area, date, phone, images, description, seller_name]
    return None
480 |
481 |
def get_land_data(html, url):
    """Assemble the land-plot record for the DB, or None for "сниму" ads.

    The element order is positional and must match crawl_page's indexing and
    the Участки table schema.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" not in title.lower():
        address = ",".join(title.split(",")[1:]).strip()
        address = address[:address.rfind("(")].strip()

        city = address.split(",")[0]
        if len(address.split(",")) > 1:
            district = address.split(",")[1].strip()
        else:
            district = "Не указано"
        street = "Не указано"

        # Distance from the city: inside Saratov vs the parenthesised hint in
        # the title (e.g. "(15 км)").
        if city.lower() == "саратов":
            distance = "В черте города"
        else:
            distance = title[title.find("(") + 1:title.find(")")]

        area, land_type = get_land_params(soup)
        price = get_price(soup)
        images = get_photos(soup)
        description = get_description(soup)
        phone = get_seller_phone(url, soup)
        date = get_date(soup)
        seller_name = get_seller_name(soup)
        sell_type = "Продажа"  # land plots are only crawled for sale
        deposit, seller_type = ["Не указано"] * 2  # not published by the site

        return [city, district, street, sell_type, deposit, land_type, distance, area, price, seller_type, images,
                description, seller_name, phone, date]
    return None
515 |
516 |
def crawl_page(first_offer, html, category, sell_type):
    """Parse one kvadrat64 listing page and store each offer in the database.

    Returns True when crawling of this category must stop (no offers, a saved
    breakpoint record reappears, or offers become older than one day);
    returns None otherwise.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    try:
        #offers = soup.find_all("a", class_="site3adv") + soup.find_all("a", class_="site3")
        offers = soup.find_all("a", class_="site3")
    except:
        offers = []
    if offers is None or not offers:
        print("Парсинг завершен kvadrat")
        return True
    for offer in offers:
        try:
            url = "http://kvadrat64.ru/" + offer.get("href")
            if url in visited_urls:
                print("kvadrat not unique")
                time.sleep(random.uniform(5, 8))
                continue
            else:
                visited_urls.append(url)
            #print(url)

            data = []
            # NOTE(review): the get_*_data helpers return None for "сниму"
            # ads; the data[...] indexing below then raises and the offer is
            # skipped via the broad except — confirm this is intended.
            if category == "Квартиры":
                data = get_apartment_data(get_html(url), url)
                # record key fields so duplicates can be found later
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Участки":
                data = get_land_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s\n" % (data[2], data[5], data[7], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            if first_offer:
                # save the very first record as the exit point for the next run
                modifier = "w" if (category == "Квартиры" and sell_type == "Продажа") else "a"
                with open("breakpoints/kvadrat.txt", modifier, encoding="utf8") as file:
                    file.write("%s--%s\n" % (data[2], data[5]))
                first_offer = False

            key_info = (data[2], data[5])

            if any(x == key_info for x in [break_apartment_sell, break_apartment_rent, break_cottage_sell,
                                           break_cottage_rent, break_commercial_sell, break_commercial_rent,
                                           break_dacha_sell, break_saratov_land_sell, break_region_land_sell]):
                print("Парсинг завершен kvadrat")
                return True

            data.insert(4, sell_type)

            # position of the offer's datetime within the record
            index_of_date = -1
            if category == "Квартиры" or category == "Коммерческая_недвижимость":
                index_of_date = -5
            elif category == "Дома":
                index_of_date = -7
            elif category == "Участки":
                index_of_date = -1

            if data[index_of_date] != "Не указано" and data[index_of_date] < datetime.datetime.today() - datetime.timedelta(days=1):
                # an offer older than one day means we have caught up — stop
                print("Парсинг завершен kvadrat")
                return True
            else:
                # keep only the date part of the datetime, as a string
                # NOTE(review): when the date is "Не указано" this yields
                # "Не" — confirm downstream tolerates that.
                data[index_of_date] = str(data[index_of_date]).split()[0]

            if data[0] != "Не указано" and data is not None:
                # NOTE(review): the None check is unreachable after data[0]
                # has already been subscripted.
                try:
                    db.insert_data(category, data)
                except:
                    # reconnect once and retry on any DB failure
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page kvadrat")

            #print(data)

        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " kvadrat crawl_page\n")

        time.sleep(random.uniform(5, 8))
608 |
609 |
def parse(category_url, category_name, sell_type):
    """Iterate over a kvadrat64 category's listing pages until crawl_page reports completion."""
    total_pages = get_total_pages(get_html(category_url))

    for page in range(1, total_pages + 1):
        # Two URL schemes exist: "search-<a>-<page>-<b>.html" for country
        # houses and land plots, "<name>-<n>-<page>.html" for everything else.
        if (category_name == "Дома" and sell_type == "Продажа" and "sellzagbank" not in category_url) or category_name == "Участки":
            parts = category_url.split("-")
            page_url = "-".join(parts[:2]) + "-" + str(page) + "-" + parts[3]
        else:
            page_url = category_url[:category_url.rfind("-") + 1] + str(page) + ".html"

        # The first page records the breakpoint; later pages do not.
        if crawl_page(page == 1, get_html(page_url), category_name, sell_type):
            break
627 |
628 |
def main():
    """Crawl every kvadrat64.ru category, clearing the URL cache between runs."""
    global visited_urls
    jobs = (
        ("http://kvadrat64.ru/sellflatbank-50-1.html", "Квартиры", "Продажа"),
        ("https://kvadrat64.ru/giveflatbank-50-1.html", "Квартиры", "Аренда"),
        ("https://kvadrat64.ru/search-103-1-50664.html", "Дома", "Продажа"),
        ("https://kvadrat64.ru/giveflatbank-9-1.html", "Дома", "Аренда"),
        ("https://kvadrat64.ru/sellcombank-1000-1.html", "Коммерческая_недвижимость", "Продажа"),
        ("https://kvadrat64.ru/givecombank-1000-1.html", "Коммерческая_недвижимость", "Аренда"),
        ("https://kvadrat64.ru/sellzagbank-1000-1.html", "Дома", "Продажа"),    # dachas
        ("https://kvadrat64.ru/search-41-1-24435.html", "Участки", "Продажа"),  # Saratov land plots
        ("https://kvadrat64.ru/search-412-1-24450.html", "Участки", "Продажа"), # regional land plots
    )
    for index, (url, category, sell_type) in enumerate(jobs):
        if index:
            visited_urls = []
        parse(url, category, sell_type)
665 |
666 |
if __name__ == "__main__":
    # Run a full crawl, then release the module-level DB connection.
    main()
    db.close()
670 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import gc
3 | from multiprocessing import Process
4 | import os
5 | import datetime
6 | from database import DataBase
7 |
# Handles to the six parser worker processes; None until the first scheduled
# run creates them, so main() can terminate leftovers from a previous run.
t1, t2, t3, t4, t5, t6 = [None] * 6
9 |
10 |
def cls():
    """Clear the terminal screen: "cls" on Windows, "clear" elsewhere."""
    command = "cls" if os.name == "nt" else "clear"
    os.system(command)
13 |
14 |
def main():
    """Run one scheduled crawl: kill stragglers, record duplicates, launch parsers."""
    global t1, t2, t3, t4, t5, t6
    # Terminate workers from the previous scheduled run that are still alive.
    if all(p is not None for p in [t1, t2, t3, t4, t5, t6]):
        for p in [t1, t2, t3, t4, t5, t6]:
            if p.is_alive():
                p.terminate()
                p.join()

    # NOTE(review): imports inside main() only execute the parser modules'
    # top-level code (breakpoint files, DB handles) on the FIRST run of the
    # scheduler, not on every run — confirm that is the intended behaviour.
    import avito_parsing
    import irr_parsing
    import kvadrat64_parsing
    import ya_realty_parsing
    import cian_parsing
    import youla_parsing

    cls()
    print("Job started", datetime.datetime.today())

    db = DataBase()
    db.create_table("Квартиры")
    db.create_table("Дома")
    db.create_table("Коммерческая_недвижимость")
    db.create_table("Участки")
    db.create_table("Дубликаты")

    if os.path.isfile("logs.txt"):
        os.remove("logs.txt")

    # Group offer URLs from the previous run by their key fields; any key
    # seen with more than one URL is recorded as a duplicate.
    total_data = {}
    try:
        if os.path.isfile("total_data.txt"):
            with open("total_data.txt", "r", encoding="utf8") as file:
                for line in file.readlines():
                    data = line.strip().split("--")
                    params = tuple(data[:-1])
                    url = data[-1]
                    total_data[params] = list(set(total_data.get(params, []) + [url]))

        for data in total_data:
            if all(x != "Не указано" for x in data):  # avoid writing dummy records
                if len(total_data[data]) > 1:
                    db.insert_data("Дубликаты", [", ".join(data), "\n".join(total_data[data])])
    except Exception as e:
        print(e)

    if os.path.isfile("total_data.txt"):
        os.remove("total_data.txt")

    # Run the six site parsers in two batches of three processes each.
    t1 = Process(target=ya_realty_parsing.main)
    t2 = Process(target=irr_parsing.main)
    t3 = Process(target=youla_parsing.main)
    t1.start()
    t2.start()
    t3.start()
    t1.join()
    t2.join()
    t3.join()

    t4 = Process(target=kvadrat64_parsing.main)
    t5 = Process(target=cian_parsing.main)
    t6 = Process(target=avito_parsing.main)
    t4.start()
    t5.start()
    t6.start()
    t4.join()
    t5.join()
    t6.join()

    db.close()
    gc.collect()
    print("Job finished", datetime.datetime.today())
86 |
87 |
if __name__ == '__main__':
    # Third-party 'schedule' library: run the full parsing cycle once a day.
    import schedule
    import time

    # NOTE(review): main() is only scheduled, never run at startup — the first
    # cycle happens at the next 10:00.  Confirm that is intended.
    schedule.every().day.at("10:00").do(main)

    # Poll once per second and fire any pending jobs.
    while True:
        schedule.run_pending()
        time.sleep(1)
97 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
asn1crypto==0.24.0
beautifulsoup4==4.6.1
certifi==2018.4.16
cffi==1.11.5
chardet==3.0.4
cryptography==3.2
EasyProcess==0.2.3
fake-useragent==0.1.11
httmock==1.2.6
http-request-randomizer==1.2.3
idna==2.7
lxml==4.2.5
mysql-connector-python==8.0.12
Pillow==8.1.1
protobuf==3.6.1
psutil==5.6.6
pycparser==2.18
pyOpenSSL==18.0.0
python-dateutil==2.7.3
PyVirtualDisplay==0.2.1
requests==2.20.0
schedule==0.5.0
selenium==3.14.0
six==1.11.0
urllib3==1.24.2
xvfbwrapper==0.2.9
--------------------------------------------------------------------------------
/ya_realty_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | from fake_useragent import UserAgent
8 | import datetime
9 | from selenium import webdriver
10 | from xvfbwrapper import Xvfb
11 | from selenium.webdriver.chrome.options import Options
12 | from database import DataBase
13 |
# Breakpoint records (one per category/sell-type) telling the parser where to
# stop: each is the "(street, price)" tuple of the newest offer from last run.
with open("breakpoints/ya.txt", "r", encoding="utf8") as file:
    breakpoints = file.readlines()


def _breakpoint(index):
    """Return the breakpoint tuple stored on line *index*, or None when absent.

    Replaces six copy-pasted try/bare-except blocks; only a missing line
    (IndexError) legitimately yields None.
    """
    try:
        return tuple(breakpoints[index].strip().split("--"))
    except IndexError:
        return None


break_apartment_sell = _breakpoint(0)
break_apartment_rent = _breakpoint(1)
break_cottage_sell = _breakpoint(2)
break_cottage_rent = _breakpoint(3)
break_commercial_sell = _breakpoint(4)
break_commercial_rent = _breakpoint(5)

# defining chrome options for selenium
options = Options()
options.add_argument("--no-sandbox")

db = DataBase()
visited_urls = []
48 |
49 |
def transform_date(date):
    """Convert a Russian textual date like '5 марта 2020' to a datetime object."""
    month_names = ("января", "февраля", "марта", "апреля", "мая", "июня",
                   "июля", "августа", "сентября", "октября", "ноября", "декабря")
    day, month, year = date.split()
    return datetime.datetime(int(year), month_names.index(month) + 1, int(day))
72 |
73 |
def get_html(url):
    """Download *url* pretending to be a Chrome browser; return the body as bytes."""
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome, "Referer": url,
                                     "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
                                     "Connection": "keep-alive", "Origin": "https://realty.yandex.ru",
                                     "DNT": "1"})
    # req.encoding can be None (e.g. unusual Content-Type); str.encode(None)
    # raises TypeError, so fall back to the raw response bytes in that case.
    if req.encoding:
        return req.text.encode(req.encoding)
    return req.content
80 |
81 |
def get_title(soup):
    """Return the offer title, or 'Не указано' when the header tag is missing."""
    try:
        return soup.find("h1", class_="offer-card__header-text").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_title\n")
        return "Не указано"
90 |
91 |
def get_address(soup):
    """Split the offer address line into (city, district, street, block number).

    Missing pieces are reported as 'Не указано'; a total failure returns a
    list of four 'Не указано' values (as the original did).
    """
    street_markers = ("ул ", "ул.", "улица", " пер", "проспект", "проезд")

    def _is_street(text):
        lowered = text.lower()
        return any(marker in lowered for marker in street_markers)

    try:
        address = soup.find("h2", class_="offer-card__address ellipsis").text.strip()
        parts = address.split(",")
        district, street = "Не указано", "Не указано"
        city = parts[0]
        block_number = parts[-1].strip()
        # The last comma-separated piece is usually the house number, but may
        # actually be the street when no number is present.
        if _is_street(block_number):
            street, block_number = block_number, "Не указано"

        for piece in parts[1:-1]:
            lowered = piece.lower()
            if _is_street(piece):
                street = piece.strip()
            elif "район" in lowered or "р-н" in lowered:
                district = piece.strip()

        # A trailing house number may be glued onto the street name.
        tokens = street.split()
        if tokens[-1].strip().isdigit():
            block_number = tokens[-1].strip()
            street = " ".join(tokens[:-1]).strip()

        return city, district, street, block_number

    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_address\n")
        return ["Не указано"] * 4
121 |
122 |
def get_block_type(soup):
    """Return the building type; offers without the tag are resale ('Вторичка')."""
    try:
        tag = soup.find("div", class_="offer-card__building-type")
        return "Вторичка" if tag is None else tag.text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_block_type\n")
        return "Не указано"
135 |
136 |
def get_price(soup):
    """Return the offer price string, or 'Не указано' when absent."""
    try:
        return soup.find("h3", class_="offer-price offer-card__price offer-card__price").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_price\n")
        return "Не указано"
145 |
146 |
def get_selling_type(soup):
    """Return the deal terms text ('продажа', 'ипотека', rent terms, ...)."""
    try:
        return soup.find("div", class_="offer-card__terms").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_selling_type\n")
        return "Не указано"
155 |
156 |
def get_seller_type(soup):
    """Return the author note (agency / private seller), or 'Не указано'."""
    try:
        return soup.find("div", class_="offer-card__author-note").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_seller_type\n")
        return "Не указано"
165 |
166 |
def get_seller_name(soup):
    """Return the seller's display name, or 'Не указано' when absent."""
    try:
        return soup.find("div", class_="offer-card__author-name").text.strip()
    except Exception as e:
        # Log like the sibling getters do; the original used a bare `except:`
        # that silently swallowed everything (including KeyboardInterrupt).
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_seller_name\n")
        return "Не указано"
173 |
174 |
def get_photos(soup):
    """Return photo links joined by newlines, or 'Не указано' on failure."""
    try:
        anchors = soup.find("div", class_="offer-card__photos-wrapper").find_all("a")
        return "\n".join("https://realty.yandex.ru" + a.get("href") for a in anchors)
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_photos\n")
        return "Не указано"
188 |
189 |
def get_description(soup):
    """Return the offer description text, or 'Не указано' when missing."""
    try:
        return soup.find("div", class_="offer-card__desc-text").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_description\n")
        return "Не указано"
198 |
199 |
def get_date(soup, which_page):
    """Normalize the offer publish-date label.

    Returns 'YYYY-MM-DD', the marker 'too old' (absolute date older than one
    day), or 'Не указано' on failure.
    which_page: 0 - the listing page, 1 - the offer page itself.
    """
    # 0 - page with offers, 1 - offer itself
    try:
        if which_page == 0:
            date = soup.find("div", class_="OffersSerpItem__publish-date").text.strip()
        else:
            date = soup.find("div", class_="offer-card__lot-date").text.strip()
        # Relative labels "N минут/часов назад" -> absolute calendar date.
        if "назад" in date:
            time_passed = int(date.split()[0])
            if "минут" in date:
                date = str(datetime.datetime.today() - datetime.timedelta(minutes=time_passed)).split()[0]
            elif "часов" in date or "часа" in date or "час" in date:
                date = str(datetime.datetime.today() - datetime.timedelta(hours=time_passed)).split()[0]
            # NOTE(review): labels like "N дней назад" fall through unchanged —
            # confirm whether they should become "too old".
        elif "сейчас" in date:
            date = str(datetime.datetime.today()).split()[0]
        elif date == "вчера":
            date = str(datetime.datetime.today() - datetime.timedelta(days=1)).split()[0]
        elif len(date.split()) >= 3:
            # Absolute dates like "5 марта 2020": stop crawling past one day old.
            # NOTE(review): for a same-day absolute date the timedelta prints as
            # 'H:MM:SS', int() raises, and the date degrades to 'Не указано'.
            transformed_date = transform_date(date)
            days_passed = str(datetime.datetime.today() - transformed_date).split()[0]
            if int(days_passed) > 1:
                date = "too old"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_date\n")
        date = "Не указано"
    return date
227 |
228 |
def get_seller_phone(url):
    """Open the offer in headless Chrome, click 'show phone' and read the number.

    Returns 'Не указано' when anything fails.
    """
    phone = "Не указано"
    vdisplay = None
    driver = None
    try:
        vdisplay = Xvfb()
        vdisplay.start()
        driver = webdriver.Chrome(options=options)
        driver.set_window_size(1920, 1080)
        driver.get(url)

        button = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]/div[3]/div[1]/span/button")
        button.click()
        time.sleep(2)
        phone = driver.find_element_by_xpath('//div[@class="helpful-info__contact-phones-string"]').text
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_seller_phone\n")
    finally:
        # Always release the browser and the virtual display; the original
        # leaked both whenever an exception occurred mid-way.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
        if vdisplay is not None:
            try:
                vdisplay.stop()
            except Exception:
                pass
    return phone
248 |
249 |
def get_apartment_params(soup):
    """Extract apartment attributes from the offer feature table.

    Returns (rooms_number, floor, total_floors, total_area, material, year,
    kitchen_area, living_area); anything absent stays 'Не указано'.
    """
    rooms_number, floor, total_floors, total_area, material, year, kitchen_area, living_area = ["Не указано"] * 8
    try:
        params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-name")]
        values = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-value")]
        for i in range(len(params)):
            if "Количество комнат" in params[i]:
                rooms_number = values[i]
            elif "Год постройки" in params[i]:
                year = values[i]
            elif "Этаж" in params[i]:
                floor, total_floors = values[i].split(" из ")
            elif "Общая площадь" in params[i]:
                total_area = values[i]
            elif "Кухня" in params[i]:
                # Bug fix: this previously overwrote total_area.
                kitchen_area = values[i]
            elif "Жилая" in params[i]:
                # Bug fix: this previously overwrote total_area.
                living_area = values[i]
            elif "Тип здания" in params[i]:
                material = values[i]

        # New buildings may carry the year in the site subtitle ('строится…').
        if year == "Не указано":
            new_block_params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__site-subtitle-item")]
            for param in new_block_params:
                if "строится" in param:
                    year = param
                    break

        # Last resort: the "main feature" badges on the card.
        if year == "Не указано":
            new_params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-note")]
            values = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-title")]
            for i in range(len(new_params)):
                if "год постройки" in new_params[i]:
                    year = values[i]
                    break

    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_apartment_params\n")
    return rooms_number, floor, total_floors, total_area, material, year, kitchen_area, living_area
290 |
291 |
def get_cottage_params(soup):
    """Extract house/cottage attributes from the offer feature table.

    Returns (total_area, land_area, comforts, year, material, total_floors,
    land_status); anything absent stays 'Не указано'.
    """
    total_area, land_area, comforts, year, material, total_floors, land_status = ["Не указано"] * 7
    comfort_words = ["отапливаемый", "отопление", "водопровод", "канализация",
                     "электроснабжение", "свет", "газ", "вода", "интернет",
                     "телефон", "душ"]
    try:
        names = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-name")]
        vals = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-value")]
        for i, name in enumerate(names):
            if "Год постройки" in name:
                year = vals[i]
            elif "Общая площадь" in name:
                total_area = vals[i]
            elif "Площадь участка" in name:
                land_area = vals[i]
            elif "Тип дома" in name:
                material = vals[i]
            elif "Количество этажей" in name:
                total_floors = vals[i]
            elif "Тип участка" in name:
                land_status = vals[i]
            elif any(word in name.lower() for word in comfort_words):
                # Utilities are accumulated into one semicolon-separated string.
                comforts = name.strip() if comforts == "Не указано" else comforts + "; " + name.strip()

        # New buildings may carry the year in the site subtitle ('строится…').
        if year == "Не указано":
            for note in (x.text.strip() for x in soup.find_all("div", class_="offer-card__site-subtitle-item")):
                if "строится" in note:
                    year = note
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_cottage_params\n")
    return total_area, land_area, comforts, year, material, total_floors, land_status
327 |
328 |
def get_commercial_params(soup):
    """Extract commercial-property attributes from the offer feature table.

    Returns (entrance, furniture, additions, area); anything absent stays
    'Не указано'.
    """
    entrance, furniture, additions, area = ["Не указано"] * 4
    addition_words = ["кондиционер", "интернет", "пожарная сигнализация",
                      "вентиляция", "охраняемая парковка", "сигнализация", "лифт"]
    try:
        names = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-name")]
        vals = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-value")]
        for i, name in enumerate(names):
            if "Мебель" in name:
                furniture = vals[i]
            elif "Вход" in name:
                entrance = vals[i]
            elif any(word in name.lower() for word in addition_words) and vals[i].strip() == "да":
                # Amenities flagged "да" accumulate into one string.
                additions = name.strip() if additions == "Не указано" else additions + "; " + name.strip()

        # Total area lives in the "main feature" badges on the card.
        notes = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-note")]
        titles = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-title")]
        for j, note in enumerate(notes):
            if "общая" in note:
                area = titles[j]
                break
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_commercial_params\n")
    return entrance, furniture, additions, area
357 |
358 |
def get_apartment_data(html, url):
    """Assemble the full apartment record for one offer page."""
    page = BeautifulSoup(html, "lxml")

    city, district, street, block_number = get_address(page)
    block_type = get_block_type(page)
    price = get_price(page)
    (rooms_number, floor, total_floors, total_area,
     material, year, kitchen_area, living_area) = get_apartment_params(page)

    # "продажа"/"ипотека" mean a sale; anything else is rent terms.
    selling_detail = get_selling_type(page)
    if "продажа" in selling_detail.lower() or "ипотека" in selling_detail.lower():
        rent_info = "Не аренда"
    else:
        rent_info, selling_detail = selling_detail, "Не указано"

    images = get_photos(page)
    description = get_description(page)
    phone = get_seller_phone(url)
    date = get_date(page, 1)

    return [city, district, street, block_number, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, date, phone, kitchen_area, living_area, floor]
383 |
384 |
def get_cottage_data(html, url):
    """Assemble the full house/cottage record for one offer page."""
    page = BeautifulSoup(html, "lxml")

    header = get_title(page)
    city, district, street, block_number = get_address(page)
    cottage_type = header.split(",")[0]  # e.g. "Дом", "Коттедж", "Дача"
    price = get_price(page)
    (total_area, land_area, comforts, year,
     material, total_floors, land_status) = get_cottage_params(page)

    # "продажа"/"ипотека" mean a sale; anything else is rent terms.
    selling_detail = get_selling_type(page)
    if "продажа" in selling_detail.lower() or "ипотека" in selling_detail.lower():
        rent_info = "Не аренда"
    else:
        rent_info, selling_detail = selling_detail, "Не указано"

    images = get_photos(page)
    description = get_description(page)
    phone = get_seller_phone(url)
    date = get_date(page, 1)
    seller_name = get_seller_name(page)

    return [city, district, street, block_number, rent_info, price, cottage_type,
            total_area, comforts, selling_detail, images, description, date, phone, material,
            total_floors, land_area, land_status, seller_name]
409 |
410 |
def get_commercial_data(html, url):
    """Assemble the full commercial-property record for one offer page."""
    page = BeautifulSoup(html, "lxml")

    header = get_title(page)
    city, district, street, block_number = get_address(page)
    price = get_price(page)
    object_type = header.split(",")[0]  # e.g. "Офис", "Склад"
    entrance, furniture, additions, area = get_commercial_params(page)
    phone = get_seller_phone(url)
    images = get_photos(page)
    description = get_description(page)
    seller_name = get_seller_name(page)
    date = get_date(page, 1)
    office_class = "Не указано"  # the page does not expose an office class

    return [city, district, street, block_number, price, object_type, office_class,
            furniture, entrance, area, date, phone, images, description, seller_name]
428 |
429 |
def crawl_page(first_offer, html, category, sell_type):
    """Process one listing page: fetch every offer on it and store it in the DB.

    Returns True when crawling must stop (no offers left, offers too old, or a
    breakpoint record from the previous run was reached); otherwise None.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    # Pagination is dynamic and the page count is unknown, so we simply check
    # whether the page still contains offers.
    try:
        offers = soup.find("ol", class_="OffersSerp__list").find_all("li", class_="OffersSerp__list-item_type_offer")
    except:
        offers = []
    if offers is None or not offers:
        print("Парсинг завершен ya")
        return True
    k = 0
    for offer in offers:
        try:
            date = get_date(soup, 0)
            if date == "too old":
                print("Парсинг завершен ya")
                return True

            url = "https://realty.yandex.ru" + offer.find("a", class_="OffersSerpItem__link").get("href")
            if url in visited_urls:
                print("ya not unique")
                time.sleep(random.uniform(10, 15))
                continue
            else:
                visited_urls.append(url)
            #print(url)

            data = []
            if category == "Квартиры":
                data = get_apartment_data(get_html(url), url)
                # record the key fields so duplicates can be detected later
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            if first_offer:
                # save the very first record as the stop marker for the next run
                modifier = "w" if (category == "Квартиры" and sell_type == "Продажа") else "a"
                with open("breakpoints/ya.txt", modifier, encoding="utf8") as file:
                    file.write("%s--%s\n" % (data[2], data[5]))
                first_offer = False

            # (street, price) pair identifies where the previous run started
            key_info = (data[2], data[5])

            if any(x == key_info for x in [break_apartment_sell, break_apartment_rent, break_cottage_sell,
                                           break_cottage_rent, break_commercial_sell, break_commercial_rent]):
                print("Парсинг завершен ya")
                return True

            data.insert(4, sell_type)
            #print(*data, sep="\n")
            #print("--------------------------------------")
            if data[0] != "Не указано":
                try:
                    db.insert_data(category, data)
                except:
                    # the connection may have gone stale; reconnect once and retry
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page ya")


        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " ya crawl_page\n")
            #print(e)
            #print("Ошибка в crawl_page")

        k += 1
        if k % 5 == 0:  # after every fifth request, take a longer pause
            time.sleep(100)
        else:
            time.sleep(random.uniform(10, 15))
511 |
512 |
def parse(category_url, category_name, sell_type):
    """Walk the paginated listing, crawling pages until a stop condition fires."""
    # The page number is substituted after the last '=' of the template URL.
    base = category_url[:category_url.rfind("=") + 1]
    page = 0
    completed = False
    while not completed:
        page_url = base + str(page)
        # Only the very first page records the breakpoint for the next run.
        completed = crawl_page(page == 0, get_html(page_url), category_name, sell_type)
        page += 1
523 |
524 |
def main():
    """Crawl every Yandex.Realty category for the Saratov region."""
    global visited_urls
    targets = [
        ("https://realty.yandex.ru/saratovskaya_oblast/kupit/kvartira/?sort=DATE_DESC&page=0", "Квартиры", "Продажа"),
        ("https://realty.yandex.ru/saratovskaya_oblast/snyat/kvartira/?sort=DATE_DESC&page=0", "Квартиры", "Аренда"),
        ("https://realty.yandex.ru/saratovskaya_oblast/kupit/dom/?sort=DATE_DESC&page=0", "Дома", "Продажа"),
        ("https://realty.yandex.ru/saratovskaya_oblast/snyat/dom/?sort=DATE_DESC&page=0", "Дома", "Аренда"),
        ("https://realty.yandex.ru/saratovskaya_oblast/kupit/kommercheskaya-nedvizhimost/?sort=DATE_DESC&page=0", "Коммерческая_недвижимость", "Продажа"),
        ("https://realty.yandex.ru/saratovskaya_oblast/snyat/kommercheskaya-nedvizhimost/?sort=DATE_DESC&page=0", "Коммерческая_недвижимость", "Аренда"),
    ]
    for i, (url, category, sell_type) in enumerate(targets):
        if i:
            # The deduplication cache is per category/sell-type: reset between runs.
            visited_urls = []
        parse(url, category, sell_type)
549 |
550 |
if __name__ == "__main__":
    # Run the full Yandex.Realty crawl, then release the module-level DB connection.
    main()
    db.close()
554 |
--------------------------------------------------------------------------------
/youla_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | from fake_useragent import UserAgent
8 | import datetime
9 | from selenium import webdriver
10 | from xvfbwrapper import Xvfb
11 | from selenium.webdriver.chrome.options import Options
12 | from database import DataBase
13 |
# Shared DB connection and the per-category cache of already-processed offer URLs.
db = DataBase()
visited_urls = []

# defining chrome options for selenium
options = Options()
options.add_argument("--no-sandbox")
20 |
21 |
def get_html(url):
    """Download *url* with a randomized Chrome User-Agent; return the body as bytes."""
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    # req.encoding may be None for unusual content types; str.encode(None)
    # raises TypeError, so fall back to the raw response bytes in that case.
    if req.encoding:
        return req.text.encode(req.encoding)
    return req.content
25 |
26 |
def get_date(html, k):
    """Classify the k-th listing's date: today's or yesterday's ISO date, else 'too old'."""
    soup = BeautifulSoup(html, "lxml")

    try:
        stamp = soup.find_all("span", class_="hidden-xs")[k].text.strip()
        today = datetime.datetime.today()
        if "сегодня" in stamp:
            return str(today).split()[0]
        if "вчера" in stamp:
            return str(today - datetime.timedelta(days=1)).split()[0]
        return "too old"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_date\n")
        return "Не указано"
43 |
44 |
def get_category(html, k):
    """Return the property kind named in the k-th listing title, or None."""
    soup = BeautifulSoup(html, "lxml")
    # Order matters: checked in the same sequence as the original elif chain.
    kinds = ("Квартира", "Дом", "Коттедж", "Таунхаус", "Дача", "Участок")

    try:
        title = soup.find_all("div", class_="product_item__title")[k].text.split(",")[0].strip()
        for kind in kinds:
            if kind in title:
                return kind
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_category\n")
    return None
66 |
67 |
def get_address(driver):
    """Split the offer address into (city, district, street, block number)."""
    try:
        address = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[0].find_elements_by_tag_name("span")[0].text.strip()
        # separating data from the address string
        district, street = "Не указано", "Не указано"
        city = address.split(",")[0]
        block_number = address.split(",")[-1].strip()
        # Geocoders label unnamed streets "unnamed road"; normalize to missing.
        # (Bug fix: the original performed this check inside the isdigit()
        # branch below, where block_number was always numeric, so it never fired.)
        if block_number.lower() == "unnamed road":
            block_number = "Не указано"
        if "ул " in block_number.lower() or "ул." in block_number.lower() or "улица" in block_number.lower() \
                or " пер" in block_number.lower() or "проспект" in block_number.lower() or "проезд" in block_number.lower():
            street = block_number
            block_number = "Не указано"

        for param in address.split(",")[1:-1]:
            if "ул " in param.lower() or "ул." in param.lower() or "улица" in param.lower() \
                    or " пер" in param.lower() or "проспект" in param.lower() or "проезд" in param.lower():
                street = param.strip()
            elif "район" in param.lower() or "р-н" in param.lower():
                district = param.strip()

        # A trailing house number may be glued onto the street string.
        if street.split()[-1].strip().isdigit():
            block_number = street.split()[-1].strip()
            street = " ".join(street.split()[:-1]).strip()

        return city, district, street, block_number
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_address\n")
        return ["Не указано"] * 4
98 |
99 |
def get_selling_type(url):
    """Derive (sell type, rent detail) from the markers embedded in the offer URL."""
    if "prodaja" in url:
        return "Продажа", "Не указано"
    if "arenda" in url:
        detail = "посуточно" if "posutochno" in url else "длительный срок"
        return "Аренда", detail
    return "Не указано", "Не указано"
112 |
113 |
def get_price(driver):
    """Return the price shown in the sticky header, or 'Не указано'."""
    try:
        header = driver.find_element_by_css_selector("div[class='sticky-inner-wrapper']")
        return header.find_element_by_tag_name("span").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_price\n")
        return "Не указано"
122 |
123 |
def get_seller_info(driver):
    """Return (seller type, seller name) from the ProductOwner card."""
    seller_type, seller_name = "Не указано", "Не указано"
    try:
        owner = driver.find_element_by_css_selector("div[data-test-component='ProductOwner']").find_element_by_tag_name("div")
        raw_name = owner.find_element_by_tag_name("a").text.strip()
        # Drop the trailing parenthesized counter, e.g. 'Иван (3)' -> 'Иван '.
        seller_name = raw_name[:raw_name.rfind("(")]
        seller_type = owner.find_element_by_tag_name("div").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_seller_info\n")
    return seller_type, seller_name
135 |
136 |
def get_photos(driver):
    """Collect photo URLs newline-separated; fall back to the gallery image."""
    try:
        sources = (el.get_attribute("src") for el in driver.find_elements_by_tag_name("div"))
        images = "\n".join(src for src in sources if src is not None)
        if not images:
            gallery = driver.find_element_by_css_selector("div[data-test-component='ProductGallery']")
            images = gallery.find_element_by_tag_name("img").get_attribute("src")
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_photos\n")
        images = "Не указано"
    return images
148 |
149 |
def get_description(driver):
    """Return the offer description text, or 'Не указано' when missing."""
    try:
        table = driver.find_element_by_tag_name("table")
        return table.find_elements_by_tag_name("tbody")[1].find_element_by_tag_name("td").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_description\n")
        return "Не указано"
158 |
159 |
def get_seller_phone(driver):
    """Click the 'show phone' button, wait for the popup, and read the number."""
    phone = "Не указано"
    try:
        reveal = driver.find_element_by_css_selector("button[data-test-action='PhoheNumberClick']")
        reveal.click()
        time.sleep(3)  # let the popup with the number render
        phone = driver.find_element_by_xpath('//*[@id="app"]/div[2]/div[10]/div/div/div/div[2]/div[2]/div/a').text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_seller_phone\n")
    return phone
171 |
172 |
def get_apartment_params(driver):
    """Expand the parameters table and extract apartment attributes.

    Returns (material, lift, year, rooms_number, floor, total_floors,
    total_area, kitchen_area, repair); anything absent stays 'Не указано'.
    """
    material, lift, year, rooms_number, floor, total_floors, total_area, kitchen_area, repair = ["Не указано"] * 9
    try:
        expand = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_element_by_tag_name("div")
        expand.click()
        params = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("th")
        values = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("td")
        for i in range(len(params)):
            label = params[i].text.strip()
            if "Комнат в квартире" in label:
                rooms_number = values[i].text.strip()
            elif "Общая площадь" in label:
                total_area = values[i].text.strip()
            # Bug fix: check the more specific label first — "Этаж" is a
            # substring of "Этажность дома", so the old order clobbered the
            # floor value and never set the building height.
            elif "Этажность дома" in label:
                total_floors = values[i].text.strip()
            elif "Этаж" in label:
                floor = values[i].text.strip().split()[0]
            elif "Площадь кухни" in label:
                kitchen_area = values[i].text.strip()
            elif "Ремонт" in label:
                repair = values[i].text.strip()
            elif "Лифт" in label:
                lift = values[i].text.strip()
            elif "Тип дома" in label:
                material = values[i].text.strip()
            elif "Год постройки" in label:
                year = values[i].text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_apartment_params\n")
    return material, lift, year, rooms_number, floor, total_floors, total_area, kitchen_area, repair
203 |
204 |
def get_cottage_params(driver):
    """Expand the parameters table and extract house/cottage attributes.

    Returns (total_area, material, total_floors, bedrooms, land_area, status,
    comforts); anything absent stays 'Не указано'.
    """
    total_area, material, total_floors, bedrooms, land_area, status, comforts = ["Не указано"] * 7
    comfort_labels = ["Электричество", "Газ", "Водоснабжение", "Отопление", "Гараж", "Санузлы"]
    try:
        expander = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_element_by_tag_name("div")
        expander.click()  # reveal the collapsed rows
        names = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("th")
        cells = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("td")
        for i in range(len(names)):
            label = names[i].text.strip()
            if "Площадь дома" in label:
                total_area = cells[i].text.strip()
            elif "Материал дома" in label:
                material = cells[i].text.strip()
            elif "Количество спален" in label:
                bedrooms = cells[i].text.strip()
            elif "Площадь участка" in label:
                land_area = cells[i].text.strip()
            elif "Этажей" in label:
                total_floors = cells[i].text.strip()
            elif "Тип участка" in label:
                status = cells[i].text.strip()
            elif any(word in label for word in comfort_labels):
                # Utilities accumulate as "Label - value; " entries.
                entry = label + " - " + cells[i].text.strip().lower() + "; "
                comforts = entry if comforts == "Не указано" else comforts + entry
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_cottage_params\n")
    return total_area, material, total_floors, bedrooms, land_area, status, comforts
234 |
235 |
def get_apartment_data(url):
    """Open an apartment offer page in headless Chrome and scrape it.

    Returns a list of fields in the column order the "Квартиры" table
    inserter expects: [city, district, street, block_number, sell_type,
    rent_info, price, block_type, rooms_number, total_area, total_floors,
    material, selling_detail, images, description, phone, kitchen_area,
    living_area, floor].
    """
    vdisplay = Xvfb()
    vdisplay.start()
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1920, 1080)
        driver.get(url)

        city, district, street, block_number = get_address(driver)
        sell_type, rent_info = get_selling_type(url)
        if "продажа" in sell_type.lower():
            rent_info = "Не аренда"
        material, lift, year, rooms_number, floor, total_floors, total_area, kitchen_area, repair = get_apartment_params(driver)
        # these two are not published on youla apartment pages
        block_type = "Вторичка"
        living_area = "Не указано"
        price = get_price(driver)
        if "Аренда" in sell_type:
            # daily-rent offer urls contain "posutochno"
            price += "/день" if "posutochno" in url else "/мес."
        images = get_photos(driver)
        description = get_description(driver)
        phone = get_seller_phone(driver)
        selling_detail = "Не указано"
    finally:
        # always release the browser and the virtual display, even when a
        # scraping helper raises — otherwise Chrome/Xvfb processes leak
        driver.quit()
        vdisplay.stop()

    return [city, district, street, block_number, sell_type, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, phone, kitchen_area, living_area, floor]
268 |
269 |
def get_cottage_data(url, category):
    """Open a house/land offer page in headless Chrome and scrape it.

    ``category`` is the listing category ("Дом", "Участок", ...); for
    plots the building fields are overridden with "Участок". Returns a
    list of fields in the column order the "Дома" table inserter expects.
    """
    vdisplay = Xvfb()
    vdisplay.start()
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1920, 1080)
        driver.get(url)

        # the offer type is encoded in the url slug
        if "doma" in url:
            cottage_type = "Дом"
        elif "uchastka" in url:
            cottage_type = "Участок"
        else:
            cottage_type = "Не указано"

        city, district, street, block_number = get_address(driver)
        sell_type, rent_info = get_selling_type(url)
        if "продажа" in sell_type.lower():
            rent_info = "Не аренда"
        price = get_price(driver)
        if "Аренда" in sell_type:
            # daily-rent offer urls contain "posutochno"
            price += "/день" if "posutochno" in url else "/мес."
        total_area, material, total_floors, bedrooms, land_area, status, comforts = get_cottage_params(driver)
        _, seller_name = get_seller_info(driver)
        images = get_photos(driver)
        description = get_description(driver)
        phone = get_seller_phone(driver)
        selling_detail = "Не указано"
    finally:
        # always release the browser and the virtual display, even when a
        # scraping helper raises — otherwise Chrome/Xvfb processes leak
        driver.quit()
        vdisplay.stop()

    if category == "Участок":
        # bare plots have no building to describe
        material, total_floors = "Участок", "Участок"

    return [city, district, street, block_number, sell_type, rent_info, price, cottage_type,
            total_area, comforts, selling_detail, images, description, phone, material,
            total_floors, land_area, status, seller_name]
310 |
311 |
def crawl_page(html):
    """Parse one listing page; return True when crawling should stop.

    Stops either when the page carries no offer cards (past the last page
    of the dynamic pagination) or when a regular offer is older than the
    crawl window. Parsed offers are written through the global ``db`` and
    remembered in ``visited_urls`` to skip duplicates. Per-offer errors
    are logged to logs.txt and the loop continues with the next offer.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    # pagination is dynamic and the page count is unknown, so the end is
    # detected by a page with no offer cards (find_all returns a list-like
    # ResultSet, never None)
    offers = soup.find_all("li", class_="product_item")
    if not offers:
        print("Парсинг завершен youla")
        return True
    k = 0
    for offer in offers:
        try:
            category = get_category(html, k)
            date = get_date(html, k)
            # NOTE(review): cards with a single CSS class appear to be
            # regular (non-promoted) offers sorted by date, so the first
            # too-old one ends the crawl — confirm against the markup
            if date == "too old" and len(offer.get("class")) == 1:
                print("Парсинг завершен youla")
                return True
            elif date == "too old":
                # promoted cards may be stale; keep them with an
                # approximate two-days-ago date
                date = str(datetime.datetime.today() - datetime.timedelta(days=2)).split()[0]
            k += 1
            url = "https://youla.ru" + offer.find("a").get("href")
            if url in visited_urls:
                print("youla not unique")
                time.sleep(random.uniform(10, 15))
                continue
            else:
                visited_urls.append(url)

            # skip uncategorized offers and offers outside Saratov
            if category is None or "saratov" not in url:
                time.sleep(random.uniform(5, 8))
                continue

            data = []
            if category == "Квартира":
                data = get_apartment_data(url)
                data.insert(15, date)
                if data[0] != "Не указано":
                    try:
                        db.insert_data("Квартиры", data)
                    except Exception:
                        # the DB connection may have dropped; reconnect
                        # once and retry (narrowed from a bare except so
                        # KeyboardInterrupt/SystemExit still propagate)
                        db.close()
                        db = DataBase()
                        db.insert_data("Квартиры", data)
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif any(x in category for x in ["Дом", "Коттедж", "Таунхаус", "Дача", "Участок"]):
                data = get_cottage_data(url, category)
                data.insert(13, date)
                if data[0] != "Не указано":
                    try:
                        db.insert_data("Дома", data)
                    except Exception:
                        # same reconnect-once fallback as above
                        db.close()
                        db = DataBase()
                        db.insert_data("Дома", data)
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))

            print("parsed page youla")

        except Exception as e:
            # best-effort: log and move on to the next offer
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " youla crawl_page\n")
379 |
380 |
def parse(url):
    """Walk the paginated listing, bumping the trailing ``page=`` number
    until crawl_page reports that the last page has been reached."""
    base = url[:url.rfind("=") + 1]
    page = 1
    while True:
        if crawl_page(get_html(base + str(page))):
            break
        page += 1
388 |
389 |
def main():
    """Entry point: crawl the last-day Saratov real-estate listings,
    newest first."""
    start_url = ("https://youla.ru/saratov/nedvijimost?attributes[sort_field]=date_published"
                 "&attributes[term_of_placement][from]=-1%20day"
                 "&attributes[term_of_placement][to]=now&page=1")
    parse(start_url)
393 |
394 |
if __name__ == "__main__":
    try:
        main()
    finally:
        # close the shared DB connection even when the crawl dies mid-run;
        # previously an exception in main() skipped db.close() entirely
        db.close()
398 |
--------------------------------------------------------------------------------