├── .gitignore
├── README.md
├── avito_parsing.py
├── cian_parsing.py
├── database.py
├── irr_parsing.py
├── kvadrat64_parsing.py
├── main.py
├── requirements.txt
├── ya_realty_parsing.py
└── youla_parsing.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | .idea/
107 | *.csv
108 | chromedriver.exe
109 | phone.gif
110 | phone_number.png
111 | logs.txt
112 | # breakpoints/
113 | total_data.txt
114 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # real_estate_parsing
--------------------------------------------------------------------------------
/avito_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | import datetime
8 | from fake_useragent import UserAgent
9 | from selenium import webdriver
10 | from selenium.webdriver.chrome.options import Options
11 | from xvfbwrapper import Xvfb
12 | from PIL import Image
13 | from pytesseract import image_to_string
14 | import sys
15 | from database import DataBase
16 |
# Breakpoint records ("title--price" pairs) mark where the previous run
# stopped; crawling halts when one of them is met again (see crawl_page).
with open("breakpoints/avito.txt", "r", encoding="utf8") as file:
    breakpoints = file.readlines()


def _read_breakpoint(index):
    """Return the (title, price) tuple stored on breakpoint line *index*, or None if that line is absent."""
    try:
        return tuple(breakpoints[index].strip().split("--"))
    except IndexError:  # fewer saved breakpoint lines than categories
        return None


# one breakpoint per category, in the order the categories are crawled
break_apartment = _read_breakpoint(0)
break_cottage = _read_breakpoint(1)
break_land = _read_breakpoint(2)
break_commercial = _read_breakpoint(3)


# defining chrome options for selenium
options = Options()
options.add_argument('--no-sandbox')

db = DataBase()
visited_urls = []  # offer URLs already processed within the current category
44 |
45 |
def get_html(url):
    """Fetch *url* with a random desktop Chrome User-Agent and return the body as bytes."""
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    # req.encoding is None when the server omits the charset header;
    # fall back to utf-8 instead of letting str.encode(None) raise TypeError
    return req.text.encode(req.encoding or "utf-8")
49 |
50 |
def get_total_pages(html):
    """Extract the result-page count from a category listing page.

    Terminates the process when the pagination block cannot be parsed,
    since crawling cannot proceed without it.
    """
    soup = BeautifulSoup(html, "lxml")
    try:
        pagination = soup.find("div", class_="pagination-pages clearfix")
        last_href = pagination.find_all("a", class_="pagination-page")[-1].get("href")
        return int(last_href.split("=")[1].split("&")[0])
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_total_pages\n")
        sys.exit(0)
61 |
62 |
def get_title(soup):
    """Return the offer title text, or "Не указано" when it cannot be found."""
    try:
        return soup.find("span", class_="title-info-title-text").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_title\n")
        return "Не указано"
71 |
72 |
def _looks_like_street(part):
    """True when an address fragment looks like a street/lane/avenue name."""
    low = part.lower()
    return any(marker in low for marker in
               ("ул ", "ул.", "улица", " пер", "проспект", "проезд"))


def get_address(soup):
    """Split the offer address into (city, district, street, block_number).

    Components that cannot be recognized are reported as "Не указано".
    """
    try:
        address = "{}, {}".format(soup.find("meta", itemprop="addressLocality").get("content").strip(),
                                  soup.find("span", itemprop="streetAddress").text.strip())
        # separating data from the address string
        district, street = "Не указано", "Не указано"
        city = address.split(",")[0]
        block_number = address.split(",")[-1].strip()
        # the last component is usually the house number, but some offers end
        # with the street name instead
        if _looks_like_street(block_number):
            street = block_number
            block_number = "Не указано"

        for param in address.split(",")[1:-1]:
            if _looks_like_street(param):
                street = param.strip()
            elif "район" in param.lower() or "р-н" in param.lower():
                district = param.strip()

        # "Улица Название 12" — a trailing number is the house number
        if street.split()[-1].strip().isdigit():
            block_number = street.split()[-1].strip()
            street = " ".join(street.split()[:-1]).strip()

        return city, district, street, block_number
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_address\n")
        return ["Не указано"] * 4
102 |
103 |
def get_selling_info(soup):
    """Classify the offer as rent/sale and normalize the price string.

    Returns (sell_type, price, rent_info); all three are "Не указано" on failure.
    """
    try:
        per_meter = False  # True when the rent price is quoted per square meter
        price = soup.find("span", class_="price-value-string js-price-value-string").text.strip()
        if "за сутки" in price:
            sell_type = "Аренда"
            rent_info = "посуточно"
        elif "в месяц" in price:
            sell_type = "Аренда"
            rent_info = "длительный срок"
            # NOTE(review): assumes "за " inside a monthly price label means
            # per-square-meter pricing — confirm against live markup
            if "за " in price:
                per_meter = True
        else:
            sell_type = "Продажа"
            rent_info = "Не аренда"
        # re-read the bare numeric price; the suffix is rebuilt by hand below
        # because writing the raw label hit an encoding error
        price = soup.find("span", class_="js-item-price").text.strip()
        if rent_info == "посуточно":
            price = "от " + price + " за сутки"
        elif rent_info == "длительный срок":
            if per_meter:
                price = price + " в месяц за м2"
            else:
                price = price + " в месяц"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_selling info\n")
        sell_type, price, rent_info = ["Не указано"] * 3
    return sell_type, price, rent_info
133 |
134 |
def get_deposit(soup):
    """Return the deposit line shown under the price (rent offers only)."""
    try:
        return soup.find("div", class_="item-price-sub-price").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_deposit\n")
        return "Не указано"
143 |
144 |
def get_seller_type(soup):
    """Guess whether the offer comes from an agency or the owner.

    The agency info block is only rendered for intermediaries, so its mere
    presence decides the answer.
    """
    try:
        badge = soup.find("div", class_="seller-info-prop seller-info-prop_short_margin")
        return "Посредник" if badge is not None else "Собственник"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_seller_type\n")
        return "Не указано"
157 |
158 |
def get_seller_name(soup):
    """Return the seller's display name, or "Не указано" when missing."""
    try:
        return soup.find("div", class_="seller-info-name").find("a").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_seller_name\n")
        return "Не указано"
167 |
168 |
def get_photos(soup):
    """Collect gallery photo URLs as a newline-joined string.

    Falls back to the single cover image when there is no gallery, and to
    "Не указано" when that is missing too.
    """
    try:
        gallery = soup.find("ul", class_="gallery-list js-gallery-list")
        items = gallery.find_all("li", class_="gallery-list-item js-gallery-list-item")
        # each item's image URL is buried in an inline style attribute
        links = [item.find("span").get("style").split(":")[1].strip()[4:-2] for item in items]
        return "\n".join(links)
    except:
        try:
            return soup.find("span", class_="gallery-img-cover").get("style").split(":")[1].strip()[4:-2]
        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " avito get_photos\n")
            return "Не указано"
186 |
187 |
def get_description(soup):
    """Return the free-text description paragraph, or "Не указано"."""
    try:
        return soup.find("div", class_="item-description-text").find("p").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_description\n")
        return "Не указано"
196 |
197 |
def get_date(soup):
    """Resolve the publication date to ISO "YYYY-MM-DD".

    Only today's and yesterday's offers matter; anything older is returned
    as "too old" so callers can stop crawling.
    """
    try:
        raw = soup.find("div", class_="title-info-metadata-item").text.split(",")[1].strip()
        today = datetime.datetime.today()
        if "сегодня" in raw:
            return str(today).split()[0]
        if "вчера" in raw:
            return str(today - datetime.timedelta(days=1)).split()[0]
        return "too old"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_date\n")
        return "Не указано"
212 |
213 |
def get_seller_phone(url):
    """OCR the seller's phone number (avito renders it as an image).

    Opens the page in Chrome under Xvfb, clicks the "show phone" button,
    screenshots the revealed number and runs pytesseract on the cropped
    region. Returns "Не указано" on any failure.
    """
    vdisplay = Xvfb()
    vdisplay.start()
    phone_text = "Не указано"
    try:
        driver = webdriver.Chrome(options=options)
        try:
            driver.set_window_size(1920, 1080)
            driver.get(url)

            button = driver.find_element_by_xpath('//a[@class="button item-phone-button js-item-phone-button '
                                                  'button-origin button-origin-blue button-origin_full-width '
                                                  'button-origin_large-extra item-phone-button_hide-phone '
                                                  'item-phone-button_card js-item-phone-button_card"]')
            button.click()
            time.sleep(2)
            driver.save_screenshot("phone_number.png")

            image = driver.find_element_by_xpath('//div[@class="item-phone-big-number js-item-phone-big-number"]//*')

            # crop the full-page screenshot down to the phone-number image
            cropped = Image.open("phone_number.png")
            x, y = image.location["x"], image.location["y"]
            width, height = image.size["width"], image.size["height"]
            cropped.crop((x, y, x + width, y + height)).save("phone.gif")

            phone_text = image_to_string(Image.open("phone.gif"))
        finally:
            # quit() must run even on failure so chromedriver doesn't leak
            driver.quit()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_seller_phone\n")
    finally:
        vdisplay.stop()

    return phone_text
249 |
250 |
def get_apartment_params(soup):
    """Read apartment attributes from the offer's parameter list.

    Returns (rooms_number, floor_number, total_floors, material, total_area,
    kitchen_area, living_area, block_type). Unknown fields remain
    "Не указано"; block_type defaults to "Вторичка".
    """
    rooms_number = floor_number = total_floors = material = "Не указано"
    total_area = kitchen_area = living_area = "Не указано"
    block_type = "Вторичка"
    try:
        for item in soup.find_all("li", class_="item-params-list-item"):
            info = item.text.strip()
            # "Этажей в доме" must be tested before "Этаж" (substring overlap)
            if "Количество комнат" in info:
                rooms_number = info.split(":")[1].strip()
            elif "Этажей в доме" in info:
                total_floors = info.split(":")[1].strip()
            elif "Этаж" in info:
                floor_number = info.split(":")[1].strip()
            elif "Тип дома" in info:
                material = info.split(":")[1].strip()
            elif "Общая площадь" in info:
                total_area = info.split(":")[1].split("м²")[0].strip()
            elif "Площадь кухни" in info:
                kitchen_area = info.split(":")[1].split("м²")[0].strip()
            elif "Жилая площадь" in info:
                living_area = info.split(":")[1].split("м²")[0].strip()
            elif "Официальный застройщик" in info or "Название объекта недвижимости" in info:
                # these rows only appear on new-building offers
                block_type = "Новостройка"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_apartment_params\n")
    return rooms_number, floor_number, total_floors, material, total_area, kitchen_area, living_area, block_type
278 |
279 |
def get_cottage_params(soup):
    """Read house/cottage attributes from the offer's parameter list.

    Returns (house_type, total_floors, distance, material, total_area,
    land_area); unknown fields remain "Не указано".
    """
    house_type = total_floors = distance = material = "Не указано"
    total_area = land_area = "Не указано"
    try:
        for item in soup.find_all("li", class_="item-params-list-item"):
            info = item.text.strip()
            if "Вид объекта" in info:
                house_type = info.split(":")[1].strip()
            elif "Этажей в доме" in info:
                total_floors = info.split(":")[1].strip()
            elif "Расстояние до города" in info:
                distance = info.split(":")[1].split("км")[0].strip() + " км"
            elif "Материал стен" in info:
                material = info.split(":")[1].strip()
            elif "Площадь дома" in info:
                total_area = info.split(":")[1].split("м²")[0].strip()
            elif "Площадь участка" in info:
                land_area = info.split(":")[1].split("сот")[0].strip() + " сот"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_cottage_params\n")
    return house_type, total_floors, distance, material, total_area, land_area
302 |
303 |
def get_land_params(soup):
    """Read distance-to-city and plot area for a land offer.

    Returns (distance, area); both default to "Не указано". The units
    ("км", "сот") are stripped and re-appended to normalize formatting.
    """
    distance, area = "Не указано", "Не указано"
    try:
        labels = soup.find_all("span", class_="item-params-label")
        params = soup.find("div", class_="item-params").find_all("span")
        # spans appear to alternate label/value, hence the i * 2 stride
        # NOTE(review): assumes exactly two spans per labelled row — confirm markup
        for i in range(len(labels)):
            info = params[i * 2].text.strip()
            label = labels[i].text.strip()
            if "Расстояние до города" in label:
                distance = info.split(":")[1].split("км")[0].strip() + " км"
            elif "Площадь" in label:
                area = info.split(":")[1].split("сот")[0].strip() + " сот"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_land_params\n")
    return distance, area
320 |
321 |
def get_commercial_params(soup):
    """Read office class and floor area for a commercial offer.

    Returns (office_class, area); both default to "Не указано".
    """
    office_class, area = "Не указано", "Не указано"
    try:
        labels = soup.find_all("span", class_="item-params-label")
        params = soup.find("div", class_="item-params").find_all("span")
        # spans appear to alternate label/value, hence the i * 2 stride
        # NOTE(review): assumes exactly two spans per labelled row — confirm markup
        for i in range(len(labels)):
            info = params[i * 2].text.strip()
            label = labels[i].text.strip()
            if "Площадь" in label:
                area = info.split(":")[1].split("м²")[0].strip()
            elif "Класс здания" in label:
                office_class = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " avito get_commercial_params\n")
    return office_class, area
338 |
339 |
def get_apartment_data(url, html):
    """Assemble the DB row for an apartment offer.

    Returns None for "looking for"/"buying" requests (titles containing
    "сниму"/"куплю"), which are not real listings.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" in title.lower() or "куплю" in title.lower():
        return None

    city, district, street, block_number = get_address(soup)
    sell_type, price, rent_info = get_selling_info(soup)
    (rooms_number, floor_number, total_floors, material,
     total_area, kitchen_area, living_area, block_type) = get_apartment_params(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(url)
    date = get_date(soup)
    selling_detail = "Не указано"  # avito does not publish this field

    return [city, district, street, block_number, sell_type, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, date, phone, kitchen_area, living_area, floor_number]
361 |
362 |
def get_cottage_data(url, html):
    """Assemble the DB row for a house/cottage offer.

    Returns None for "сниму"/"куплю" request titles.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" in title.lower() or "куплю" in title.lower():
        return None

    city, district, street, block_number = get_address(soup)
    sell_type, price, rent_info = get_selling_info(soup)
    house_type, total_floors, distance, material, total_area, land_area = get_cottage_params(soup)
    seller_name = get_seller_name(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(url)
    date = get_date(soup)
    # avito does not publish these fields
    selling_detail, comforts, land_status = ["Не указано"] * 3

    return [city, district, street, block_number, sell_type, rent_info, price, house_type,
            total_area, comforts, selling_detail, images, description, date, phone, material,
            total_floors, land_area, land_status, seller_name]
383 |
384 |
def get_land_data(url, html):
    """Assemble the DB row for a land-plot offer.

    Returns None for "сниму"/"куплю" request titles.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" in title.lower() or "куплю" in title.lower():
        return None

    # the land category is written in parentheses inside the title
    land_type = title[title.find("(") + 1:].split(")")[0] if "(" in title else "Не указано"

    city, district, street, _ = get_address(soup)
    sell_type, price, _ = get_selling_info(soup)
    deposit = get_deposit(soup) if "Аренда" in sell_type else "Не аренда"

    distance, area = get_land_params(soup)
    seller_type = get_seller_type(soup)
    seller_name = get_seller_name(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(url)
    date = get_date(soup)

    return [city, district, street, sell_type, deposit, land_type, distance, area, price, seller_type, images,
            description, seller_name, phone, date]
415 |
416 |
# keyword → commercial object type, checked in order against the lowercased title
_COMMERCIAL_TYPES = (
    ("офис", "Офисное помещение"),
    ("торг", "Торговое помещение"),
    ("гостиница", "Гостиница"),
    ("свобод", "Помещение свободного назначения"),
    ("производ", "Производственное помещение"),
    ("склад", "Складское помещение"),
)


def get_commercial_data(url, html):
    """Assemble the DB row for a commercial-property offer.

    Returns None for "сниму"/"куплю" request titles. The object kind is
    deduced from title keywords (see _COMMERCIAL_TYPES).
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" in title.lower() or "куплю" in title.lower():
        return None

    object_type = "Не указано"
    for keyword, kind in _COMMERCIAL_TYPES:
        if keyword in title.lower():
            object_type = kind
            break

    city, district, street, block_number = get_address(soup)
    sell_type, price, _ = get_selling_info(soup)

    # office_class is only meaningful for offices
    if object_type == "Офисное помещение":
        office_class, area = get_commercial_params(soup)
    else:
        _, area = get_commercial_params(soup)
        office_class = "Не офис"

    seller_name = get_seller_name(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(url)
    date = get_date(soup)
    furniture, entrance = "Не указано", "Не указано"  # not published on avito

    return [city, district, street, block_number, sell_type, price, object_type, office_class,
            furniture, entrance, area, date, phone, images, description, seller_name]
464 |
465 |
def crawl_page(first_offer, html, category):
    """Parse one listing page; return True when crawling should stop.

    Stop conditions: empty listing, an offer dated "2 дня назад" or older,
    or hitting the breakpoint record saved by the previous run.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    try:
        offers = soup.find("div", class_="catalog-list").find_all("div", class_="item_table")
    except Exception:
        offers = []
    if not offers:
        print("Парсинг завершен avito")
        return True
    for offer in offers:
        try:
            if first_offer:
                # save the newest record as the next run's exit point;
                # apartments run first and truncate the file, the rest append
                modifier = "w" if category == "Квартиры" else "a"
                with open("breakpoints/avito.txt", modifier, encoding="utf8") as file:
                    file.write("%s--%s\n" % (offer.find("a", class_="item-description-title-link").get("title"),
                                             offer.find("span", {"class": "price", "itemprop": "price"}).get("content")))
                first_offer = False

            if offer.find("div", class_="js-item-date c-2").text.strip() == "2 дня назад":
                print("Парсинг завершен avito")
                return True

            key_info = (offer.find("a", class_="item-description-title-link").get("title"),
                        offer.find("span", {"class": "price", "itemprop": "price"}).get("content"))
            if any(x == key_info for x in [break_apartment, break_cottage, break_land, break_commercial]):
                print("Парсинг завершен avito")
                return True

            url = "https://avito.ru" + offer.find("div", class_="description").find("h3").find("a").get("href")
            if url in visited_urls:
                print("avito not unique")
                time.sleep(random.uniform(5, 8))
                continue
            visited_urls.append(url)

            # data stays None for request-type offers ("сниму"/"куплю");
            # guarding the total_data.txt writes fixes the former crash on None
            data = None
            if category == "Квартиры":
                data = get_apartment_data(url, get_html(url))
                if data is not None:
                    # key info is recorded so duplicates can be found later
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(url, get_html(url))
                if data is not None:
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Участки":
                data = get_land_data(url, get_html(url))
                if data is not None:
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s\n" % (data[2], data[5], data[7], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(url, get_html(url))
                if data is not None:
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            # fixed: the None check must run before indexing data[0]
            if data is not None and data[0] != "Не указано":
                try:
                    db.insert_data(category, data)
                except Exception:
                    # the connection may have gone stale — reopen and retry once
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page avito")

        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " avito crawl_page\n")

        time.sleep(random.uniform(5, 8))
540 |
541 |
def parse(category_url, base_url, category_name):
    """Walk every result page of one category until crawl_page signals a stop."""
    total_pages = get_total_pages(get_html(category_url))

    for page in range(1, total_pages + 1):
        # same sort/filter parameters as the seed URL, only the page varies
        page_url = base_url + "p=" + str(page) + "&s=104&s_trg=3&bt=1"
        # only the very first page records a new breakpoint
        done = crawl_page(page == 1, get_html(page_url), category_name)
        if done:
            break
556 |
557 |
def main():
    """Crawl all four avito categories, resetting URL de-duplication between them."""
    global visited_urls

    categories = (
        ("https://www.avito.ru/saratovskaya_oblast/kvartiry?p=1&s=104&s_trg=3&bt=1",
         "https://www.avito.ru/saratovskaya_oblast/kvartiry?", "Квартиры"),
        ("https://www.avito.ru/saratovskaya_oblast/doma_dachi_kottedzhi?s=104&s_trg=3&bt=1",
         "https://www.avito.ru/saratovskaya_oblast/doma_dachi_kottedzhi?", "Дома"),
        ("https://www.avito.ru/saratovskaya_oblast/zemelnye_uchastki?s=104&s_trg=3&bt=1",
         "https://www.avito.ru/saratovskaya_oblast/zemelnye_uchastki?", "Участки"),
        ("https://www.avito.ru/saratovskaya_oblast/kommercheskaya_nedvizhimost?s=104&s_trg=3&bt=1",
         "https://www.avito.ru/saratovskaya_oblast/kommercheskaya_nedvizhimost?", "Коммерческая_недвижимость"),
    )
    for start_url, base_url, name in categories:
        visited_urls = []
        parse(start_url, base_url, name)
578 |
579 |
if __name__ == "__main__":
    main()
    # close the shared DB connection opened at import time
    db.close()
583 |
--------------------------------------------------------------------------------
/cian_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | from fake_useragent import UserAgent
8 | import datetime
9 | from selenium import webdriver
10 | from xvfbwrapper import Xvfb
11 | from selenium.webdriver.chrome.options import Options
12 | from database import DataBase
13 |
14 |
db = DataBase()
visited_urls = []  # offer URLs already processed in this run

# defining chrome options for selenium
options = Options()
options.add_argument("--no-sandbox")
21 |
22 |
def get_html(url):
    """Fetch *url* with a random desktop Chrome User-Agent and return the body as bytes."""
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    # req.encoding is None when the server omits the charset header;
    # fall back to utf-8 instead of letting str.encode(None) raise TypeError
    return req.text.encode(req.encoding or "utf-8")
26 |
27 |
def get_title(soup):
    """Return the offer title, or "Не указано" when the page has no <h1>."""
    try:
        title = soup.find("h1").text.strip()
    except Exception as e:
        # log to logs.txt like every other getter in this module,
        # instead of silently dropping the exception
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_title\n")
        title = "Не указано"
    return title
35 |
36 |
def get_address(soup):
    """Split the offer address into (city, district, street, block_number).

    Components that cannot be recognized are reported as "Не указано".
    """
    try:
        address = soup.find("address").text.strip()
        if "На карте" in address:
            # drop the trailing "На карте" map-link caption
            address = address[:address.rfind("На карте")]
        # separating data from the address string
        district, street = "Не указано", "Не указано"
        # cian addresses start with the region, so the city is component #1
        city = address.split(",")[1].strip()
        block_number = address.split(",")[-1].strip()
        if "ул " in block_number.lower() or "ул." in block_number.lower() or "улица" in block_number.lower() \
                or " пер" in block_number.lower() or "проезд" in block_number.lower() or "проспект" in block_number.lower():
            street = block_number
            block_number = "Не указано"

        for param in address.split(",")[1:-1]:
            if "ул " in param.lower() or "ул." in param.lower() or "улица" in param.lower() or " пер" in param.lower() \
                    or "проезд" in param.lower() or "проспект" in param.lower():
                street = param.strip()
            elif "район" in param.lower() or "р-н" in param.lower():
                district = param.strip()

        if street.split()[-1].strip().isdigit():
            block_number = street.split()[-1].strip()
            street = " ".join(street.split()[:-1]).strip()

        return city, district, street, block_number
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            # fixed log tag: this entry was mislabeled "cian get_title"
            file.write(str(e) + " cian get_address\n")
        return ["Не указано"] * 4
67 |
68 |
def get_price(soup):
    """Return the price text; range-priced rentals become "от X до Y/мес."."""
    try:
        tag = soup.find("span", {"itemprop": "price"})
        if tag is None:
            # no fixed price — build a low/high range string instead
            low = soup.find("span", {"itemprop": "lowPrice"}).text.strip()
            high = soup.find("span", {"itemprop": "highPrice"}).text.strip()
            return "от " + low + " до " + high + "/мес."
        return tag.text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_price\n")
        return "Не указано"
82 |
83 |
def get_selling_type(soup):
    """Return the first paragraph whose single CSS class starts with "description--"."""
    try:
        for p in soup.find_all("p"):
            classes = p.get("class")
            if classes is not None and len(classes) == 1 and "description--" in classes[0]:
                return p.text.strip()
        return "Не указано"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_selling_type\n")
        return "Не указано"
97 |
98 |
def get_seller_type(soup):
    """Classify the seller as owner/intermediary from the "honest-container" badge."""
    try:
        badges = [d for d in soup.find_all("div") if d.get("class") is not None
                  and len(d.get("class")) == 1 and "honest-container" in d.get("class")[0]]
        if not badges:
            return "Не указано"
        text = badges[0].text.strip()
        if text is not None and text.lower() == "собственник":
            return "Собственник"
        return "Посредник"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_seller_type\n")
        return "Не указано"
116 |
117 |
def get_seller_name(soup):
    """Return the seller's display name (first <h2> with a "title--*" class).

    Returns "Не указано" when no such heading exists.
    """
    try:
        headings = [x for x in soup.find_all("h2") if x.get("class") is not None and len(x.get("class")) == 1
                    and "title--" in x.get("class")[0]]
        # fixed: an empty match list used to be returned as-is ([]),
        # leaking a list where callers expect a string
        name = headings[0].text.strip() if headings else "Не указано"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_seller_name\n")
        name = "Не указано"
    return name
129 |
130 |
def get_photos(url):
    """Collect full-size photo URLs via selenium.

    Thumbnail URLs end in "-2."; replacing that with "-1." yields the
    original-size image. Returns a newline-joined string or "Не указано".
    """
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.get(url)

        thumbs = driver.find_elements_by_class_name("fotorama__img")
        links = [t.get_attribute("src").replace("-2.", "-1.")
                 for t in thumbs if "-2." in t.get_attribute("src")]
        images = "\n".join(links)
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_photos\n")
        images = "Не указано"
    finally:
        # fixed: the driver was never quit, leaking a chromedriver process per call
        if driver is not None:
            driver.quit()
    return images
148 |
149 |
def get_description(soup):
    """Return the long description paragraph (single CSS class "description-text--*")."""
    try:
        matches = [p for p in soup.find_all("p") if p.get("class") is not None
                   and len(p.get("class")) == 1 and "description-text--" in p.get("class")[0]]
        description = matches[0].text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_description\n")
        description = "Не указано"
    return description
160 |
161 |
def get_date(soup):
    """Resolve the publication date to ISO "YYYY-MM-DD"; older offers become "too old"."""
    try:
        raw = soup.find("div", id="frontend-offer-card").find("main").find_all("div")[4].text.strip()
        today = datetime.datetime.today()
        if "вчера" in raw:
            return str(today - datetime.timedelta(days=1)).split()[0]
        if "сегодня" in raw:
            return str(today).split()[0]
        return "too old"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_date\n")
        return "Не указано"
176 |
177 |
def driver_get_phone_and_images(url):
    """Open the offer in Chrome under Xvfb and return (images, phone).

    Thumbnail URLs end in "-2.", originals in "-1.". The phone number is
    revealed by clicking the "Показать телефон" button. Either value falls
    back to "Не указано" independently.
    """
    vdisplay = Xvfb()
    vdisplay.start()
    try:
        driver = webdriver.Chrome(options=options)
        try:
            driver.set_window_size(1920, 1080)
            driver.get(url)

            try:
                thumbs = driver.find_elements_by_class_name("fotorama__img")
                links = [t.get_attribute("src").replace("-2.", "-1.")
                         for t in thumbs if "-2." in t.get_attribute("src")]
                images = "\n".join(links)
                if not images:
                    # no gallery — fall back to the cover image
                    images = driver.find_element_by_class_name("fotorama__img").get_attribute("src")
            except Exception as e:
                with open("logs.txt", "a", encoding="utf8") as file:
                    file.write(str(e) + " cian get_images\n")
                images = "Не указано"

            try:
                button = [x for x in driver.find_elements_by_tag_name("button")
                          if x.text.strip() == "Показать телефон"][-1]
                button.click()
                phone = "\n".join([x.text.strip() for x in driver.find_elements_by_tag_name("a")
                                   if x.get_attribute("class") is not None
                                   and "phone--" in x.get_attribute("class")])
            except Exception as e:
                phone = "Не указано"
                with open("logs.txt", "a", encoding="utf8") as file:
                    file.write(str(e) + " cian get_phone\n")
        finally:
            # fixed: quit the driver even when setup or page load raises,
            # so chromedriver processes don't leak
            driver.quit()
    finally:
        vdisplay.stop()
    return images, phone
213 |
214 |
def get_apartment_params(soup):
    """Extract apartment attributes from a cian offer page; every field
    defaults to "Не указано" when absent."""
    def texts(tag, marker):
        # texts of elements whose single CSS class contains the marker prefix
        return [el.text.strip() for el in soup.find_all(tag)
                if el.get("class") is not None and len(el.get("class")) == 1
                and marker in el.get("class")[0]]

    (block_type, rooms_number, total_floors, total_area, material,
     year, kitchen_area, living_area, floor) = ["Не указано"] * 9
    try:
        main_params = texts("div", "info-title--")
        main_values = texts("div", "info-text--")
        for i, name in enumerate(main_params):
            if "Общая" in name:
                total_area = main_values[i]
            elif "Построен" in name:
                year = main_values[i]
            elif "Кухня" in name:
                kitchen_area = main_values[i]
            elif "Жилая" in name:
                living_area = main_values[i]

        desc_params = texts("span", "name--")
        desc_values = texts("span", "value--")
        for i, name in enumerate(desc_params):
            # NOTE(review): "Этаж" also substring-matches "Этажей в доме",
            # so that label lands in `floor` and the total_floors branch
            # below can never fire -- kept as-is to preserve behavior
            if "Тип жилья" in name:
                block_type = desc_values[i]
            elif "Количество комнат" in name:
                rooms_number = desc_values[i]
            elif "Этаж" in name:
                floor = desc_values[i]
            elif "Этажей в доме" in name:
                total_floors = desc_values[i]
            elif "Тип дома" in name:
                material = desc_values[i]

        if year == "Не указано":
            # the build year sometimes sits in the building-info section
            building_params = texts("div", "name--")
            building_values = texts("div", "value--")
            for i, name in enumerate(building_params):
                if "Год постройки" in name:
                    year = building_values[i]
                    break
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_apartment_params\n")
    return block_type, rooms_number, total_floors, total_area, material, year, kitchen_area, living_area, floor
261 |
262 |
def get_cottage_params(soup):
    """Extract house/land attributes from a cian offer page.

    Returns (total_area, material, land_area, status, comforts,
    total_floors); each field defaults to "Не указано" when absent.
    """
    total_area, material, land_area, status, comforts, total_floors = ["Не указано"] * 6
    try:
        main_params = [x.text.strip() for x in soup.find_all("div") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "info-title--" in x.get("class")[0]]
        main_values = [x.text.strip() for x in soup.find_all("div") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "info-text--" in x.get("class")[0]]
        for i in range(len(main_params)):
            if "Общая" in main_params[i]:
                total_area = main_values[i]
            elif "Участок" in main_params[i]:
                land_area = main_values[i]
            elif "Тип дома" in main_params[i]:
                material = main_values[i]
            elif "Этажей в доме" in main_params[i]:
                total_floors = main_values[i]

        comforts_list = [x.text.strip() for x in soup.find_all("li") if x.get("class") is not None
                         and len(x.get("class")) == 2 and "item--" in x.get("class")[0]]
        # bug fix: was `if comforts:`, which is always true ("Не указано"
        # is a non-empty string), so a page without comfort items
        # overwrote the default with an empty string
        if comforts_list:
            comforts = "; ".join(comforts_list)

        desc_params = [x.text.strip() for x in soup.find_all("span") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "name--" in x.get("class")[0]]
        desc_values = [x.text.strip() for x in soup.find_all("span") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "value--" in x.get("class")[0]]
        for i in range(len(desc_params)):
            if "Статус участка" in desc_params[i]:
                status = desc_values[i]
            elif land_area == "Не указано" and "Площадь участка" in desc_params[i]:
                land_area = desc_values[i]
            elif material == "Не указано" and "Тип дома" in desc_params[i]:
                material = desc_values[i]
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_cottage_params\n")
    return total_area, material, land_area, status, comforts, total_floors
300 |
301 |
def get_commercial_params(soup):
    """Extract commercial-property attributes from a cian offer page;
    every field defaults to "Не указано" when absent."""
    def texts(tag, marker):
        # texts of elements whose single CSS class contains the marker prefix
        return [el.text.strip() for el in soup.find_all(tag)
                if el.get("class") is not None and len(el.get("class")) == 1
                and marker in el.get("class")[0]]

    area, office_class, floor, furniture, entrance = ["Не указано"] * 5
    try:
        main_params = texts("div", "info-title--")
        main_values = texts("div", "info-text--")
        for i, name in enumerate(main_params):
            if "Класс" in name:
                office_class = main_values[i]
            elif "Этаж" in name:
                floor = main_values[i]
            elif "Площадь" in name:
                area = main_values[i]

        desc_params = texts("span", "name--")
        desc_values = texts("span", "value--")
        for i, name in enumerate(desc_params):
            if "Вход" in name:
                entrance = desc_values[i]
            elif "Мебель" in name:
                furniture = desc_values[i]
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_commercial_params\n")
    return area, office_class, floor, furniture, entrance
330 |
331 |
def get_apartment_data(html, url):
    """Collect all apartment fields for one cian offer page, in the
    column order of the Квартиры table (minus sell_type, which the
    caller inserts)."""
    soup = BeautifulSoup(html, "lxml")

    city, district, street, block_number = get_address(soup)
    price = get_price(soup)
    (block_type, rooms_number, total_floors, total_area, material,
     year, kitchen_area, living_area, floor) = get_apartment_params(soup)

    selling_detail = get_selling_type(soup)
    lowered = selling_detail.lower()
    if "продажа" in lowered or "ипотека" in lowered:
        rent_info = "Не аренда"
    else:
        # a rent offer: the "selling type" field actually holds rent terms
        rent_info, selling_detail = selling_detail, "Не указано"

    description = get_description(soup)
    date = get_date(soup)
    images, phone = driver_get_phone_and_images(url)

    return [city, district, street, block_number, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, date, phone, kitchen_area, living_area, floor]
353 |
354 |
def get_cottage_data(html, url):
    """Collect all house/cottage fields for one cian offer page, in the
    column order of the Дома table (minus sell_type)."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    city, district, street, block_number = get_address(soup)
    price = get_price(soup)
    # the object kind is the first comma-separated part of the title
    cottage_type = title.split(",")[0]
    total_area, material, land_area, status, comforts, total_floors = get_cottage_params(soup)

    selling_detail = get_selling_type(soup)
    lowered = selling_detail.lower()
    if "продажа" in lowered or "ипотека" in lowered:
        rent_info = "Не аренда"
    else:
        # a rent offer: the "selling type" field actually holds rent terms
        rent_info, selling_detail = selling_detail, "Не указано"

    description = get_description(soup)
    date = get_date(soup)
    images, phone = driver_get_phone_and_images(url)
    seller_name = get_seller_name(soup)

    return [city, district, street, block_number, rent_info, price, cottage_type,
            total_area, comforts, selling_detail, images, description, date, phone, material,
            total_floors, land_area, status, seller_name]
377 |
378 |
def get_commercial_data(html, url):
    """Collect all commercial-property fields for one cian offer page,
    in the column order of the Коммерческая_недвижимость table."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    city, district, street, block_number = get_address(soup)
    price = get_price(soup)

    # classify the premises by title keywords; first match wins
    lowered = title.lower()
    keyword_types = (
        ("офис", "Офисное помещение"),
        ("торговая площадь", "Торговая площадь"),
        ("склад", "Склад"),
        ("своб. назнач.", "Свободного назначения"),
        ("свободное назначение", "Свободного назначения"),
        ("гараж", "Гараж"),
        ("автосервис", "Автосервис"),
        ("производство", "Производство"),
        ("готовый бизнес", "Готовый бизнес"),
    )
    object_type = "Не указано"
    for keyword, label in keyword_types:
        if keyword in lowered:
            object_type = label
            break

    area, office_class, floor, furniture, entrance = get_commercial_params(soup)
    if object_type != "Офисное помещение":
        office_class = "Не офис"
    description = get_description(soup)
    date = get_date(soup)
    images, phone = driver_get_phone_and_images(url)
    seller_name = get_seller_name(soup)

    return [city, district, street, block_number, price, object_type, office_class,
            furniture, entrance, area, date, phone, images, description, seller_name]
415 |
416 |
def crawl_page(page, html, category, sell_type):
    """Parse one search-result page and store each fresh offer in the DB.

    Returns True when the crawl should stop (pagination wrapped back to
    page 1, no offers on the page, or a too-old offer was reached),
    otherwise None.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    # cian redirects past-the-end pages back to page 1: if we are beyond
    # page 1 but the active paginator item reads "1", we are done.
    # (bug fix: guard against <li> tags without a class attribute, which
    # previously made len(None) raise TypeError outside any try block)
    if page != 1 and "".join([x.text.strip() for x in soup.find_all("li")
                              if x.get("class") is not None and len(x.get("class")) == 2
                              and "list-item--active" in "".join(x.get("class"))]) == "1":
        print("Парсинг завершен cian")
        return True
    # pagination is dynamic, so the page count is unknown up front;
    # instead check whether the page still contains offers
    try:
        offers = [x for x in soup.find("div", id="frontend-serp").find("div").find_all("div")
                  if x.get("class") is not None and "offer-container" in x.get("class")[0]]
    except Exception:
        offers = []
    if offers is None or not offers:
        print("Парсинг завершен cian")
        return True
    for offer in offers:
        try:
            url = offer.find("a").get("href")
            if url in visited_urls:
                print("cian not unique")
                time.sleep(random.uniform(5, 8))
                continue
            else:
                visited_urls.append(url)

            data = []
            if category == "Квартиры":
                data = get_apartment_data(get_html(url), url)
                # record the key fields so duplicates can be found later
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            # position of the offer date within the data list
            index_of_date = -1
            if category == "Квартиры" or category == "Коммерческая_недвижимость":
                index_of_date = -5
            elif category == "Дома":
                index_of_date = -7
            elif category == "Участки":
                index_of_date = -1
            if data[index_of_date] == "too old":
                print("Парсинг завершен cian")
                return True

            data.insert(4, sell_type)
            if data[0] != "Не указано":
                try:
                    db.insert_data(category, data)
                except Exception:
                    # the connection may have timed out -- reconnect once
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page cian")

        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " cian crawl_page\n")

        time.sleep(random.uniform(5, 8))
489 |
490 |
def parse(category_url, category_name, sell_type):
    """Walk the result pages of one category until crawl_page signals
    completion by returning a truthy value."""
    page = 1
    done = False
    while not done:
        # swap the page number after the last '=' in the category url
        paged_url = category_url[:category_url.rfind("=") + 1] + str(page)
        done = crawl_page(page, get_html(paged_url), category_name, sell_type)
        page += 1
498 |
499 |
def main():
    """Crawl every cian category/deal-type combination for the Saratov region.

    bug fix: the original URLs contained "®ion=4609" -- an HTML-entity
    mojibake of "&region=4609" ("&reg" rendered as the (R) sign) -- which
    produced invalid query strings; repaired below.
    """
    global visited_urls
    jobs = (
        ("https://saratov.cian.ru/cat.php?deal_type=sale&engine_version=2&object_type%5B0%5D=1&offer_type=suburban&region=4609&totime=86400&page=1",
         "Дома", "Продажа"),
        ("https://saratov.cian.ru/cat.php?deal_type=rent&engine_version=2&object_type%5B0%5D=1&offer_type=suburban&region=4609&totime=86400&page=1",
         "Дома", "Аренда"),
        ("https://saratov.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=offices&office_type%5B0%5D=1&office_type%5B10%5D=12&office_type%5B1%5D=2&office_type%5B2%5D=3&office_type%5B3%5D=4&office_type%5B4%5D=5&office_type%5B5%5D=6&office_type%5B6%5D=7&office_type%5B7%5D=9&office_type%5B8%5D=10&office_type%5B9%5D=11&region=4609&totime=86400&page=1",
         "Коммерческая_недвижимость", "Продажа"),
        ("https://saratov.cian.ru/cat.php?deal_type=rent&engine_version=2&offer_type=offices&office_type%5B0%5D=1&office_type%5B10%5D=12&office_type%5B1%5D=2&office_type%5B2%5D=3&office_type%5B3%5D=4&office_type%5B4%5D=5&office_type%5B5%5D=6&office_type%5B6%5D=7&office_type%5B7%5D=9&office_type%5B8%5D=10&office_type%5B9%5D=11&region=4609&totime=86400&page=1",
         "Коммерческая_недвижимость", "Аренда"),
        ("https://saratov.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat&region=4609&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1&room7=1&room9=1&totime=86400&page=1",
         "Квартиры", "Продажа"),
        ("https://saratov.cian.ru/cat.php?deal_type=rent&engine_version=2&offer_type=flat&region=4609&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1&room7=1&room9=1&totime=86400&page=1",
         "Квартиры", "Аренда"),
    )
    for number, (url, category, sell_type) in enumerate(jobs):
        if number:
            # uniqueness tracking is per category/deal type; the first run
            # keeps whatever the module initialized (same as the original)
            visited_urls = []
        parse(url, category, sell_type)
524 |
525 |
# script entry point: run the full crawl, then close the module-level
# database connection opened at import time
if __name__ == "__main__":
    main()
    db.close()
529 |
--------------------------------------------------------------------------------
/database.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import mysql.connector
4 | from mysql.connector import Error
5 |
# the credentials below are placeholders; replace them with the real connection settings
7 | host = "host"
8 | database = "db"
9 | user = "user"
10 | password = "pass"
11 |
12 |
class DataBase:
    """Thin wrapper around a MySQL connection used by the parser scripts."""

    def __init__(self):
        """Connect to MySQL; on failure report to stdout and leave the
        handles as None so close() remains safe to call."""
        # bug fix: previously a failed connect left `conn`/`cursor` unset,
        # so any later attribute access raised AttributeError
        self.conn = None
        self.cursor = None
        try:
            self.conn = mysql.connector.connect(host=host, database=database, user=user, password=password)
            self.cursor = self.conn.cursor()
        except Error as error:
            print("Error while connecting to database", error)

    def close(self):
        """Close the cursor and the connection; idempotent and safe even
        after a failed connect."""
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()

    def create_table(self, category):
        """Create the table backing *category* if it does not exist yet."""
        if category == "Квартиры":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Квартиры"
                                "(Город TEXT, Район TEXT, Улица TEXT, Номер_дома TEXT, "
                                "Тип_сделки TEXT, Срок_аренды TEXT, Цена TEXT, Тип_дома TEXT, Количество_комнат TEXT, "
                                "Общая_площадь TEXT, Количество_этажей TEXT, Материал_стен TEXT, Тип_продажи TEXT, "
                                "Фото TEXT, Описание TEXT, Дата TEXT, Телефон TEXT, Площадь_кухни TEXT, Жилая_площадь TEXT, "
                                "Этаж TEXT);")

        elif category == "Дома":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Дома"
                                "(Город TEXT, Район TEXT, Улица TEXT, Номер_дома TEXT, Тип_сделки TEXT, Срок_аренды TEXT, "
                                "Цена TEXT, Тип_дома TEXT, Площадь_дома TEXT, Удобства TEXT, Тип_продажи TEXT, "
                                "Фото TEXT, Описание TEXT, Дата TEXT, Телефон TEXT, Материал_стен TEXT, "
                                "Количество_этажей TEXT, Площадь_участка TEXT, Статус_участка TEXT, Имя_продавца TEXT);")

        elif category == "Коммерческая_недвижимость":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Коммерческая_недвижимость"
                                "(Город TEXT, Район TEXT, Улица TEXT, Номер_дома TEXT, Тип_сделки TEXT, Цена TEXT, "
                                "Тип_недвижимости TEXT, Класс_здания TEXT, Мебель TEXT, Вход TEXT, Общая_площадь TEXT, "
                                "Дата TEXT, Телефон TEXT, Фото TEXT, Описание TEXT, Имя_продавца TEXT);")

        elif category == "Участки":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Участки"
                                "(Город TEXT, Район TEXT, Улица TEXT, Тип_сделки TEXT, Залог TEXT, Статус_участка TEXT, "
                                "Расстояние_до_города TEXT, Площадь_участка TEXT, Цена TEXT, Право_собственности TEXT, "
                                "Фото TEXT, Описание TEXT, Имя_продавца TEXT, Телефон TEXT, Дата TEXT);")

        elif category == "Дубликаты":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Дубликаты (Заголовок TEXT, URLs TEXT);")

    def insert_data(self, table_name, data):
        """Insert one row into *table_name* and commit.

        NOTE: the table name is string-interpolated because MySQL
        placeholders cannot bind identifiers; callers pass only the
        fixed category names above, never user input.
        """
        data_string = ', '.join(['%s'] * len(data))
        query = "INSERT INTO %s VALUES (%s);" % (table_name, data_string)
        self.cursor.execute(query, data)
        self.conn.commit()
61 |
--------------------------------------------------------------------------------
/irr_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | from fake_useragent import UserAgent
8 | import datetime
9 | import base64
10 | from database import DataBase
11 |
12 |
# breakpoints: the records at which each category's crawl should stop
with open("breakpoints/irr.txt", "r", encoding="utf8") as file:
    breakpoints = file.readlines()


def _read_breakpoint(index):
    # one "--"-separated record per line; None when the line is missing
    # (replaces six copy-pasted bare try/except blocks)
    try:
        return tuple(breakpoints[index].strip().split("--"))
    except Exception:
        return None


break_apartment_sell = _read_breakpoint(0)
break_apartment_rent = _read_breakpoint(1)
break_commercial_sell = _read_breakpoint(2)
break_commercial_rent = _read_breakpoint(3)
break_cottage_sell = _read_breakpoint(4)
break_cottage_rent = _read_breakpoint(5)

# build the stop date in "<day> <month name>" form (no leading zeros)
# NOTE(review): the original comment said "yesterday" but the offset is
# 2 days; get_date() below uses the same offset, so it looks deliberate
# -- confirm before changing
today = datetime.datetime.today()
yesterday = str(today - datetime.timedelta(days=2)).split()[0].split("-")
if yesterday[1][0] == "0":
    yesterday[1] = yesterday[1][1:]
if yesterday[2][0] == "0":
    yesterday[2] = yesterday[2][1:]
months = {
    "1": "января",
    "2": "февраля",
    "3": "марта",
    "4": "апреля",
    "5": "мая",
    "6": "июня",
    "7": "июля",
    "8": "августа",
    "9": "сентября",
    "10": "октября",
    "11": "ноября",
    "12": "декабря"
}
date_break_point = yesterday[2] + " " + months[yesterday[1]]

db = DataBase()
visited_urls = []
66 |
67 |
def get_html(url):
    """Download *url* with a random Chrome user agent and return the
    response body as bytes."""
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    if req.encoding is None:
        # bug fix: str.encode(None) raises TypeError when the server
        # declares no charset; fall back to the raw response bytes
        return req.content
    return req.text.encode(req.encoding)
71 |
72 |
def get_total_pages(html):
    """Return the number of result pages shown by the paginator,
    or 1 when there is no paginator on the page."""
    soup = BeautifulSoup(html, "lxml")
    pagination = soup.find("div", class_="pagination__pages")
    if pagination is None:
        return 1
    last_link = pagination.find_all("a", class_="pagination__pagesLink")[-1]
    return int(last_link.text.strip())
81 |
82 |
def get_title(soup):
    """Return the offer headline, or "Не указано" when missing."""
    try:
        return soup.find("h1", class_="productPage__title").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_title\n")
        return "Не указано"
91 |
92 |
def get_address(soup):
    """Return the city -- the first comma-separated component of the
    address line -- or "Не указано" on failure."""
    try:
        full_address = soup.find("div", class_="productPage__infoTextBold js-scrollToMap").text.strip()
        return full_address.split(",")[0]
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_address\n")
        return "Не указано"
102 |
103 |
def get_material(soup):
    """Return the wall material from the last building-info column,
    or "Не указано" when absent."""
    material = "Не указано"
    try:
        last_column = soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock")[-1]
        for item in last_column.find_all("li", class_="productPage__infoColumnBlockText"):
            line = item.text.strip()
            if "Материал стен" in line:
                material = line.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_material\n")
        material = "Не указано"
    return material
117 |
118 |
def get_price(soup):
    """Return (price, rent_info): the price text with non-breaking spaces
    normalized and any fee appended, plus whether the offer is a
    monthly/daily rent. ("Не указано", "Не указано") on failure."""
    try:
        price = " ".join(soup.find("div", class_="productPage__price").text.strip().split("\xa0"))
        fee = soup.find("div", class_="productPage__fee")
        if fee is not None:
            price += " (" + fee.text.strip() + ")"

        # rent offers advertise a per-month or per-day price
        rent_info = "Не аренда"
        for marker, label in (("в месяц", "длительный срок"), ("за сутки", "посуточно")):
            if marker in price:
                rent_info = label
                break
        return price, rent_info
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_price\n")
        return "Не указано", "Не указано"
138 |
139 |
def get_block_type(soup):
    """Return "Новостройка" when the page links to a developer site,
    otherwise "Вторичка"."""
    try:
        is_new_building = soup.find("a", class_="js-sellerSiteLink") is not None
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_block_type\n")
        is_new_building = False
    return "Новостройка" if is_new_building else "Вторичка"
150 |
151 |
def get_seller_info(soup):
    """Return (seller_type, seller_name): a company when the seller block
    contains a link, otherwise a private person; both "Не указано" on failure."""
    try:
        seller_block = soup.find("div", class_="productPage__infoTextBold productPage__infoTextBold_inline")
        company_link = seller_block.find("a")
        if company_link is not None:
            return "Компания", company_link.text.strip()
        private_name = soup.find("div", class_="productPage__infoTextBold productPage__infoTextBold_inline").text.strip()
        return "Частное лицо", private_name
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_seller_info\n")
        return "Не указано", "Не указано"
166 |
167 |
def get_photos(soup):
    """Return gallery image URLs joined by newlines, or "Не указано"."""
    try:
        gallery = soup.find("div", class_="lineGallery js-lineProductGallery")
        links = [meta.get("content") for meta in gallery.find_all("meta")]
        return "\n".join(links)
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_photos\n")
        return "Не указано"
181 |
182 |
def get_description(soup):
    """Return the offer description with whitespace collapsed to single
    spaces, or "Не указано" on failure."""
    try:
        raw = soup.find("p", class_="productPage__descriptionText js-productPageDescription").text
        return " ".join(raw.strip().split())
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_description\n")
        return "Не указано"
191 |
192 |
def get_date(soup):
    """Return the publication date; relative stamps ("сегодня, 12:30")
    become absolute dates, absolute stamps pass through unchanged,
    "Не указано" on failure."""
    try:
        stamp = soup.find("div", class_="productPage__createDate").find("span").text.strip()
        if "," not in stamp:
            # already an absolute date
            return stamp
        time_part = stamp.split(",")[1]
        if stamp.split(",")[0] == "сегодня":
            base = datetime.datetime.today()
        else:
            # NOTE(review): "not today" maps two days back, matching the
            # module-level break-point offset -- confirm intentional
            base = datetime.datetime.today() - datetime.timedelta(days=2)
        return str(base).split()[0] + time_part
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_date\n")
        return "Не указано"
208 |
209 |
def get_seller_phone(soup):
    """Return the seller's phone number, decoded from the base64 value
    embedded in the page, or "Не указано" on failure.

    bug fix: the original assigned "Не указано" to the ciphered value on
    error and then unconditionally base64-decoded it, which raised on the
    non-ASCII fallback text; return the sentinel directly instead.
    """
    try:
        ciphered_phone = soup.find("input", {"class": "js-backendVar", "name": "phoneBase64"}).get("value")
        return base64.b64decode(ciphered_phone).decode("utf-8")
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_seller-phone\n")
        return "Не указано"
218 |
219 |
def get_apartment_params(soup):
    """Extract apartment attributes from the irr info columns; every
    field defaults to "Не указано" when absent."""
    (rooms_number, floor, total_floors, total_area, kitchen_area,
     living_area, furnish, district, street, block_number) = ["Не указано"] * 10
    try:
        items = []
        for column in soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock"):
            items.extend(column.find_all("li", class_="productPage__infoColumnBlockText"))

        for item in items:
            info = item.text.strip()
            if "Этаж:" in info:
                floor = info.split(":")[1].strip()
            elif "Этажей в здании" in info:
                total_floors = info.split(":")[1].strip()
            elif "Комнат в квартире" in info:
                rooms_number = info.split(":")[1].strip()
            elif "Общая площадь" in info:
                total_area = info.split(":")[1].strip()
            elif "Жилая площадь" in info:
                living_area = info.split(":")[1].strip()
            elif "Площадь кухни" in info:
                kitchen_area = info.split(":")[1].strip()
            elif "Ремонт" in info:
                furnish = info.split(":")[1].strip()
                if furnish == "1":
                    # the site sometimes reports a bare "1" even though
                    # the field is empty on the page
                    furnish = "Не указано"
            elif "Улица" in info:
                street = info.split(":")[1].strip()
            elif "Район города" in info:
                district = info.split(":")[1].strip()
            elif "Дом" in info:
                block_number = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_apartment_params\n")
    return rooms_number, floor, total_floors, total_area, kitchen_area, living_area, furnish, district, street, block_number
257 |
258 |
def get_commercial_params(soup):
    """Extract commercial-property attributes from the irr info columns;
    every field defaults to "Не указано" when absent."""
    (building_type, parking, ceilings, area, entrance,
     district, street, block_number) = ["Не указано"] * 8
    try:
        items = []
        for column in soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock"):
            items.extend(column.find_all("li", class_="productPage__infoColumnBlockText"))

        for item in items:
            info = item.text.strip()
            if "Тип здания" in info:
                building_type = info.split(":")[1].strip()
            elif "Общая площадь" in info:
                area = info.split(":")[1].strip()
            elif "Парковка" in info:
                # presence of the line is all the site communicates
                parking = "Парковка есть"
            elif "Высота потолков" in info:
                ceilings = info.split(":")[1].strip()
            elif "Вход" in info:
                entrance = info.strip()
            elif "Улица" in info:
                street = info.split(":")[1].strip()
            elif "Район города" in info:
                district = info.split(":")[1].strip()
            elif "Дом" in info:
                block_number = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_commercial_params\n")
    return building_type, parking, ceilings, area, entrance, district, street, block_number
289 |
290 |
def get_cottage_params(soup):
    """Extract house/land attributes from the irr info columns; every
    field defaults to "Не указано" when absent."""
    (house_area, material, total_floors, land_area, status,
     comforts, district, street, block_number) = ["Не указано"] * 9
    comfort_markers = ("отапливаемый", "отопление", "водопровод", "канализация",
                       "свет", "газ", "вода", "интернет", "телефон")
    try:
        items = []
        for column in soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock"):
            items.extend(column.find_all("li", class_="productPage__infoColumnBlockText"))
        for item in items:
            info = item.text.strip()
            if "Площадь участка" in info:
                land_area = info.split(":")[1].strip()
            elif "Площадь строения" in info:
                house_area = info.split(":")[1].strip()
            elif "Материал стен" in info:
                material = info.split(":")[1].strip()
            elif "Количество этажей" in info:
                total_floors = info.split(":")[1].strip()
            elif "Вид разрешенного использования" in info:
                status = info.split(":")[1].strip()
            elif any(marker in info.lower() for marker in comfort_markers):
                # collect utility/comfort lines into one "; "-joined field
                comforts = info if comforts == "Не указано" else comforts + "; " + info
            elif "Улица" in info:
                street = info.split(":")[1].strip()
            elif "Район города" in info:
                district = info.split(":")[1].strip()
            elif "Дом" in info:
                block_number = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_cottage_params\n")
    return house_area, material, total_floors, land_area, status, comforts, district, street, block_number
326 |
327 |
def get_apartment_data(html):
    """Collect all apartment fields from one irr offer page, in the
    column order of the Квартиры table (minus sell_type)."""
    soup = BeautifulSoup(html, "lxml")

    city = get_address(soup)
    material = get_material(soup)
    (rooms_number, floor, total_floors, total_area, kitchen_area,
     living_area, furnish, district, street, block_number) = get_apartment_params(soup)
    price, rent_info = get_price(soup)
    block_type = get_block_type(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(soup)
    date = get_date(soup)
    selling_detail = "Не указано"  # irr does not publish this field

    return [city, district, street, block_number, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, date, phone, kitchen_area, living_area, floor]
347 |
348 |
def get_commercial_data(html):
    """Collect all commercial-property fields from one irr offer page,
    in the column order of the Коммерческая_недвижимость table."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    # classify the premises by title keywords; first match wins
    lowered = title.lower()
    keyword_types = (
        ("офис", "Офисное помещение"),
        ("торг", "Торговое помещение"),
        ("гостиница", "Гостиница"),
        ("производ", "Производственное помещение"),
        ("склад", "Складское помещение"),
        ("помещение", "Помещение свободного назначения"),
    )
    object_type = "Не указано"
    for keyword, label in keyword_types:
        if keyword in lowered:
            object_type = label
            break

    city = get_address(soup)
    building_type, parking, ceilings, area, entrance, district, street, block_number = get_commercial_params(soup)
    price, rent_info = get_price(soup)
    seller_type, seller_name = get_seller_info(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(soup)
    date = get_date(soup)
    office_class, furniture = "Не указано", "Не указано"  # irr does not publish these

    return [city, district, street, block_number, price, object_type, office_class,
            furniture, entrance, area, date, phone, images, description, seller_name]
381 |
382 |
def get_cottage_data(html):
    """Collect country-house fields from an irr offer page into a list."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)

    # Derive the object kind from the first matching keyword in the title.
    lowered = title.lower()
    object_type = "Не указано"
    for keyword, kind in (("дом", "Дом"), ("участок", "Участок"), ("таунхаус", "Таунхаус")):
        if keyword in lowered:
            object_type = kind
            break

    city = get_address(soup)
    price, rent_info = get_price(soup)
    house_area, material, total_floors, land_area, status, comforts, district, street, block_number = get_cottage_params(soup)
    _, seller_name = get_seller_info(soup)
    date = get_date(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(soup)
    # irr does not publish the sale terms.
    selling_detail = "Не указано"

    return [city, district, street, block_number, rent_info, price, object_type,
            house_area, comforts, selling_detail, images, description, date, phone, material,
            total_floors, land_area, status, seller_name]
411 |
412 |
def crawl_page(first_offer, html, category, sell_type):
    """Parse one irr listing page and store each offer in the database.

    Returns True when crawling of this category should stop (no offers left,
    the date breakpoint is reached, or a previously saved record reappears);
    returns None otherwise.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    try:
        offers = soup.find("div", class_="listing js-productGrid ").find_all("div", class_="listing__item")
    except:
        # NOTE(review): bare except also hides programming errors; the intent
        # here is only to cover a missing listing container.
        offers = []
    if offers is None or not offers:
        print("Парсинг завершен irr")
        return True
    for offer in offers:
        try:
            date = offer.find("span", class_="listing__itemDate").find("div", class_="updateProduct").text.strip()
            # date_break_point is a module-level value defined above this chunk.
            if date == date_break_point:
                print("Парсинг завершен irr")
                return True

            url = offer.find("div", class_="listing__itemTitleWrapper").find("a", class_="listing__itemTitle").get("href")
            if url in visited_urls:
                print("irr not unique")
                time.sleep(random.uniform(5, 8))
                continue
            else:
                visited_urls.append(url)
            #print(url)

            data = []
            if category == "Квартиры":
                data = get_apartment_data(get_html(url))
                # record key fields so duplicates can be found later
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(get_html(url))
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(get_html(url))
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            if first_offer:
                # save the very first record as the exit point for the next run
                modifier = "w" if (category == "Квартиры" and sell_type == "Продажа") else "a"
                with open("breakpoints/irr.txt", modifier, encoding="utf8") as file:
                    file.write("%s--%s\n" % (data[2], data[5]))
                first_offer = False

            key_info = (data[2], data[5])

            if any(x == key_info for x in [break_apartment_sell, break_apartment_rent, break_commercial_sell,
                                           break_commercial_rent, break_cottage_sell, break_cottage_rent]):
                print("Парсинг завершен irr")
                return True

            data.insert(4, sell_type)
            if data[0] != "Не указано":
                try:
                    db.insert_data(category, data)
                except:
                    # reconnect once and retry on any DB failure
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page irr")
            #print(data)

        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " irr crawl_page\n")

        time.sleep(random.uniform(5, 8))
484 |
485 |
def parse(category_url, category_name, sell_type):
    """Walk every listing page of an irr category until crawl_page signals completion."""
    total_pages = get_total_pages(get_html(category_url))

    for page in range(1, total_pages + 1):
        # Page URLs are the category URL with "page<N>" appended.
        page_url = category_url + "page" + str(page)
        # The first page records the breakpoint; later pages do not.
        if crawl_page(page == 1, get_html(page_url), category_name, sell_type):
            break
499 |
500 |
def main():
    """Parse every irr.ru category, resetting the duplicate-URL cache between runs."""
    global visited_urls
    # The site separates sale and rent listings, so each category/deal-type
    # pair has its own date-sorted listing URL.
    jobs = (
        ("https://saratovskaya-obl.irr.ru/real-estate/apartments-sale/sort/date_sort:desc/", "Квартиры", "Продажа"),
        ("https://saratovskaya-obl.irr.ru/real-estate/rent/sort/date_sort:desc/", "Квартиры", "Аренда"),
        ("https://saratovskaya-obl.irr.ru/real-estate/commercial-sale/sort/date_sort:desc/", "Коммерческая_недвижимость", "Продажа"),
        ("https://saratovskaya-obl.irr.ru/real-estate/commercial/sort/date_sort:desc/", "Коммерческая_недвижимость", "Аренда"),
        ("https://saratovskaya-obl.irr.ru/real-estate/out-of-town/sort/date_sort:desc/", "Дома", "Продажа"),
        ("https://saratovskaya-obl.irr.ru/real-estate/out-of-town-rent/sort/date_sort:desc/", "Дома", "Аренда"),
    )
    for index, (url, category, sell_type) in enumerate(jobs):
        if index:
            visited_urls = []
        parse(url, category, sell_type)
527 |
528 |
if __name__ == "__main__":
    # Run a full crawl, then release the module-level DB connection.
    main()
    db.close()
532 |
--------------------------------------------------------------------------------
/kvadrat64_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | from fake_useragent import UserAgent
8 | import datetime
9 | from selenium import webdriver
10 | from xvfbwrapper import Xvfb
11 | from selenium.webdriver.chrome.options import Options
12 | from database import DataBase
13 |
14 |
# Breakpoint records saved by a previous run: parsing of each category stops
# when its saved (street, price)-style key tuple is seen again.
with open("breakpoints/kvadrat.txt", "r", encoding="utf8") as file:
    breakpoints = file.readlines()


def _read_breakpoint(index):
    """Return breakpoint *index* as a tuple, or None when the line is absent.

    Replaces nine copy-pasted try/bare-except blocks; only a missing line
    (IndexError) is an expected failure here.
    """
    try:
        return tuple(breakpoints[index].strip().split("--"))
    except IndexError:
        return None


break_apartment_sell = _read_breakpoint(0)
break_apartment_rent = _read_breakpoint(1)
break_cottage_sell = _read_breakpoint(2)
break_cottage_rent = _read_breakpoint(3)
break_commercial_sell = _read_breakpoint(4)
break_commercial_rent = _read_breakpoint(5)
break_dacha_sell = _read_breakpoint(6)
break_saratov_land_sell = _read_breakpoint(7)
break_region_land_sell = _read_breakpoint(8)

# Chrome options for the Selenium fallback in get_seller_phone.
options = Options()
options.add_argument("--no-sandbox")

db = DataBase()
visited_urls = []  # offer URLs already processed in the current category
61 |
62 |
def transform_date(date_str):
    """Convert a "DD-MM-YYYY" string into a datetime for comparisons.

    int() already ignores leading zeros, so the previous manual stripping of
    "0" prefixes from day and month was redundant and has been removed.
    """
    day, month, year = date_str.split("-")
    return datetime.datetime(int(year), int(month), int(day))
75 |
76 |
def get_html(url):
    """Fetch *url* and return the body as bytes for BeautifulSoup.

    The site serves windows-1251; re-encoding the decoded text with the
    detected response encoding hands raw bytes to BeautifulSoup, which then
    performs its own charset detection.
    """
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    return req.text.encode(req.encoding)
81 |
82 |
def get_total_pages(html):
    """Return the number of listing pages, or 0 when pagination is missing."""
    soup = BeautifulSoup(html, "lxml")
    pages = 0
    try:
        pager = soup.find("div", class_="a t100")
        if pager is not None:
            # The last "phase" link of the pager carries the page count.
            pages = pager.find_all("a", class_="phase")[-1].text.strip()
    except Exception as e:
        pages = 0
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_total_pages\n")
    return int(pages)
96 |
97 |
def get_title(soup):
    """Return the offer headline text, or "Не указано" when absent."""
    try:
        return soup.find("td", class_="hh").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_title\n")
        return "Не указано"
106 |
107 |
def get_price(soup):
    """Return the offer price string, or "Не указано" when absent."""
    try:
        return soup.find("td", class_="thprice").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_price\n")
        return "Не указано"
116 |
117 |
def get_commercial_price(soup):
    """Return the per-square-metre price string, or "Не указано"."""
    price = "Не указано"
    try:
        # Every <span class="d"> inside every <td class="tddec2">, flattened.
        spans = [span
                 for cell in soup.find_all("td", class_="tddec2")
                 for span in cell.find_all("span", class_="d")]
        for text in (span.text.strip() for span in spans):
            if "за м²" in text:
                # Normalise the unit spelling: "м²" -> "м2".
                price = "м2".join(text.split("м²"))
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_price\n")
    return price
131 |
132 |
def get_selling_type(soup):
    """Return (selling_type, rent_info) strings describing the deal terms.

    selling_type: the sale conditions joined with "; ", or "Не продажа".
    rent_info: the lease-term entry, or "Не аренда" when none is present.
    Both fall back to "Не указано" when the markup cannot be parsed.
    """
    try:
        # For sales: the deal conditions live in the first tddec2 cell.
        selling_type = "; ".join(x.text.strip() for x in soup.find("td", class_="tddec2").find_all("span", class_="d"))
        if not selling_type:
            selling_type = "Не продажа"
        # For rentals: look for the lease-term entry near the end of the page.
        rent_entries = [x.text.strip() for x in soup.find_all("td", class_="tddec2")[-2].find_all("span", class_="d")]
        for entry in rent_entries:
            if "аренда" in entry:
                rent_info = entry
                break
        else:
            # BUG FIX: previously a non-empty list with no "аренда" entry was
            # returned as-is (a list object); now it maps to "Не аренда" too.
            rent_info = "Не аренда"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_selling_type\n")
        selling_type = "Не указано"
        rent_info = "Не указано"
    return selling_type, rent_info
153 |
154 |
def get_photos(soup):
    """Collect full-size photo URLs joined with newlines, or "Не указано".

    Each gallery thumbnail links to its own page, which is fetched and parsed
    to extract the full-size image; this makes one HTTP request per photo.
    """
    try:
        images = []
        # links to the per-photo gallery pages
        td_images = soup.find("td", class_="tdimg").find_all("a")
        for image_item in td_images:
            link = "https://kvadrat64.ru/" + image_item.get("href")
            html_gallery = BeautifulSoup(get_html(link), "lxml")
            image = html_gallery.find("img", {"style": "cursor:pointer;"})
            if image is not None:
                images.append("https://kvadrat64.ru/" + image.get("src"))
        images = "\n".join(images)
        # if the gallery is empty, fall back to the cover image
        if not images:
            images = "https://kvadrat64.ru/" + soup.find("div", id="mainfotoid").find("img").get("src")
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_photos\n")
        images = "Не указано"
    return images
175 |
176 |
def get_description(soup):
    """Return the free-text offer description, or "Не указано"."""
    try:
        return soup.find("p", class_="dinfo").text.strip().replace("\r", "")
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_description\n")
        return "Не указано"
185 |
186 |
def get_date(soup):
    """Return the offer creation date as a datetime, or "Не указано".

    The date sits inside a free-text status line such as
    "..., создано 01-02-2020 сделать VIP ...", so it is cut out between the
    "создано" marker and whatever label follows it.
    """
    try:
        date = soup.find("div", class_="tdate").text.strip().split(",")[1]
        if "сделать" in date:
            date = date.split("сделать")[0].split("создано")[1].strip()
        else:
            date = date.split("VIP")[0].split("создано")[1].strip()
        date = transform_date(date)
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_date\n")
        date = "Не указано"
    return date
200 |
201 |
def get_seller_name(soup):
    """Return the seller's display name, or "Не указано"."""
    try:
        return soup.find_all("td", class_="tddec2")[-1].find("span").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_seller_name\n")
        return "Не указано"
210 |
211 |
def get_seller_phone(url, soup):
    """Return the seller's phone number, or "Не указано".

    First tries to read the phone from the contact block's text; when it is
    not there, falls back to Selenium because the number is revealed
    dynamically by clicking the "show phone" button.
    """
    phone = "Не указано"
    try:
        # Intermediaries sometimes put the phone in the text itself; check that.
        tddec = soup.find_all("td", class_="tddec2")[-1].find_all(text=True)
        found = False
        for i in range(len(tddec)):
            if "Персона для контактов" in tddec[i]:
                phone = tddec[i + 1].split(",")[-1].strip()
                found = True
            elif "Контактный телефон" in tddec[i]:
                # NOTE(review): this resets the flag whenever a phone label is
                # encountered afterwards, forcing the Selenium path — confirm
                # this is the intended precedence.
                found = False

        if "".join(phone.split()).isalpha():
            # the extracted "phone" was purely letters (a name) — discard it
            phone = "Не указано"

        if not found:
            # headless X display so Chrome can run on a server
            vdisplay = Xvfb()
            vdisplay.start()
            driver = webdriver.Chrome(options=options)
            driver.set_window_size(1920, 1080)
            driver.get(url)

            button = driver.find_element_by_xpath('//span[@class="showphone"]')
            button.click()
            time.sleep(3)
            seller_info = driver.find_elements_by_xpath('//td[@class="tddec2"]')[-1].text
            for info in seller_info.split("\n"):
                if "Контактный телефон" in info:
                    phone = info.split(":")[1].strip()
            driver.quit()
            vdisplay.stop()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_seller_phone\n")
        phone = "Не указано"
    return phone
250 |
251 |
def get_apartment_params(soup):
    """Extract apartment attributes from the irregular key-facts cell.

    Returns (block_type, total_area, kitchen_area, living_area, floor,
    total_floors, material); any field missing on the page stays "Не указано".
    """
    block_type, total_area, kitchen_area, living_area, floor, total_floors, material = ["Не указано"] * 7
    try:
        # The markup is irregular: split the raw cell HTML on newlines and
        # re-parse every fragment to plain text.
        # FIX: the split() argument previously contained a raw unescaped
        # newline inside the string literal, which is a SyntaxError in Python.
        params_raw = str(soup.find("td", class_="tddec")).split("\n")
        params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0")
        for param in params_raw[1:]:
            params.append(BeautifulSoup(param, "lxml").text.strip())

        new_block = False  # whether the flat is in a new development
        add_info = ""  # developer / hand-over date, kept in a single field
        for param in params:
            if "Площадь общая" in param:
                total_area = param.split(":")[1].split("м²")[0].strip() + " м2"
            elif "Кухня" in param:
                kitchen_area = param.split(":")[1].split("м²")[0].strip() + " м2"
            elif "Жилая" in param:
                living_area = param.split(":")[1].split("м²")[0].strip() + " м2"
            elif "этажей в доме" in param:
                total_floors = param.split(":")[1].split("/")[1]
                floor = param.split(":")[1].split("/")[0].split()[1]
            elif "cтроение" in param:  # NOTE: leading Latin "c" — presumably matches the site's own text; confirm
                material = param.split(":")[1].strip()
            elif "Застройщик" in param or "Дата сдачи" in param or "Стадия строительства" in param:
                new_block = True
                add_info += param.split(":")[1] + ";"

        if new_block:
            block_type = "Новостройка " + add_info
        else:
            block_type = "Вторичка"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_apartment_params\n")
    return block_type, total_area, kitchen_area, living_area, floor, total_floors, material
288 |
289 |
def get_cottage_params(soup):
    """Extract house attributes from the irregular key-facts cell.

    Returns (total_area, material, comforts, total_floors, land_area); any
    field missing on the page stays "Не указано".
    """
    total_area, material, comforts, total_floors, land_area = ["Не указано"] * 5
    try:
        # The markup is irregular: split the raw cell HTML on newlines and
        # re-parse every fragment to plain text.
        # FIX: the split() argument previously contained a raw unescaped
        # newline inside the string literal, which is a SyntaxError in Python.
        params_raw = str(soup.find("td", class_="tddec")).split("\n")
        params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0")
        for param in params_raw[1:]:
            params.append(BeautifulSoup(param, "lxml").text.strip())

        for param in params:
            if "Площадь общая" in param:
                total_area = param.split(":")[1].split("м²")[0].strip() + " м2"
            elif "cтроение" in param:  # NOTE: leading Latin "c" — presumably matches the site's own text; confirm
                material = param.split(":")[1].strip()
            elif "Площадь участка" in param:
                land_area = param.split(":")[1].strip()
            elif "Этажей" in param:
                total_floors = param.split(":")[1].strip()
            elif "Коммуникации" in param:
                comforts = param.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_cottage_params\n")
    return total_area, material, comforts, total_floors, land_area
315 |
316 |
def get_commercial_params(soup):
    """Extract (object_type, area) from the irregular key-facts cell.

    Either value stays "Не указано" when missing on the page.
    """
    object_type, area = ["Не указано"] * 2
    try:
        # The markup is irregular: split the raw cell HTML on newlines and
        # re-parse every fragment to plain text.
        # FIX: the split() argument previously contained a raw unescaped
        # newline inside the string literal, which is a SyntaxError in Python.
        params_raw = str(soup.find("td", class_="tddec")).split("\n")
        params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0")
        for param in params_raw[1:]:
            params.append(BeautifulSoup(param, "lxml").text.strip())

        for param in params:
            if "Объект" in param:
                object_type = param.split(":")[1].strip()
            elif "площадь" in param:
                area = param.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_commercial_params\n")
    return object_type, area
336 |
337 |
def get_dacha_params(soup):
    """Return the dacha's house area, or "Не указано" when missing."""
    total_area = "Не указано"
    try:
        # The markup is irregular: split the raw cell HTML on newlines and
        # re-parse every fragment to plain text.
        # FIX: the split() argument previously contained a raw unescaped
        # newline inside the string literal, which is a SyntaxError in Python.
        params_raw = str(soup.find("td", class_="tddec")).split("\n")
        params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0")
        for param in params_raw[1:]:
            params.append(BeautifulSoup(param, "lxml").text.strip())

        for param in params:
            if "Площадь дома" in param:
                total_area = param.split(":")[1].strip()
                break
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_dacha_params\n")
    return total_area
356 |
357 |
def get_land_params(soup):
    """Extract (total_area, land_type) from the irregular key-facts cell.

    Either value stays "Не указано" when missing on the page.
    """
    total_area, land_type = ["Не указано"] * 2
    try:
        # The markup is irregular: split the raw cell HTML on newlines and
        # re-parse every fragment to plain text.
        # FIX: the split() argument previously contained a raw unescaped
        # newline inside the string literal, which is a SyntaxError in Python.
        params_raw = str(soup.find("td", class_="tddec")).split("\n")
        params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0")
        for param in params_raw[1:]:
            params.append(BeautifulSoup(param, "lxml").text.strip())

        for param in params:
            if "Площадь участка" in param:
                total_area = param.split(":")[1].strip()
            elif "Тип земли" in param:
                # BUG FIX: previously tested `in params` (the whole list), so
                # the land type was never extracted.
                land_type = param.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " kvadrat get_land_params\n")
    return total_area, land_type
377 |
378 |
def get_apartment_data(html, url):
    """Assemble the apartment record for the DB, or None for "сниму" ads.

    The element order is positional and must match crawl_page's indexing and
    the Квартиры table schema.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" not in title.lower():
        # The address is everything after the first comma, minus the trailing
        # " на карте" map link and any parenthesised tail.
        address = ",".join(title.split(",")[1:]).strip()
        address = address[:address.rfind(" на карте")]
        if "сдам" in address.lower():
            address = " ".join(address.split()[1:])
        if "(" in address:
            address = address[:address.rfind("(")]

        # Assumes the address tail reads "..., street, number, district, city"
        # — TODO confirm against real titles.
        city = address.split(",")[-1].strip()
        district = address.split(",")[-2].strip()
        block_number = address.split(",")[-3].strip()
        street = address.split(",")[-4].strip()

        rooms_number = title.split(",")[0]
        block_type, total_area, kitchen_area, living_area, floor, total_floors, material = get_apartment_params(soup)
        price = get_price(soup)
        selling_detail, rent_info = get_selling_type(soup)  # sale terms; lease length when renting
        if not selling_detail:
            selling_detail = "Не продажа"
        images = get_photos(soup)
        description = get_description(soup)
        phone = get_seller_phone(url, soup)
        date = get_date(soup)

        return [city, district, street, block_number, rent_info, price, block_type,
                rooms_number, total_area, total_floors, material, selling_detail, images,
                description, date, phone, kitchen_area, living_area, floor]
    return None
411 |
412 |
def get_cottage_data(html, url):
    """Assemble the country-house record for the DB, or None for "сниму" ads.

    The element order is positional and must match crawl_page's indexing and
    the Дома table schema.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" not in title.lower():
        address = ",".join(title.split(",")[1:]).strip()
        address = address[:address.rfind(" на карте")]
        if "(" in address:
            address = address[:address.rfind("(")]

        # An all-uppercase address presumably means a short, settlement-only
        # form without a district — TODO confirm.
        if address == address.upper():
            city, street, block_number = address.split(",") + (["Не указано"] * (3 - len(address.split(","))))
            district = "Не указано"
        else:
            city = address.split(",")[-1].strip()
            district = address.split(",")[-2].strip()
            block_number = address.split(",")[-3].strip()
            street = address.split(",")[-4].strip()

        cottage_type = title.split(",")[0]
        if "сдам" in cottage_type.lower():
            cottage_type = " ".join(cottage_type.split()[1:])
        price = get_price(soup)
        total_area, material, comforts, total_floors, land_area = get_cottage_params(soup)
        selling_detail, rent_info = get_selling_type(soup)  # sale terms; lease length when renting
        if not selling_detail:
            selling_detail = "Не продажа"
        images = get_photos(soup)
        description = get_description(soup)
        phone = get_seller_phone(url, soup)
        seller_name = get_seller_name(soup)
        date = get_date(soup)
        status = "Не указано"  # the site does not publish this

        return [city, district, street, block_number, rent_info, price, cottage_type,
                total_area, comforts, selling_detail, images, description, date, phone, material,
                total_floors, land_area, status, seller_name]
    return None
451 |
452 |
def get_commercial_data(html, url):
    """Assemble the commercial-property record for the DB, or None for "сниму" ads.

    The element order is positional and must match crawl_page's indexing and
    the Коммерческая_недвижимость table schema.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" not in title.lower():
        address = ",".join(title.split(",")[1:]).strip()
        address = address[:address.rfind(" на карте")]
        if "(" in address:
            address = address[:address.rfind("(")]

        # Assumes the address tail reads "..., street, number, district, city"
        # — TODO confirm against real titles.
        city = address.split(",")[-1].strip()
        district = address.split(",")[-2].strip()
        block_number = address.split(",")[-3].strip()
        street = address.split(",")[-4].strip()

        object_type, area = get_commercial_params(soup)
        price = get_commercial_price(soup)
        images = get_photos(soup)
        description = get_description(soup)
        phone = get_seller_phone(url, soup)
        date = get_date(soup)
        seller_name = get_seller_name(soup)
        office_class, furniture, entrance = ["Не указано"] * 3  # not published by the site

        return [city, district, street, block_number, price, object_type, office_class,
                furniture, entrance, area, date, phone, images, description, seller_name]
    return None
480 |
481 |
def get_land_data(html, url):
    """Assemble the land-plot record for the DB, or None for "сниму" ads.

    The element order is positional and must match crawl_page's indexing and
    the Участки table schema.
    """
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" not in title.lower():
        address = ",".join(title.split(",")[1:]).strip()
        address = address[:address.rfind("(")].strip()

        city = address.split(",")[0]
        if len(address.split(",")) > 1:
            district = address.split(",")[1].strip()
        else:
            district = "Не указано"
        street = "Не указано"

        # Distance from the city: inside Saratov vs the parenthesised hint in
        # the title (e.g. "(15 км)").
        if city.lower() == "саратов":
            distance = "В черте города"
        else:
            distance = title[title.find("(") + 1:title.find(")")]

        area, land_type = get_land_params(soup)
        price = get_price(soup)
        images = get_photos(soup)
        description = get_description(soup)
        phone = get_seller_phone(url, soup)
        date = get_date(soup)
        seller_name = get_seller_name(soup)
        sell_type = "Продажа"  # land plots are only crawled for sale
        deposit, seller_type = ["Не указано"] * 2  # not published by the site

        return [city, district, street, sell_type, deposit, land_type, distance, area, price, seller_type, images,
                description, seller_name, phone, date]
    return None
515 |
516 |
def crawl_page(first_offer, html, category, sell_type):
    """Parse one kvadrat64 listing page and store each offer in the database.

    Returns True when crawling of this category must stop (no offers, a saved
    breakpoint record reappears, or offers become older than one day);
    returns None otherwise.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    try:
        #offers = soup.find_all("a", class_="site3adv") + soup.find_all("a", class_="site3")
        offers = soup.find_all("a", class_="site3")
    except:
        offers = []
    if offers is None or not offers:
        print("Парсинг завершен kvadrat")
        return True
    for offer in offers:
        try:
            url = "http://kvadrat64.ru/" + offer.get("href")
            if url in visited_urls:
                print("kvadrat not unique")
                time.sleep(random.uniform(5, 8))
                continue
            else:
                visited_urls.append(url)
            #print(url)

            data = []
            # NOTE(review): the get_*_data helpers return None for "сниму"
            # ads; the data[...] indexing below then raises and the offer is
            # skipped via the broad except — confirm this is intended.
            if category == "Квартиры":
                data = get_apartment_data(get_html(url), url)
                # record key fields so duplicates can be found later
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Участки":
                data = get_land_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s\n" % (data[2], data[5], data[7], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            if first_offer:
                # save the very first record as the exit point for the next run
                modifier = "w" if (category == "Квартиры" and sell_type == "Продажа") else "a"
                with open("breakpoints/kvadrat.txt", modifier, encoding="utf8") as file:
                    file.write("%s--%s\n" % (data[2], data[5]))
                first_offer = False

            key_info = (data[2], data[5])

            if any(x == key_info for x in [break_apartment_sell, break_apartment_rent, break_cottage_sell,
                                           break_cottage_rent, break_commercial_sell, break_commercial_rent,
                                           break_dacha_sell, break_saratov_land_sell, break_region_land_sell]):
                print("Парсинг завершен kvadrat")
                return True

            data.insert(4, sell_type)

            # position of the offer's datetime within the record
            index_of_date = -1
            if category == "Квартиры" or category == "Коммерческая_недвижимость":
                index_of_date = -5
            elif category == "Дома":
                index_of_date = -7
            elif category == "Участки":
                index_of_date = -1

            if data[index_of_date] != "Не указано" and data[index_of_date] < datetime.datetime.today() - datetime.timedelta(days=1):
                # an offer older than one day means we have caught up — stop
                print("Парсинг завершен kvadrat")
                return True
            else:
                # keep only the date part of the datetime, as a string
                # NOTE(review): when the date is "Не указано" this yields
                # "Не" — confirm downstream tolerates that.
                data[index_of_date] = str(data[index_of_date]).split()[0]

            if data[0] != "Не указано" and data is not None:
                # NOTE(review): the None check is unreachable after data[0]
                # has already been subscripted.
                try:
                    db.insert_data(category, data)
                except:
                    # reconnect once and retry on any DB failure
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page kvadrat")

            #print(data)

        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " kvadrat crawl_page\n")

        time.sleep(random.uniform(5, 8))
608 |
609 |
def parse(category_url, category_name, sell_type):
    """Iterate over a kvadrat64 category's listing pages until crawl_page reports completion."""
    total_pages = get_total_pages(get_html(category_url))

    for page in range(1, total_pages + 1):
        # Two URL schemes exist: "search-<a>-<page>-<b>.html" for country
        # houses and land plots, "<name>-<n>-<page>.html" for everything else.
        if (category_name == "Дома" and sell_type == "Продажа" and "sellzagbank" not in category_url) or category_name == "Участки":
            parts = category_url.split("-")
            page_url = "-".join(parts[:2]) + "-" + str(page) + "-" + parts[3]
        else:
            page_url = category_url[:category_url.rfind("-") + 1] + str(page) + ".html"

        # The first page records the breakpoint; later pages do not.
        if crawl_page(page == 1, get_html(page_url), category_name, sell_type):
            break
627 |
628 |
def main():
    """Crawl every kvadrat64.ru category, clearing the URL cache between runs."""
    global visited_urls
    jobs = (
        ("http://kvadrat64.ru/sellflatbank-50-1.html", "Квартиры", "Продажа"),
        ("https://kvadrat64.ru/giveflatbank-50-1.html", "Квартиры", "Аренда"),
        ("https://kvadrat64.ru/search-103-1-50664.html", "Дома", "Продажа"),
        ("https://kvadrat64.ru/giveflatbank-9-1.html", "Дома", "Аренда"),
        ("https://kvadrat64.ru/sellcombank-1000-1.html", "Коммерческая_недвижимость", "Продажа"),
        ("https://kvadrat64.ru/givecombank-1000-1.html", "Коммерческая_недвижимость", "Аренда"),
        ("https://kvadrat64.ru/sellzagbank-1000-1.html", "Дома", "Продажа"),    # dachas
        ("https://kvadrat64.ru/search-41-1-24435.html", "Участки", "Продажа"),  # Saratov land plots
        ("https://kvadrat64.ru/search-412-1-24450.html", "Участки", "Продажа"), # regional land plots
    )
    for index, (url, category, sell_type) in enumerate(jobs):
        if index:
            visited_urls = []
        parse(url, category, sell_type)
665 |
666 |
if __name__ == "__main__":
    # Run a full crawl, then release the module-level DB connection.
    main()
    db.close()
670 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import gc
3 | from multiprocessing import Process
4 | import os
5 | import datetime
6 | from database import DataBase
7 |
# Handles to the six parser worker processes; None until the first scheduled
# run creates them, so main() can terminate leftovers from a previous run.
t1, t2, t3, t4, t5, t6 = [None] * 6
9 |
10 |
def cls():
    """Clear the terminal screen: "cls" on Windows, "clear" elsewhere."""
    command = "cls" if os.name == "nt" else "clear"
    os.system(command)
13 |
14 |
def main():
    """Run one scheduled crawl: kill stragglers, record duplicates, launch parsers."""
    global t1, t2, t3, t4, t5, t6
    # Terminate workers from the previous scheduled run that are still alive.
    if all(p is not None for p in [t1, t2, t3, t4, t5, t6]):
        for p in [t1, t2, t3, t4, t5, t6]:
            if p.is_alive():
                p.terminate()
                p.join()

    # NOTE(review): imports inside main() only execute the parser modules'
    # top-level code (breakpoint files, DB handles) on the FIRST run of the
    # scheduler, not on every run — confirm that is the intended behaviour.
    import avito_parsing
    import irr_parsing
    import kvadrat64_parsing
    import ya_realty_parsing
    import cian_parsing
    import youla_parsing

    cls()
    print("Job started", datetime.datetime.today())

    db = DataBase()
    db.create_table("Квартиры")
    db.create_table("Дома")
    db.create_table("Коммерческая_недвижимость")
    db.create_table("Участки")
    db.create_table("Дубликаты")

    if os.path.isfile("logs.txt"):
        os.remove("logs.txt")

    # Group offer URLs from the previous run by their key fields; any key
    # seen with more than one URL is recorded as a duplicate.
    total_data = {}
    try:
        if os.path.isfile("total_data.txt"):
            with open("total_data.txt", "r", encoding="utf8") as file:
                for line in file.readlines():
                    data = line.strip().split("--")
                    params = tuple(data[:-1])
                    url = data[-1]
                    total_data[params] = list(set(total_data.get(params, []) + [url]))

        for data in total_data:
            if all(x != "Не указано" for x in data):  # avoid writing dummy records
                if len(total_data[data]) > 1:
                    db.insert_data("Дубликаты", [", ".join(data), "\n".join(total_data[data])])
    except Exception as e:
        print(e)

    if os.path.isfile("total_data.txt"):
        os.remove("total_data.txt")

    # Run the six site parsers in two batches of three processes each.
    t1 = Process(target=ya_realty_parsing.main)
    t2 = Process(target=irr_parsing.main)
    t3 = Process(target=youla_parsing.main)
    t1.start()
    t2.start()
    t3.start()
    t1.join()
    t2.join()
    t3.join()

    t4 = Process(target=kvadrat64_parsing.main)
    t5 = Process(target=cian_parsing.main)
    t6 = Process(target=avito_parsing.main)
    t4.start()
    t5.start()
    t6.start()
    t4.join()
    t5.join()
    t6.join()

    db.close()
    gc.collect()
    print("Job finished", datetime.datetime.today())
86 |
87 |
if __name__ == '__main__':
    # Third-party 'schedule' library: run the full parsing cycle once a day.
    import schedule
    import time

    # NOTE(review): main() is only scheduled, never run at startup — the first
    # cycle happens at the next 10:00.  Confirm that is intended.
    schedule.every().day.at("10:00").do(main)

    # Poll once per second and fire any pending jobs.
    while True:
        schedule.run_pending()
        time.sleep(1)
97 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
asn1crypto==0.24.0
beautifulsoup4==4.6.1
certifi==2018.4.16
cffi==1.11.5
chardet==3.0.4
cryptography==3.2
EasyProcess==0.2.3
fake-useragent==0.1.11
httmock==1.2.6
http-request-randomizer==1.2.3
idna==2.7
lxml==4.2.5
mysql-connector-python==8.0.12
Pillow==8.1.1
protobuf==3.6.1
psutil==5.6.6
pycparser==2.18
pyOpenSSL==18.0.0
python-dateutil==2.7.3
PyVirtualDisplay==0.2.1
requests==2.20.0
schedule==0.5.0
selenium==3.14.0
six==1.11.0
urllib3==1.24.2
xvfbwrapper==0.2.9
--------------------------------------------------------------------------------
/ya_realty_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | from fake_useragent import UserAgent
8 | import datetime
9 | from selenium import webdriver
10 | from xvfbwrapper import Xvfb
11 | from selenium.webdriver.chrome.options import Options
12 | from database import DataBase
13 |
# Breakpoint records (one per category/sell-type) telling the parser where to
# stop: each is the "(street, price)" tuple of the newest offer from last run.
with open("breakpoints/ya.txt", "r", encoding="utf8") as file:
    breakpoints = file.readlines()


def _breakpoint(index):
    """Return the breakpoint tuple stored on line *index*, or None when absent.

    Replaces six copy-pasted try/bare-except blocks; only a missing line
    (IndexError) legitimately yields None.
    """
    try:
        return tuple(breakpoints[index].strip().split("--"))
    except IndexError:
        return None


break_apartment_sell = _breakpoint(0)
break_apartment_rent = _breakpoint(1)
break_cottage_sell = _breakpoint(2)
break_cottage_rent = _breakpoint(3)
break_commercial_sell = _breakpoint(4)
break_commercial_rent = _breakpoint(5)

# defining chrome options for selenium
options = Options()
options.add_argument("--no-sandbox")

db = DataBase()
visited_urls = []
48 |
49 |
def transform_date(date):
    """Convert a Russian textual date like '5 марта 2020' to a datetime object."""
    month_names = ("января", "февраля", "марта", "апреля", "мая", "июня",
                   "июля", "августа", "сентября", "октября", "ноября", "декабря")
    day, month, year = date.split()
    return datetime.datetime(int(year), month_names.index(month) + 1, int(day))
72 |
73 |
def get_html(url):
    """Download *url* pretending to be a Chrome browser; return the body as bytes."""
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome, "Referer": url,
                                     "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7",
                                     "Connection": "keep-alive", "Origin": "https://realty.yandex.ru",
                                     "DNT": "1"})
    # req.encoding can be None (e.g. unusual Content-Type); str.encode(None)
    # raises TypeError, so fall back to the raw response bytes in that case.
    if req.encoding:
        return req.text.encode(req.encoding)
    return req.content
80 |
81 |
def get_title(soup):
    """Return the offer title, or 'Не указано' when the header tag is missing."""
    try:
        return soup.find("h1", class_="offer-card__header-text").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_title\n")
        return "Не указано"
90 |
91 |
def get_address(soup):
    """Split the offer address line into (city, district, street, block number).

    Missing pieces are reported as 'Не указано'; a total failure returns a
    list of four 'Не указано' values (as the original did).
    """
    street_markers = ("ул ", "ул.", "улица", " пер", "проспект", "проезд")

    def _is_street(text):
        lowered = text.lower()
        return any(marker in lowered for marker in street_markers)

    try:
        address = soup.find("h2", class_="offer-card__address ellipsis").text.strip()
        parts = address.split(",")
        district, street = "Не указано", "Не указано"
        city = parts[0]
        block_number = parts[-1].strip()
        # The last comma-separated piece is usually the house number, but may
        # actually be the street when no number is present.
        if _is_street(block_number):
            street, block_number = block_number, "Не указано"

        for piece in parts[1:-1]:
            lowered = piece.lower()
            if _is_street(piece):
                street = piece.strip()
            elif "район" in lowered or "р-н" in lowered:
                district = piece.strip()

        # A trailing house number may be glued onto the street name.
        tokens = street.split()
        if tokens[-1].strip().isdigit():
            block_number = tokens[-1].strip()
            street = " ".join(tokens[:-1]).strip()

        return city, district, street, block_number

    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_address\n")
        return ["Не указано"] * 4
121 |
122 |
def get_block_type(soup):
    """Return the building type; offers without the tag are resale ('Вторичка')."""
    try:
        tag = soup.find("div", class_="offer-card__building-type")
        return "Вторичка" if tag is None else tag.text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_block_type\n")
        return "Не указано"
135 |
136 |
def get_price(soup):
    """Return the offer price string, or 'Не указано' when absent."""
    try:
        return soup.find("h3", class_="offer-price offer-card__price offer-card__price").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_price\n")
        return "Не указано"
145 |
146 |
def get_selling_type(soup):
    """Return the deal terms text ('продажа', 'ипотека', rent terms, ...)."""
    try:
        return soup.find("div", class_="offer-card__terms").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_selling_type\n")
        return "Не указано"
155 |
156 |
def get_seller_type(soup):
    """Return the author note (agency / private seller), or 'Не указано'."""
    try:
        return soup.find("div", class_="offer-card__author-note").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_seller_type\n")
        return "Не указано"
165 |
166 |
def get_seller_name(soup):
    """Return the seller's display name, or 'Не указано' when absent."""
    try:
        return soup.find("div", class_="offer-card__author-name").text.strip()
    except Exception as e:
        # Log like the sibling getters do; the original used a bare `except:`
        # that silently swallowed everything (including KeyboardInterrupt).
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_seller_name\n")
        return "Не указано"
173 |
174 |
def get_photos(soup):
    """Return photo links joined by newlines, or 'Не указано' on failure."""
    try:
        anchors = soup.find("div", class_="offer-card__photos-wrapper").find_all("a")
        return "\n".join("https://realty.yandex.ru" + a.get("href") for a in anchors)
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_photos\n")
        return "Не указано"
188 |
189 |
def get_description(soup):
    """Return the offer description text, or 'Не указано' when missing."""
    try:
        return soup.find("div", class_="offer-card__desc-text").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_description\n")
        return "Не указано"
198 |
199 |
def get_date(soup, which_page):
    """Normalize the offer publish-date label.

    Returns 'YYYY-MM-DD', the marker 'too old' (absolute date older than one
    day), or 'Не указано' on failure.
    which_page: 0 - the listing page, 1 - the offer page itself.
    """
    # 0 - page with offers, 1 - offer itself
    try:
        if which_page == 0:
            date = soup.find("div", class_="OffersSerpItem__publish-date").text.strip()
        else:
            date = soup.find("div", class_="offer-card__lot-date").text.strip()
        # Relative labels "N минут/часов назад" -> absolute calendar date.
        if "назад" in date:
            time_passed = int(date.split()[0])
            if "минут" in date:
                date = str(datetime.datetime.today() - datetime.timedelta(minutes=time_passed)).split()[0]
            elif "часов" in date or "часа" in date or "час" in date:
                date = str(datetime.datetime.today() - datetime.timedelta(hours=time_passed)).split()[0]
            # NOTE(review): labels like "N дней назад" fall through unchanged —
            # confirm whether they should become "too old".
        elif "сейчас" in date:
            date = str(datetime.datetime.today()).split()[0]
        elif date == "вчера":
            date = str(datetime.datetime.today() - datetime.timedelta(days=1)).split()[0]
        elif len(date.split()) >= 3:
            # Absolute dates like "5 марта 2020": stop crawling past one day old.
            # NOTE(review): for a same-day absolute date the timedelta prints as
            # 'H:MM:SS', int() raises, and the date degrades to 'Не указано'.
            transformed_date = transform_date(date)
            days_passed = str(datetime.datetime.today() - transformed_date).split()[0]
            if int(days_passed) > 1:
                date = "too old"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_date\n")
        date = "Не указано"
    return date
227 |
228 |
def get_seller_phone(url):
    """Open the offer in headless Chrome, click 'show phone' and read the number.

    Returns 'Не указано' when anything fails.
    """
    phone = "Не указано"
    vdisplay = None
    driver = None
    try:
        vdisplay = Xvfb()
        vdisplay.start()
        driver = webdriver.Chrome(options=options)
        driver.set_window_size(1920, 1080)
        driver.get(url)

        button = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]/div[3]/div[1]/span/button")
        button.click()
        time.sleep(2)
        phone = driver.find_element_by_xpath('//div[@class="helpful-info__contact-phones-string"]').text
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_seller_phone\n")
    finally:
        # Always release the browser and the virtual display; the original
        # leaked both whenever an exception occurred mid-way.
        if driver is not None:
            try:
                driver.quit()
            except Exception:
                pass
        if vdisplay is not None:
            try:
                vdisplay.stop()
            except Exception:
                pass
    return phone
248 |
249 |
def get_apartment_params(soup):
    """Extract apartment attributes from the offer feature table.

    Returns (rooms_number, floor, total_floors, total_area, material, year,
    kitchen_area, living_area); anything absent stays 'Не указано'.
    """
    rooms_number, floor, total_floors, total_area, material, year, kitchen_area, living_area = ["Не указано"] * 8
    try:
        params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-name")]
        values = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-value")]
        for i in range(len(params)):
            if "Количество комнат" in params[i]:
                rooms_number = values[i]
            elif "Год постройки" in params[i]:
                year = values[i]
            elif "Этаж" in params[i]:
                floor, total_floors = values[i].split(" из ")
            elif "Общая площадь" in params[i]:
                total_area = values[i]
            elif "Кухня" in params[i]:
                # Bug fix: this previously overwrote total_area.
                kitchen_area = values[i]
            elif "Жилая" in params[i]:
                # Bug fix: this previously overwrote total_area.
                living_area = values[i]
            elif "Тип здания" in params[i]:
                material = values[i]

        # New buildings may carry the year in the site subtitle ('строится…').
        if year == "Не указано":
            new_block_params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__site-subtitle-item")]
            for param in new_block_params:
                if "строится" in param:
                    year = param
                    break

        # Last resort: the "main feature" badges on the card.
        if year == "Не указано":
            new_params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-note")]
            values = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-title")]
            for i in range(len(new_params)):
                if "год постройки" in new_params[i]:
                    year = values[i]
                    break

    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_apartment_params\n")
    return rooms_number, floor, total_floors, total_area, material, year, kitchen_area, living_area
290 |
291 |
def get_cottage_params(soup):
    """Extract house/cottage attributes from the offer feature table.

    Returns (total_area, land_area, comforts, year, material, total_floors,
    land_status); anything absent stays 'Не указано'.
    """
    total_area, land_area, comforts, year, material, total_floors, land_status = ["Не указано"] * 7
    comfort_words = ["отапливаемый", "отопление", "водопровод", "канализация",
                     "электроснабжение", "свет", "газ", "вода", "интернет",
                     "телефон", "душ"]
    try:
        names = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-name")]
        vals = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-value")]
        for i, name in enumerate(names):
            if "Год постройки" in name:
                year = vals[i]
            elif "Общая площадь" in name:
                total_area = vals[i]
            elif "Площадь участка" in name:
                land_area = vals[i]
            elif "Тип дома" in name:
                material = vals[i]
            elif "Количество этажей" in name:
                total_floors = vals[i]
            elif "Тип участка" in name:
                land_status = vals[i]
            elif any(word in name.lower() for word in comfort_words):
                # Utilities are accumulated into one semicolon-separated string.
                comforts = name.strip() if comforts == "Не указано" else comforts + "; " + name.strip()

        # New buildings may carry the year in the site subtitle ('строится…').
        if year == "Не указано":
            for note in (x.text.strip() for x in soup.find_all("div", class_="offer-card__site-subtitle-item")):
                if "строится" in note:
                    year = note
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_cottage_params\n")
    return total_area, land_area, comforts, year, material, total_floors, land_status
327 |
328 |
def get_commercial_params(soup):
    """Extract commercial-property attributes from the offer feature table.

    Returns (entrance, furniture, additions, area); anything absent stays
    'Не указано'.
    """
    entrance, furniture, additions, area = ["Не указано"] * 4
    addition_words = ["кондиционер", "интернет", "пожарная сигнализация",
                      "вентиляция", "охраняемая парковка", "сигнализация", "лифт"]
    try:
        names = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-name")]
        vals = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-value")]
        for i, name in enumerate(names):
            if "Мебель" in name:
                furniture = vals[i]
            elif "Вход" in name:
                entrance = vals[i]
            elif any(word in name.lower() for word in addition_words) and vals[i].strip() == "да":
                # Amenities flagged "да" accumulate into one string.
                additions = name.strip() if additions == "Не указано" else additions + "; " + name.strip()

        # Total area lives in the "main feature" badges on the card.
        notes = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-note")]
        titles = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-title")]
        for j, note in enumerate(notes):
            if "общая" in note:
                area = titles[j]
                break
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " ya get_commercial_params\n")
    return entrance, furniture, additions, area
357 |
358 |
def get_apartment_data(html, url):
    """Assemble the full apartment record for one offer page."""
    page = BeautifulSoup(html, "lxml")

    city, district, street, block_number = get_address(page)
    block_type = get_block_type(page)
    price = get_price(page)
    (rooms_number, floor, total_floors, total_area,
     material, year, kitchen_area, living_area) = get_apartment_params(page)

    # "продажа"/"ипотека" mean a sale; anything else is rent terms.
    selling_detail = get_selling_type(page)
    if "продажа" in selling_detail.lower() or "ипотека" in selling_detail.lower():
        rent_info = "Не аренда"
    else:
        rent_info, selling_detail = selling_detail, "Не указано"

    images = get_photos(page)
    description = get_description(page)
    phone = get_seller_phone(url)
    date = get_date(page, 1)

    return [city, district, street, block_number, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, date, phone, kitchen_area, living_area, floor]
383 |
384 |
def get_cottage_data(html, url):
    """Assemble the full house/cottage record for one offer page."""
    page = BeautifulSoup(html, "lxml")

    header = get_title(page)
    city, district, street, block_number = get_address(page)
    cottage_type = header.split(",")[0]  # e.g. "Дом", "Коттедж", "Дача"
    price = get_price(page)
    (total_area, land_area, comforts, year,
     material, total_floors, land_status) = get_cottage_params(page)

    # "продажа"/"ипотека" mean a sale; anything else is rent terms.
    selling_detail = get_selling_type(page)
    if "продажа" in selling_detail.lower() or "ипотека" in selling_detail.lower():
        rent_info = "Не аренда"
    else:
        rent_info, selling_detail = selling_detail, "Не указано"

    images = get_photos(page)
    description = get_description(page)
    phone = get_seller_phone(url)
    date = get_date(page, 1)
    seller_name = get_seller_name(page)

    return [city, district, street, block_number, rent_info, price, cottage_type,
            total_area, comforts, selling_detail, images, description, date, phone, material,
            total_floors, land_area, land_status, seller_name]
409 |
410 |
def get_commercial_data(html, url):
    """Assemble the full commercial-property record for one offer page."""
    page = BeautifulSoup(html, "lxml")

    header = get_title(page)
    city, district, street, block_number = get_address(page)
    price = get_price(page)
    object_type = header.split(",")[0]  # e.g. "Офис", "Склад"
    entrance, furniture, additions, area = get_commercial_params(page)
    phone = get_seller_phone(url)
    images = get_photos(page)
    description = get_description(page)
    seller_name = get_seller_name(page)
    date = get_date(page, 1)
    office_class = "Не указано"  # the page does not expose an office class

    return [city, district, street, block_number, price, object_type, office_class,
            furniture, entrance, area, date, phone, images, description, seller_name]
428 |
429 |
def crawl_page(first_offer, html, category, sell_type):
    """Process one listing page: fetch every offer on it and store it in the DB.

    Returns True when crawling must stop (no offers left, offers too old, or a
    breakpoint record from the previous run was reached); otherwise None.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    # Pagination is dynamic and the page count is unknown, so we simply check
    # whether the page still contains offers.
    try:
        offers = soup.find("ol", class_="OffersSerp__list").find_all("li", class_="OffersSerp__list-item_type_offer")
    except:
        offers = []
    if offers is None or not offers:
        print("Парсинг завершен ya")
        return True
    k = 0
    for offer in offers:
        try:
            date = get_date(soup, 0)
            if date == "too old":
                print("Парсинг завершен ya")
                return True

            url = "https://realty.yandex.ru" + offer.find("a", class_="OffersSerpItem__link").get("href")
            if url in visited_urls:
                print("ya not unique")
                time.sleep(random.uniform(10, 15))
                continue
            else:
                visited_urls.append(url)
            #print(url)

            data = []
            if category == "Квартиры":
                data = get_apartment_data(get_html(url), url)
                # record the key fields so duplicates can be detected later
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            if first_offer:
                # save the very first record as the stop marker for the next run
                modifier = "w" if (category == "Квартиры" and sell_type == "Продажа") else "a"
                with open("breakpoints/ya.txt", modifier, encoding="utf8") as file:
                    file.write("%s--%s\n" % (data[2], data[5]))
                first_offer = False

            # (street, price) pair identifies where the previous run started
            key_info = (data[2], data[5])

            if any(x == key_info for x in [break_apartment_sell, break_apartment_rent, break_cottage_sell,
                                           break_cottage_rent, break_commercial_sell, break_commercial_rent]):
                print("Парсинг завершен ya")
                return True

            data.insert(4, sell_type)
            #print(*data, sep="\n")
            #print("--------------------------------------")
            if data[0] != "Не указано":
                try:
                    db.insert_data(category, data)
                except:
                    # the connection may have gone stale; reconnect once and retry
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page ya")


        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " ya crawl_page\n")
            #print(e)
            #print("Ошибка в crawl_page")

        k += 1
        if k % 5 == 0:  # after every fifth request, take a longer pause
            time.sleep(100)
        else:
            time.sleep(random.uniform(10, 15))
511 |
512 |
def parse(category_url, category_name, sell_type):
    """Walk the paginated listing, crawling pages until a stop condition fires."""
    # The page number is substituted after the last '=' of the template URL.
    base = category_url[:category_url.rfind("=") + 1]
    page = 0
    completed = False
    while not completed:
        page_url = base + str(page)
        # Only the very first page records the breakpoint for the next run.
        completed = crawl_page(page == 0, get_html(page_url), category_name, sell_type)
        page += 1
523 |
524 |
def main():
    """Crawl every Yandex.Realty category for the Saratov region."""
    global visited_urls
    targets = [
        ("https://realty.yandex.ru/saratovskaya_oblast/kupit/kvartira/?sort=DATE_DESC&page=0", "Квартиры", "Продажа"),
        ("https://realty.yandex.ru/saratovskaya_oblast/snyat/kvartira/?sort=DATE_DESC&page=0", "Квартиры", "Аренда"),
        ("https://realty.yandex.ru/saratovskaya_oblast/kupit/dom/?sort=DATE_DESC&page=0", "Дома", "Продажа"),
        ("https://realty.yandex.ru/saratovskaya_oblast/snyat/dom/?sort=DATE_DESC&page=0", "Дома", "Аренда"),
        ("https://realty.yandex.ru/saratovskaya_oblast/kupit/kommercheskaya-nedvizhimost/?sort=DATE_DESC&page=0", "Коммерческая_недвижимость", "Продажа"),
        ("https://realty.yandex.ru/saratovskaya_oblast/snyat/kommercheskaya-nedvizhimost/?sort=DATE_DESC&page=0", "Коммерческая_недвижимость", "Аренда"),
    ]
    for i, (url, category, sell_type) in enumerate(targets):
        if i:
            # The deduplication cache is per category/sell-type: reset between runs.
            visited_urls = []
        parse(url, category, sell_type)
549 |
550 |
if __name__ == "__main__":
    # Run the full Yandex.Realty crawl, then release the module-level DB connection.
    main()
    db.close()
554 |
--------------------------------------------------------------------------------
/youla_parsing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import requests
4 | from bs4 import BeautifulSoup
5 | import time
6 | import random
7 | from fake_useragent import UserAgent
8 | import datetime
9 | from selenium import webdriver
10 | from xvfbwrapper import Xvfb
11 | from selenium.webdriver.chrome.options import Options
12 | from database import DataBase
13 |
# Shared DB connection and the per-category cache of already-processed offer URLs.
db = DataBase()
visited_urls = []

# defining chrome options for selenium
options = Options()
options.add_argument("--no-sandbox")
20 |
21 |
def get_html(url):
    """Download *url* with a randomized Chrome User-Agent; return the body as bytes."""
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    # req.encoding may be None for unusual content types; str.encode(None)
    # raises TypeError, so fall back to the raw response bytes in that case.
    if req.encoding:
        return req.text.encode(req.encoding)
    return req.content
25 |
26 |
def get_date(html, k):
    """Classify the k-th listing's date: today's or yesterday's ISO date, else 'too old'."""
    soup = BeautifulSoup(html, "lxml")

    try:
        stamp = soup.find_all("span", class_="hidden-xs")[k].text.strip()
        today = datetime.datetime.today()
        if "сегодня" in stamp:
            return str(today).split()[0]
        if "вчера" in stamp:
            return str(today - datetime.timedelta(days=1)).split()[0]
        return "too old"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_date\n")
        return "Не указано"
43 |
44 |
def get_category(html, k):
    """Return the property kind named in the k-th listing title, or None."""
    soup = BeautifulSoup(html, "lxml")
    # Order matters: checked in the same sequence as the original elif chain.
    kinds = ("Квартира", "Дом", "Коттедж", "Таунхаус", "Дача", "Участок")

    try:
        title = soup.find_all("div", class_="product_item__title")[k].text.split(",")[0].strip()
        for kind in kinds:
            if kind in title:
                return kind
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_category\n")
    return None
66 |
67 |
def get_address(driver):
    """Split the offer address into (city, district, street, block number)."""
    try:
        address = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[0].find_elements_by_tag_name("span")[0].text.strip()
        # separating data from the address string
        district, street = "Не указано", "Не указано"
        city = address.split(",")[0]
        block_number = address.split(",")[-1].strip()
        # Geocoders label unnamed streets "unnamed road"; normalize to missing.
        # (Bug fix: the original performed this check inside the isdigit()
        # branch below, where block_number was always numeric, so it never fired.)
        if block_number.lower() == "unnamed road":
            block_number = "Не указано"
        if "ул " in block_number.lower() or "ул." in block_number.lower() or "улица" in block_number.lower() \
                or " пер" in block_number.lower() or "проспект" in block_number.lower() or "проезд" in block_number.lower():
            street = block_number
            block_number = "Не указано"

        for param in address.split(",")[1:-1]:
            if "ул " in param.lower() or "ул." in param.lower() or "улица" in param.lower() \
                    or " пер" in param.lower() or "проспект" in param.lower() or "проезд" in param.lower():
                street = param.strip()
            elif "район" in param.lower() or "р-н" in param.lower():
                district = param.strip()

        # A trailing house number may be glued onto the street string.
        if street.split()[-1].strip().isdigit():
            block_number = street.split()[-1].strip()
            street = " ".join(street.split()[:-1]).strip()

        return city, district, street, block_number
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_address\n")
        return ["Не указано"] * 4
98 |
99 |
def get_selling_type(url):
    """Derive (sell type, rent detail) from the markers embedded in the offer URL."""
    if "prodaja" in url:
        return "Продажа", "Не указано"
    if "arenda" in url:
        detail = "посуточно" if "posutochno" in url else "длительный срок"
        return "Аренда", detail
    return "Не указано", "Не указано"
112 |
113 |
def get_price(driver):
    """Return the price shown in the sticky header, or 'Не указано'."""
    try:
        header = driver.find_element_by_css_selector("div[class='sticky-inner-wrapper']")
        return header.find_element_by_tag_name("span").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_price\n")
        return "Не указано"
122 |
123 |
def get_seller_info(driver):
    """Return (seller type, seller name) from the ProductOwner card."""
    seller_type, seller_name = "Не указано", "Не указано"
    try:
        owner = driver.find_element_by_css_selector("div[data-test-component='ProductOwner']").find_element_by_tag_name("div")
        raw_name = owner.find_element_by_tag_name("a").text.strip()
        # Drop the trailing parenthesized counter, e.g. 'Иван (3)' -> 'Иван '.
        seller_name = raw_name[:raw_name.rfind("(")]
        seller_type = owner.find_element_by_tag_name("div").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_seller_info\n")
    return seller_type, seller_name
135 |
136 |
def get_photos(driver):
    """Collect photo URLs newline-separated; fall back to the gallery image."""
    try:
        sources = (el.get_attribute("src") for el in driver.find_elements_by_tag_name("div"))
        images = "\n".join(src for src in sources if src is not None)
        if not images:
            gallery = driver.find_element_by_css_selector("div[data-test-component='ProductGallery']")
            images = gallery.find_element_by_tag_name("img").get_attribute("src")
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_photos\n")
        images = "Не указано"
    return images
148 |
149 |
def get_description(driver):
    """Return the offer description text, or 'Не указано' when missing."""
    try:
        table = driver.find_element_by_tag_name("table")
        return table.find_elements_by_tag_name("tbody")[1].find_element_by_tag_name("td").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_description\n")
        return "Не указано"
158 |
159 |
def get_seller_phone(driver):
    """Click the 'show phone' button, wait for the popup, and read the number."""
    phone = "Не указано"
    try:
        reveal = driver.find_element_by_css_selector("button[data-test-action='PhoheNumberClick']")
        reveal.click()
        time.sleep(3)  # let the popup with the number render
        phone = driver.find_element_by_xpath('//*[@id="app"]/div[2]/div[10]/div/div/div/div[2]/div[2]/div/a').text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_seller_phone\n")
    return phone
171 |
172 |
def get_apartment_params(driver):
    """Expand the parameters table and extract apartment attributes.

    Returns (material, lift, year, rooms_number, floor, total_floors,
    total_area, kitchen_area, repair); anything absent stays 'Не указано'.
    """
    material, lift, year, rooms_number, floor, total_floors, total_area, kitchen_area, repair = ["Не указано"] * 9
    try:
        expand = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_element_by_tag_name("div")
        expand.click()
        params = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("th")
        values = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("td")
        for i in range(len(params)):
            label = params[i].text.strip()
            if "Комнат в квартире" in label:
                rooms_number = values[i].text.strip()
            elif "Общая площадь" in label:
                total_area = values[i].text.strip()
            # Bug fix: check the more specific label first — "Этаж" is a
            # substring of "Этажность дома", so the old order clobbered the
            # floor value and never set the building height.
            elif "Этажность дома" in label:
                total_floors = values[i].text.strip()
            elif "Этаж" in label:
                floor = values[i].text.strip().split()[0]
            elif "Площадь кухни" in label:
                kitchen_area = values[i].text.strip()
            elif "Ремонт" in label:
                repair = values[i].text.strip()
            elif "Лифт" in label:
                lift = values[i].text.strip()
            elif "Тип дома" in label:
                material = values[i].text.strip()
            elif "Год постройки" in label:
                year = values[i].text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_apartment_params\n")
    return material, lift, year, rooms_number, floor, total_floors, total_area, kitchen_area, repair
203 |
204 |
def get_cottage_params(driver):
    """Expand the parameters table and extract house/cottage attributes.

    Returns (total_area, material, total_floors, bedrooms, land_area, status,
    comforts); anything absent stays 'Не указано'.
    """
    total_area, material, total_floors, bedrooms, land_area, status, comforts = ["Не указано"] * 7
    comfort_labels = ["Электричество", "Газ", "Водоснабжение", "Отопление", "Гараж", "Санузлы"]
    try:
        expander = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_element_by_tag_name("div")
        expander.click()  # reveal the collapsed rows
        names = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("th")
        cells = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("td")
        for i in range(len(names)):
            label = names[i].text.strip()
            if "Площадь дома" in label:
                total_area = cells[i].text.strip()
            elif "Материал дома" in label:
                material = cells[i].text.strip()
            elif "Количество спален" in label:
                bedrooms = cells[i].text.strip()
            elif "Площадь участка" in label:
                land_area = cells[i].text.strip()
            elif "Этажей" in label:
                total_floors = cells[i].text.strip()
            elif "Тип участка" in label:
                status = cells[i].text.strip()
            elif any(word in label for word in comfort_labels):
                # Utilities accumulate as "Label - value; " entries.
                entry = label + " - " + cells[i].text.strip().lower() + "; "
                comforts = entry if comforts == "Не указано" else comforts + entry
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " youla get_cottage_params\n")
    return total_area, material, total_floors, bedrooms, land_area, status, comforts
234 |
235 |
def get_apartment_data(url):
    """Open an apartment offer page in headless Chrome and scrape it.

    Returns a list of fields in the column order the "Квартиры" table
    inserter expects: [city, district, street, block_number, sell_type,
    rent_info, price, block_type, rooms_number, total_area, total_floors,
    material, selling_detail, images, description, phone, kitchen_area,
    living_area, floor].
    """
    vdisplay = Xvfb()
    vdisplay.start()
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1920, 1080)
        driver.get(url)

        city, district, street, block_number = get_address(driver)
        sell_type, rent_info = get_selling_type(url)
        if "продажа" in sell_type.lower():
            rent_info = "Не аренда"
        material, lift, year, rooms_number, floor, total_floors, total_area, kitchen_area, repair = get_apartment_params(driver)
        # these two are not published on youla apartment pages
        block_type = "Вторичка"
        living_area = "Не указано"
        price = get_price(driver)
        if "Аренда" in sell_type:
            # daily-rent offer urls contain "posutochno"
            price += "/день" if "posutochno" in url else "/мес."
        images = get_photos(driver)
        description = get_description(driver)
        phone = get_seller_phone(driver)
        selling_detail = "Не указано"
    finally:
        # always release the browser and the virtual display, even when a
        # scraping helper raises — otherwise Chrome/Xvfb processes leak
        driver.quit()
        vdisplay.stop()

    return [city, district, street, block_number, sell_type, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, phone, kitchen_area, living_area, floor]
268 |
269 |
def get_cottage_data(url, category):
    """Open a house/land offer page in headless Chrome and scrape it.

    ``category`` is the listing category ("Дом", "Участок", ...); for
    plots the building fields are overridden with "Участок". Returns a
    list of fields in the column order the "Дома" table inserter expects.
    """
    vdisplay = Xvfb()
    vdisplay.start()
    driver = webdriver.Chrome(options=options)
    try:
        driver.set_window_size(1920, 1080)
        driver.get(url)

        # the offer type is encoded in the url slug
        if "doma" in url:
            cottage_type = "Дом"
        elif "uchastka" in url:
            cottage_type = "Участок"
        else:
            cottage_type = "Не указано"

        city, district, street, block_number = get_address(driver)
        sell_type, rent_info = get_selling_type(url)
        if "продажа" in sell_type.lower():
            rent_info = "Не аренда"
        price = get_price(driver)
        if "Аренда" in sell_type:
            # daily-rent offer urls contain "posutochno"
            price += "/день" if "posutochno" in url else "/мес."
        total_area, material, total_floors, bedrooms, land_area, status, comforts = get_cottage_params(driver)
        _, seller_name = get_seller_info(driver)
        images = get_photos(driver)
        description = get_description(driver)
        phone = get_seller_phone(driver)
        selling_detail = "Не указано"
    finally:
        # always release the browser and the virtual display, even when a
        # scraping helper raises — otherwise Chrome/Xvfb processes leak
        driver.quit()
        vdisplay.stop()

    if category == "Участок":
        # bare plots have no building to describe
        material, total_floors = "Участок", "Участок"

    return [city, district, street, block_number, sell_type, rent_info, price, cottage_type,
            total_area, comforts, selling_detail, images, description, phone, material,
            total_floors, land_area, status, seller_name]
310 |
311 |
def crawl_page(html):
    """Parse one listing page; return True when crawling should stop.

    Stops either when the page carries no offer cards (past the last page
    of the dynamic pagination) or when a regular offer is older than the
    crawl window. Parsed offers are written through the global ``db`` and
    remembered in ``visited_urls`` to skip duplicates. Per-offer errors
    are logged to logs.txt and the loop continues with the next offer.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    # pagination is dynamic and the page count is unknown, so the end is
    # detected by a page with no offer cards (find_all returns a list-like
    # ResultSet, never None)
    offers = soup.find_all("li", class_="product_item")
    if not offers:
        print("Парсинг завершен youla")
        return True
    k = 0
    for offer in offers:
        try:
            category = get_category(html, k)
            date = get_date(html, k)
            # NOTE(review): cards with a single CSS class appear to be
            # regular (non-promoted) offers sorted by date, so the first
            # too-old one ends the crawl — confirm against the markup
            if date == "too old" and len(offer.get("class")) == 1:
                print("Парсинг завершен youla")
                return True
            elif date == "too old":
                # promoted cards may be stale; keep them with an
                # approximate two-days-ago date
                date = str(datetime.datetime.today() - datetime.timedelta(days=2)).split()[0]
            k += 1
            url = "https://youla.ru" + offer.find("a").get("href")
            if url in visited_urls:
                print("youla not unique")
                time.sleep(random.uniform(10, 15))
                continue
            else:
                visited_urls.append(url)

            # skip uncategorized offers and offers outside Saratov
            if category is None or "saratov" not in url:
                time.sleep(random.uniform(5, 8))
                continue

            data = []
            if category == "Квартира":
                data = get_apartment_data(url)
                data.insert(15, date)
                if data[0] != "Не указано":
                    try:
                        db.insert_data("Квартиры", data)
                    except Exception:
                        # the DB connection may have dropped; reconnect
                        # once and retry (narrowed from a bare except so
                        # KeyboardInterrupt/SystemExit still propagate)
                        db.close()
                        db = DataBase()
                        db.insert_data("Квартиры", data)
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif any(x in category for x in ["Дом", "Коттедж", "Таунхаус", "Дача", "Участок"]):
                data = get_cottage_data(url, category)
                data.insert(13, date)
                if data[0] != "Не указано":
                    try:
                        db.insert_data("Дома", data)
                    except Exception:
                        # same reconnect-once fallback as above
                        db.close()
                        db = DataBase()
                        db.insert_data("Дома", data)
                    with open("total_data.txt", "a", encoding="utf8") as file:
                        file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))

            print("parsed page youla")

        except Exception as e:
            # best-effort: log and move on to the next offer
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " youla crawl_page\n")
379 |
380 |
def parse(url):
    """Walk the paginated listing, bumping the trailing ``page=`` number
    until crawl_page reports that the last page has been reached."""
    base = url[:url.rfind("=") + 1]
    page = 1
    while True:
        if crawl_page(get_html(base + str(page))):
            break
        page += 1
388 |
389 |
def main():
    """Entry point: crawl the last-day Saratov real-estate listings,
    newest first."""
    start_url = ("https://youla.ru/saratov/nedvijimost?attributes[sort_field]=date_published"
                 "&attributes[term_of_placement][from]=-1%20day"
                 "&attributes[term_of_placement][to]=now&page=1")
    parse(start_url)
393 |
394 |
if __name__ == "__main__":
    try:
        main()
    finally:
        # close the shared DB connection even when the crawl dies mid-run;
        # previously an exception in main() skipped db.close() entirely
        db.close()
398 |
--------------------------------------------------------------------------------