├── .gitignore ├── README.md ├── avito_parsing.py ├── cian_parsing.py ├── database.py ├── irr_parsing.py ├── kvadrat64_parsing.py ├── main.py ├── requirements.txt ├── ya_realty_parsing.py └── youla_parsing.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 
105 | 106 | .idea/ 107 | *.csv 108 | chromedriver.exe 109 | phone.gif 110 | phone_number.png 111 | logs.txt 112 | # breakpoints/ 113 | total_data.txt 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # real_estate_parsing -------------------------------------------------------------------------------- /avito_parsing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | import random 7 | import datetime 8 | from fake_useragent import UserAgent 9 | from selenium import webdriver 10 | from selenium.webdriver.chrome.options import Options 11 | from xvfbwrapper import Xvfb 12 | from PIL import Image 13 | from pytesseract import image_to_string 14 | import sys 15 | from database import DataBase 16 | 17 | # на каких записях останавливаться 18 | with open("breakpoints/avito.txt", "r", encoding="utf8") as file: 19 | breakpoints = file.readlines() 20 | try: 21 | break_apartment = tuple(breakpoints[0].strip().split("--")) 22 | except: 23 | break_apartment = None 24 | try: 25 | break_cottage = tuple(breakpoints[1].strip().split("--")) 26 | except: 27 | break_cottage = None 28 | try: 29 | break_land = tuple(breakpoints[2].strip().split("--")) 30 | except: 31 | break_land = None 32 | try: 33 | break_commercial = tuple(breakpoints[3].strip().split("--")) 34 | except: 35 | break_commercial = None 36 | 37 | 38 | #defining chrome options for selenium 39 | options = Options() 40 | options.add_argument('--no-sandbox') 41 | 42 | db = DataBase() 43 | visited_urls = [] 44 | 45 | 46 | def get_html(url): 47 | req = requests.get(url, headers={"User-Agent": UserAgent().chrome}) 48 | return req.text.encode(req.encoding) 49 | 50 | 51 | def get_total_pages(html): 52 | soup = BeautifulSoup(html, "lxml") 53 | try: 54 | pages = 
soup.find("div", class_="pagination-pages clearfix").find_all("a", class_="pagination-page")[-1].get("href") 55 | total_pages = int(pages.split("=")[1].split("&")[0]) 56 | except Exception as e: 57 | with open("logs.txt", "a", encoding="utf8") as file: 58 | file.write(str(e) + " avito get_total_pages\n") 59 | sys.exit(0) 60 | return total_pages 61 | 62 | 63 | def get_title(soup): 64 | try: 65 | title = soup.find("span", class_="title-info-title-text").text.strip() 66 | except Exception as e: 67 | with open("logs.txt", "a", encoding="utf8") as file: 68 | file.write(str(e) + " avito get_title\n") 69 | title = "Не указано" 70 | return title 71 | 72 | 73 | def get_address(soup): 74 | try: 75 | address = "{}, {}".format(soup.find("meta", itemprop="addressLocality").get("content").strip(), 76 | soup.find("span", itemprop="streetAddress").text.strip()) 77 | # separating data from the address string 78 | district, street = "Не указано", "Не указано" 79 | city = address.split(",")[0] 80 | block_number = address.split(",")[-1].strip() 81 | if "ул " in block_number.lower() or "ул." in block_number.lower() or "улица" in block_number.lower()\ 82 | or " пер" in block_number.lower() or "проспект" in block_number.lower() or "проезд" in block_number.lower(): 83 | street = block_number 84 | block_number = "Не указано" 85 | 86 | for param in address.split(",")[1:-1]: 87 | if "ул " in param.lower() or "ул." 
in param.lower() or "улица" in param.lower() \ 88 | or " пер" in param.lower() or "проспект" in param.lower() or "проезд" in param.lower(): 89 | street = param.strip() 90 | elif "район" in param.lower() or "р-н" in param.lower(): 91 | district = param.strip() 92 | 93 | if street.split()[-1].strip().isdigit(): 94 | block_number = street.split()[-1].strip() 95 | street = " ".join(street.split()[:-1]).strip() 96 | 97 | return city, district, street, block_number 98 | except Exception as e: 99 | with open("logs.txt", "a", encoding="utf8") as file: 100 | file.write(str(e) + " avito get_address\n") 101 | return ["Не указано"] * 4 102 | 103 | 104 | def get_selling_info(soup): 105 | try: 106 | per_meter = False # если цена указана за квадратный метр 107 | price = soup.find("span", class_="price-value-string js-price-value-string").text.strip() 108 | if "за сутки" in price: 109 | sell_type = "Аренда" 110 | rent_info = "посуточно" 111 | elif "в месяц" in price: 112 | sell_type = "Аренда" 113 | rent_info = "длительный срок" 114 | if "за " in price: 115 | per_meter = True 116 | else: 117 | sell_type = "Продажа" 118 | rent_info = "Не аренда" 119 | price = soup.find("span", class_="js-item-price").text.strip() 120 | # ошибка кодировки при записи, собираем сообщение вручную 121 | if rent_info == "посуточно": 122 | price = "от " + price + " за сутки" 123 | elif rent_info == "длительный срок": 124 | if per_meter: 125 | price = price + " в месяц за м2" 126 | else: 127 | price = price + " в месяц" 128 | except Exception as e: 129 | with open("logs.txt", "a", encoding="utf8") as file: 130 | file.write(str(e) + " avito get_selling info\n") 131 | sell_type, price, rent_info = ["Не указано"] * 3 132 | return sell_type, price, rent_info 133 | 134 | 135 | def get_deposit(soup): 136 | try: 137 | deposit = soup.find("div", class_="item-price-sub-price").text.strip() 138 | except Exception as e: 139 | with open("logs.txt", "a", encoding="utf8") as file: 140 | file.write(str(e) + " avito 
get_deposit\n") 141 | deposit = "Не указано" 142 | return deposit 143 | 144 | 145 | def get_seller_type(soup): 146 | try: 147 | seller_type = soup.find("div", class_="seller-info-prop seller-info-prop_short_margin") 148 | if seller_type is not None: 149 | seller_type = "Посредник" 150 | else: 151 | seller_type = "Собственник" 152 | except Exception as e: 153 | with open("logs.txt", "a", encoding="utf8") as file: 154 | file.write(str(e) + " avito get_seller_type\n") 155 | seller_type = "Не указано" 156 | return seller_type 157 | 158 | 159 | def get_seller_name(soup): 160 | try: 161 | seller_name = soup.find("div", class_="seller-info-name").find("a").text.strip() 162 | except Exception as e: 163 | with open("logs.txt", "a", encoding="utf8") as file: 164 | file.write(str(e) + " avito get_seller_name\n") 165 | seller_name = "Не указано" 166 | return seller_name 167 | 168 | 169 | def get_photos(soup): 170 | try: 171 | images = [] 172 | images_list = soup.find("ul", class_="gallery-list js-gallery-list").find_all("li", class_="gallery-list-item js-gallery-list-item") 173 | for image in images_list: 174 | link = image.find("span").get("style").split(":")[1].strip()[4:-2] 175 | images.append(link) 176 | images = "\n".join(images) 177 | except: 178 | # если нет фото, возьмем фото с "обложки" 179 | try: 180 | images = soup.find("span", class_="gallery-img-cover").get("style").split(":")[1].strip()[4:-2] 181 | except Exception as e: 182 | with open("logs.txt", "a", encoding="utf8") as file: 183 | file.write(str(e) + " avito get_photos\n") 184 | images = "Не указано" 185 | return images 186 | 187 | 188 | def get_description(soup): 189 | try: 190 | description = soup.find("div", class_="item-description-text").find("p").text.strip() 191 | except Exception as e: 192 | with open("logs.txt", "a", encoding="utf8") as file: 193 | file.write(str(e) + " avito get_description\n") 194 | description = "Не указано" 195 | return description 196 | 197 | 198 | def get_date(soup): 199 | try: 
200 | date = soup.find("div", class_="title-info-metadata-item").text.split(",")[1].strip() 201 | if "сегодня" in date: 202 | date = str(datetime.datetime.today()).split()[0] 203 | elif "вчера" in date: 204 | date = str(datetime.datetime.today() - datetime.timedelta(days=1)).split()[0] 205 | else: 206 | date = "too old" 207 | except Exception as e: 208 | with open("logs.txt", "a", encoding="utf8") as file: 209 | file.write(str(e) + " avito get_date\n") 210 | date = "Не указано" 211 | return date 212 | 213 | 214 | def get_seller_phone(url): 215 | # телефон показывается в виде картинки, используем selenium и pytesseract 216 | vdisplay = Xvfb() 217 | vdisplay.start() 218 | driver = webdriver.Chrome(options=options) 219 | driver.set_window_size(1920, 1080) 220 | driver.get(url) 221 | 222 | try: 223 | button = driver.find_element_by_xpath('//a[@class="button item-phone-button js-item-phone-button ' 224 | 'button-origin button-origin-blue button-origin_full-width ' 225 | 'button-origin_large-extra item-phone-button_hide-phone ' 226 | 'item-phone-button_card js-item-phone-button_card"]') 227 | button.click() 228 | time.sleep(2) 229 | driver.save_screenshot("phone_number.png") 230 | 231 | image = driver.find_element_by_xpath('//div[@class="item-phone-big-number js-item-phone-big-number"]//*') 232 | 233 | cropped = Image.open("phone_number.png") 234 | x, y = image.location["x"], image.location["y"] 235 | width, height = image.size["width"], image.size["height"] 236 | cropped.crop((x, y, x + width, y + height)).save("phone.gif") 237 | 238 | phone = Image.open("phone.gif") 239 | phone_text = image_to_string(phone) 240 | except Exception as e: 241 | with open("logs.txt", "a", encoding="utf8") as file: 242 | file.write(str(e) + " avito get_seller_phone\n") 243 | phone_text = "Не указано" 244 | 245 | driver.quit() 246 | vdisplay.stop() 247 | 248 | return phone_text 249 | 250 | 251 | def get_apartment_params(soup): 252 | rooms_number, floor_number, total_floors, material, 
total_area, kitchen_area, living_area = ["Не указано"] * 7 253 | block_type = "Вторичка" 254 | try: 255 | params = soup.find_all("li", class_="item-params-list-item") 256 | for i in range(len(params)): 257 | info = params[i].text.strip() 258 | if "Количество комнат" in info: 259 | rooms_number = info.split(":")[1].strip() 260 | elif "Этажей в доме" in info: 261 | total_floors = info.split(":")[1].strip() 262 | elif "Этаж" in info: 263 | floor_number = info.split(":")[1].strip() 264 | elif "Тип дома" in info: 265 | material = info.split(":")[1].strip() 266 | elif "Общая площадь" in info: 267 | total_area = info.split(":")[1].split("м²")[0].strip() 268 | elif "Площадь кухни" in info: 269 | kitchen_area = info.split(":")[1].split("м²")[0].strip() 270 | elif "Жилая площадь" in info: 271 | living_area = info.split(":")[1].split("м²")[0].strip() 272 | elif "Официальный застройщик" in info or "Название объекта недвижимости" in info: 273 | block_type = "Новостройка" 274 | except Exception as e: 275 | with open("logs.txt", "a", encoding="utf8") as file: 276 | file.write(str(e) + " avito get_apartment_params\n") 277 | return rooms_number, floor_number, total_floors, material, total_area, kitchen_area, living_area, block_type 278 | 279 | 280 | def get_cottage_params(soup): 281 | house_type, total_floors, distance, material, total_area, land_area = ["Не указано"] * 6 282 | try: 283 | params = soup.find_all("li", class_="item-params-list-item") 284 | for i in range(len(params)): 285 | info = params[i].text.strip() 286 | if "Вид объекта" in info: 287 | house_type = info.split(":")[1].strip() 288 | elif "Этажей в доме" in info: 289 | total_floors = info.split(":")[1].strip() 290 | elif "Расстояние до города" in info: 291 | distance = info.split(":")[1].split("км")[0].strip() + " км" 292 | elif "Материал стен" in info: 293 | material = info.split(":")[1].strip() 294 | elif "Площадь дома" in info: 295 | total_area = info.split(":")[1].split("м²")[0].strip() 296 | elif "Площадь 
участка" in info: 297 | land_area = info.split(":")[1].split("сот")[0].strip() + " сот" 298 | except Exception as e: 299 | with open("logs.txt", "a", encoding="utf8") as file: 300 | file.write(str(e) + " avito get_cottage_params\n") 301 | return house_type, total_floors, distance, material, total_area, land_area 302 | 303 | 304 | def get_land_params(soup): 305 | distance, area = "Не указано", "Не указано" 306 | try: 307 | labels = soup.find_all("span", class_="item-params-label") 308 | params = soup.find("div", class_="item-params").find_all("span") 309 | for i in range(len(labels)): 310 | info = params[i * 2].text.strip() 311 | label = labels[i].text.strip() 312 | if "Расстояние до города" in label: 313 | distance = info.split(":")[1].split("км")[0].strip() + " км" 314 | elif "Площадь" in label: 315 | area = info.split(":")[1].split("сот")[0].strip() + " сот" 316 | except Exception as e: 317 | with open("logs.txt", "a", encoding="utf8") as file: 318 | file.write(str(e) + " avito get_land_params\n") 319 | return distance, area 320 | 321 | 322 | def get_commercial_params(soup): 323 | office_class, area = "Не указано", "Не указано" 324 | try: 325 | labels = soup.find_all("span", class_="item-params-label") 326 | params = soup.find("div", class_="item-params").find_all("span") 327 | for i in range(len(labels)): 328 | info = params[i * 2].text.strip() 329 | label = labels[i].text.strip() 330 | if "Площадь" in label: 331 | area = info.split(":")[1].split("м²")[0].strip() 332 | elif "Класс здания" in label: 333 | office_class = info.split(":")[1].strip() 334 | except Exception as e: 335 | with open("logs.txt", "a", encoding="utf8") as file: 336 | file.write(str(e) + " avito get_commercial_params\n") 337 | return office_class, area 338 | 339 | 340 | def get_apartment_data(url, html): 341 | soup = BeautifulSoup(html, "lxml") 342 | 343 | title = get_title(soup) 344 | if "сниму" not in title.lower() and "куплю" not in title.lower(): 345 | city, district, street, block_number 
= get_address(soup) 346 | sell_type, price, rent_info = get_selling_info(soup) 347 | rooms_number, floor_number, total_floors, material, total_area, kitchen_area, living_area, block_type = get_apartment_params(soup) 348 | #seller_type = get_seller_type(soup) 349 | #seller_name = get_seller_name(soup) 350 | images = get_photos(soup) 351 | description = get_description(soup) 352 | phone = get_seller_phone(url) 353 | date = get_date(soup) 354 | selling_detail = "Не указано" # на авито не указывается эта информация 355 | 356 | return [city, district, street, block_number, sell_type, rent_info, price, block_type, 357 | rooms_number, total_area, total_floors, material, selling_detail, images, 358 | description, date, phone, kitchen_area, living_area, floor_number] 359 | 360 | return None 361 | 362 | 363 | def get_cottage_data(url, html): 364 | soup = BeautifulSoup(html, "lxml") 365 | 366 | title = get_title(soup) 367 | if "сниму" not in title.lower() and "куплю" not in title.lower(): 368 | city, district, street, block_number = get_address(soup) 369 | sell_type, price, rent_info = get_selling_info(soup) 370 | house_type, total_floors, distance, material, total_area, land_area = get_cottage_params(soup) 371 | #seller_type = get_seller_type(soup) 372 | seller_name = get_seller_name(soup) 373 | images = get_photos(soup) 374 | description = get_description(soup) 375 | phone = get_seller_phone(url) 376 | date = get_date(soup) 377 | selling_detail, comforts, land_status = ["Не указано"] * 3 # на авито не указывается эта информация 378 | 379 | return [city, district, street, block_number, sell_type, rent_info, price, house_type, 380 | total_area, comforts, selling_detail, images, description, date, phone, material, 381 | total_floors, land_area, land_status, seller_name] 382 | return None 383 | 384 | 385 | def get_land_data(url, html): 386 | soup = BeautifulSoup(html, "lxml") 387 | 388 | title = get_title(soup) 389 | if "сниму" not in title.lower() and "куплю" not in 
title.lower(): 390 | # категория земель указывается в скобках в названии объявления 391 | if "(" in title: 392 | land_type = title[title.find("(") + 1:].split(")")[0] 393 | else: 394 | land_type = "Не указано" 395 | 396 | city, district, street, _ = get_address(soup) 397 | sell_type, price, _ = get_selling_info(soup) 398 | 399 | if "Аренда" in sell_type: 400 | deposit = get_deposit(soup) 401 | else: 402 | deposit = "Не аренда" 403 | 404 | distance, area = get_land_params(soup) 405 | seller_type = get_seller_type(soup) 406 | seller_name = get_seller_name(soup) 407 | images = get_photos(soup) 408 | description = get_description(soup) 409 | phone = get_seller_phone(url) 410 | date = get_date(soup) 411 | 412 | return [city, district, street, sell_type, deposit, land_type, distance, area, price, seller_type, images, 413 | description, seller_name, phone, date] 414 | return None 415 | 416 | 417 | def get_commercial_data(url, html): 418 | soup = BeautifulSoup(html, "lxml") 419 | 420 | title = get_title(soup) 421 | if "сниму" not in title.lower() and "куплю" not in title.lower(): 422 | # анализируем вид помещения по заголовку 423 | if "офис" in title.lower(): 424 | object_type = "Офисное помещение" 425 | elif "торг" in title.lower(): 426 | object_type = "Торговое помещение" 427 | elif "гостиница" in title.lower(): 428 | object_type = "Гостиница" 429 | elif "свобод" in title.lower(): 430 | object_type = "Помещение свободного назначения" 431 | elif "производ" in title.lower(): 432 | object_type = "Производственное помещение" 433 | elif "склад" in title.lower(): 434 | object_type = "Складское помещение" 435 | else: 436 | object_type = "Не указано" 437 | 438 | city, district, street, block_number = get_address(soup) 439 | sell_type, price, _ = get_selling_info(soup) 440 | 441 | # if "Аренда" in sell_type: 442 | # deposit = get_deposit(soup) 443 | # else: 444 | # deposit = "Не аренда" 445 | 446 | # если не офис, не заполняем поле office_class 447 | if object_type == "Офисное 
помещение": 448 | office_class, area = get_commercial_params(soup) 449 | else: 450 | _, area = get_commercial_params(soup) 451 | office_class = "Не офис" 452 | 453 | #seller_type = get_seller_type(soup) 454 | seller_name = get_seller_name(soup) 455 | images = get_photos(soup) 456 | description = get_description(soup) 457 | phone = get_seller_phone(url) 458 | date = get_date(soup) 459 | furniture, entrance = "Не указано", "Не указано" # на авито не указывается эта информация 460 | 461 | return [city, district, street, block_number, sell_type, price, object_type, office_class, 462 | furniture, entrance, area, date, phone, images, description, seller_name] 463 | return None 464 | 465 | 466 | def crawl_page(first_offer, html, category): 467 | global visited_urls, db 468 | soup = BeautifulSoup(html, "lxml") 469 | try: 470 | offers = soup.find("div", class_="catalog-list").find_all("div", class_="item_table") 471 | except: 472 | offers = [] 473 | if offers is None or not offers: 474 | print("Парсинг завершен avito") 475 | return True 476 | for offer in offers: 477 | try: 478 | if first_offer: 479 | # сохраняем самую первую запись как точку выхода 480 | modifier = "w" if category == "Квартиры" else "a" 481 | with open("breakpoints/avito.txt", modifier, encoding="utf8") as file: 482 | file.write("%s--%s\n" % (offer.find("a", class_="item-description-title-link").get("title"), 483 | offer.find("span", {"class": "price", "itemprop": "price"}).get("content"))) 484 | first_offer = False 485 | 486 | if offer.find("div", class_="js-item-date c-2").text.strip() == "2 дня назад": 487 | print("Парсинг завершен avito") 488 | return True 489 | 490 | key_info = (offer.find("a", class_="item-description-title-link").get("title"), offer.find("span", {"class": "price", "itemprop": "price"}).get("content")) 491 | 492 | if any(x == key_info for x in [break_apartment, break_cottage, break_land, break_commercial]): 493 | print("Парсинг завершен avito") 494 | return True 495 | 496 | url = 
"https://avito.ru" + offer.find("div", class_="description").find("h3").find("a").get("href") 497 | if url in visited_urls: 498 | print("avito not unique") 499 | time.sleep(random.uniform(5, 8)) 500 | continue 501 | else: 502 | visited_urls.append(url) 503 | 504 | data = [] 505 | if category == "Квартиры": 506 | data = get_apartment_data(url, get_html(url)) 507 | # записываем ключевую информацию, чтобы потом найти дубликаты 508 | with open("total_data.txt", "a", encoding="utf8") as file: 509 | file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url)) 510 | elif category == "Дома": 511 | data = get_cottage_data(url, get_html(url)) 512 | with open("total_data.txt", "a", encoding="utf8") as file: 513 | file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url)) 514 | elif category == "Участки": 515 | data = get_land_data(url, get_html(url)) 516 | with open("total_data.txt", "a", encoding="utf8") as file: 517 | file.write("%s--%s--%s--%s\n" % (data[2], data[5], data[7], url)) 518 | elif category == "Коммерческая_недвижимость": 519 | data = get_commercial_data(url, get_html(url)) 520 | with open("total_data.txt", "a", encoding="utf8") as file: 521 | file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url)) 522 | 523 | if data[0] != "Не указано" and data is not None: 524 | try: 525 | db.insert_data(category, data) 526 | except: 527 | db.close() 528 | db = DataBase() 529 | db.insert_data(category, data) 530 | print("parsed page avito") 531 | 532 | #print(data) 533 | 534 | except Exception as e: 535 | with open("logs.txt", "a", encoding="utf8") as file: 536 | file.write(str(e) + " avito crawl_page\n") 537 | #print(str(e) + " avito crawl_page") 538 | 539 | time.sleep(random.uniform(5, 8)) 540 | 541 | 542 | def parse(category_url, base_url, category_name): 543 | page_part = "p=" 544 | parameters_part = "&s=104&s_trg=3&bt=1" 545 | 546 | total_pages = get_total_pages(get_html(category_url)) 547 | 548 | 
for page in range(1, total_pages + 1): 549 | url_gen = base_url + page_part + str(page) + parameters_part 550 | if page == 1: 551 | completed = crawl_page(True, get_html(url_gen), category_name) 552 | else: 553 | completed = crawl_page(False, get_html(url_gen), category_name) 554 | if completed: 555 | break 556 | 557 | 558 | def main(): 559 | global visited_urls 560 | url_apartments = "https://www.avito.ru/saratovskaya_oblast/kvartiry?p=1&s=104&s_trg=3&bt=1" 561 | base_url = "https://www.avito.ru/saratovskaya_oblast/kvartiry?" 562 | parse(url_apartments, base_url, "Квартиры") 563 | 564 | visited_urls = [] 565 | url_cottages = "https://www.avito.ru/saratovskaya_oblast/doma_dachi_kottedzhi?s=104&s_trg=3&bt=1" 566 | base_url = "https://www.avito.ru/saratovskaya_oblast/doma_dachi_kottedzhi?" 567 | parse(url_cottages, base_url, "Дома") 568 | 569 | visited_urls = [] 570 | url_lands = "https://www.avito.ru/saratovskaya_oblast/zemelnye_uchastki?s=104&s_trg=3&bt=1" 571 | base_url = "https://www.avito.ru/saratovskaya_oblast/zemelnye_uchastki?" 572 | parse(url_lands, base_url, "Участки") 573 | 574 | visited_urls = [] 575 | url_commercials = "https://www.avito.ru/saratovskaya_oblast/kommercheskaya_nedvizhimost?s=104&s_trg=3&bt=1" 576 | base_url = "https://www.avito.ru/saratovskaya_oblast/kommercheskaya_nedvizhimost?" 
577 | parse(url_commercials, base_url, "Коммерческая_недвижимость") 578 | 579 | 580 | if __name__ == "__main__": 581 | main() 582 | db.close() 583 | -------------------------------------------------------------------------------- /cian_parsing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | import random 7 | from fake_useragent import UserAgent 8 | import datetime 9 | from selenium import webdriver 10 | from xvfbwrapper import Xvfb 11 | from selenium.webdriver.chrome.options import Options 12 | from database import DataBase 13 | 14 | 15 | db = DataBase() 16 | visited_urls = [] 17 | 18 | # defining chrome options for selenium 19 | options = Options() 20 | options.add_argument("--no-sandbox") 21 | 22 | 23 | def get_html(url): 24 | req = requests.get(url, headers={"User-Agent": UserAgent().chrome}) 25 | return req.text.encode(req.encoding) 26 | 27 | 28 | def get_title(soup): 29 | try: 30 | title = soup.find("h1").text.strip() 31 | except Exception as e: 32 | #print(str(e) + " title") 33 | title = "Не указано" 34 | return title 35 | 36 | 37 | def get_address(soup): 38 | try: 39 | address = soup.find("address").text.strip() 40 | if "На карте" in address: 41 | address = address[:address.rfind("На карте")] 42 | # separating data from the address string 43 | district, street = "Не указано", "Не указано" 44 | city = address.split(",")[1].strip() 45 | block_number = address.split(",")[-1].strip() 46 | if "ул " in block_number.lower() or "ул." in block_number.lower() or "улица" in block_number.lower() \ 47 | or " пер" in block_number.lower() or "проезд" in block_number.lower() or "проспект" in block_number.lower(): 48 | street = block_number 49 | block_number = "Не указано" 50 | 51 | for param in address.split(",")[1:-1]: 52 | if "ул " in param.lower() or "ул." 
in param.lower() or "улица" in param.lower() or " пер" in param.lower() \ 53 | or "проезд" in param.lower() or "проспект" in param.lower(): 54 | street = param.strip() 55 | elif "район" in param.lower() or "р-н" in param.lower(): 56 | district = param.strip() 57 | 58 | if street.split()[-1].strip().isdigit(): 59 | block_number = street.split()[-1].strip() 60 | street = " ".join(street.split()[:-1]).strip() 61 | 62 | return city, district, street, block_number 63 | except Exception as e: 64 | with open("logs.txt", "a", encoding="utf8") as file: 65 | file.write(str(e) + " cian get_title\n") 66 | return ["Не указано"] * 4 67 | 68 | 69 | def get_price(soup): 70 | try: 71 | price = soup.find("span", {"itemprop": "price"}) 72 | if price is not None: 73 | price = price.text.strip() 74 | else: 75 | price = "от " + soup.find("span", {"itemprop": "lowPrice"}).text.strip() + \ 76 | " до " + soup.find("span", {"itemprop": "highPrice"}).text.strip() + "/мес." 77 | except Exception as e: 78 | with open("logs.txt", "a", encoding="utf8") as file: 79 | file.write(str(e) + " cian get_price\n") 80 | price = "Не указано" 81 | return price 82 | 83 | 84 | def get_selling_type(soup): 85 | try: 86 | paragraphs = [x for x in soup.find_all("p") if x.get("class") is not None 87 | and len(x.get("class")) == 1 and "description--" in x.get("class")[0]] 88 | if paragraphs: 89 | selling_type = paragraphs[0].text.strip() 90 | else: 91 | selling_type = "Не указано" 92 | except Exception as e: 93 | with open("logs.txt", "a", encoding="utf8") as file: 94 | file.write(str(e) + " cian get_selling_type\n") 95 | selling_type = "Не указано" 96 | return selling_type 97 | 98 | 99 | def get_seller_type(soup): 100 | try: 101 | divs = [x for x in soup.find_all("div") if x.get("class") is not None 102 | and len(x.get("class")) == 1 and "honest-container" in x.get("class")[0]] 103 | if not divs: 104 | seller_type = "Не указано" 105 | else: 106 | seller_type = divs[0].text.strip() 107 | if seller_type is not None 
and seller_type.lower() == "собственник": 108 | seller_type = "Собственник" 109 | else: 110 | seller_type = "Посредник" 111 | except Exception as e: 112 | with open("logs.txt", "a", encoding="utf8") as file: 113 | file.write(str(e) + " cian get_seller_type\n") 114 | seller_type = "Не указано" 115 | return seller_type 116 | 117 | 118 | def get_seller_name(soup): 119 | try: 120 | name = [x for x in soup.find_all("h2") if x.get("class") is not None and len(x.get("class")) == 1 121 | and "title--" in x.get("class")[0]] 122 | if name: 123 | name = name[0].text.strip() 124 | except Exception as e: 125 | with open("logs.txt", "a", encoding="utf8") as file: 126 | file.write(str(e) + " cian get_seller_name\n") 127 | name = "Не указано" 128 | return name 129 | 130 | 131 | def get_photos(url): 132 | try: 133 | driver = webdriver.Chrome() 134 | driver.get(url) 135 | 136 | images = [] 137 | images_list = driver.find_elements_by_class_name("fotorama__img") 138 | images_list = [x.get_attribute("src") for x in images_list if "-2." 
in x.get_attribute("src")] 139 | for image in images_list: 140 | link = image.replace("-2.", "-1.") 141 | images.append(link) 142 | images = "\n".join(images) 143 | except Exception as e: 144 | with open("logs.txt", "a", encoding="utf8") as file: 145 | file.write(str(e) + " cian get_photos\n") 146 | images = "Не указано" 147 | return images 148 | 149 | 150 | def get_description(soup): 151 | try: 152 | paragraphs = [x for x in soup.find_all("p") if x.get("class") is not None 153 | and len(x.get("class")) == 1 and "description-text--" in x.get("class")[0]] 154 | description = paragraphs[0].text.strip() 155 | except Exception as e: 156 | with open("logs.txt", "a", encoding="utf8") as file: 157 | file.write(str(e) + " cian get_description\n") 158 | description = "Не указано" 159 | return description 160 | 161 | 162 | def get_date(soup): 163 | try: 164 | date = soup.find("div", id="frontend-offer-card").find("main").find_all("div")[4].text.strip() 165 | if "вчера" in date: 166 | date = str(datetime.datetime.today() - datetime.timedelta(days=1)).split()[0] 167 | elif "сегодня" in date: 168 | date = str(datetime.datetime.today()).split()[0] 169 | else: 170 | date = "too old" 171 | except Exception as e: 172 | with open("logs.txt", "a", encoding="utf8") as file: 173 | file.write(str(e) + " cian get_date\n") 174 | date = "Не указано" 175 | return date 176 | 177 | 178 | def driver_get_phone_and_images(url): 179 | vdisplay = Xvfb() 180 | vdisplay.start() 181 | driver = webdriver.Chrome(options=options) 182 | driver.set_window_size(1920, 1080) 183 | driver.get(url) 184 | 185 | try: 186 | images = [] 187 | images_list = driver.find_elements_by_class_name("fotorama__img") 188 | images_list = [x.get_attribute("src") for x in images_list if "-2." 
in x.get_attribute("src")] 189 | for image in images_list: 190 | link = image.replace("-2.", "-1.") 191 | images.append(link) 192 | images = "\n".join(images) 193 | if not images: 194 | # берем с обложки 195 | images = driver.find_element_by_class_name("fotorama__img").get_attribute("src") 196 | except Exception as e: 197 | with open("logs.txt", "a", encoding="utf8") as file: 198 | file.write(str(e) + " cian get_images\n") 199 | images = "Не указано" 200 | 201 | try: 202 | button = [x for x in driver.find_elements_by_tag_name("button") if x.text.strip() == "Показать телефон"][-1] 203 | button.click() 204 | phone = "\n".join([x.text.strip() for x in driver.find_elements_by_tag_name("a") if x.get_attribute("class") is not None 205 | and "phone--" in x.get_attribute("class")]) 206 | except Exception as e: 207 | phone = "Не указано" 208 | with open("logs.txt", "a", encoding="utf8") as file: 209 | file.write(str(e) + " cian get_phone\n") 210 | driver.quit() 211 | vdisplay.stop() 212 | return images, phone 213 | 214 | 215 | def get_apartment_params(soup): 216 | block_type, rooms_number, total_floors, total_area, material, year, kitchen_area, living_area, floor = ["Не указано"] * 9 217 | try: 218 | main_params = [x.text.strip() for x in soup.find_all("div") if x.get("class") is not None 219 | and len(x.get("class")) == 1 and "info-title--" in x.get("class")[0]] 220 | main_values = [x.text.strip() for x in soup.find_all("div") if x.get("class") is not None 221 | and len(x.get("class")) == 1 and "info-text--" in x.get("class")[0]] 222 | for i in range(len(main_params)): 223 | if "Общая" in main_params[i]: 224 | total_area = main_values[i] 225 | elif "Построен" in main_params[i]: 226 | year = main_values[i] 227 | elif "Кухня" in main_params[i]: 228 | kitchen_area = main_values[i] 229 | elif "Жилая" in main_params[i]: 230 | living_area = main_values[i] 231 | 232 | desc_params = [x.text.strip() for x in soup.find_all("span") if x.get("class") is not None 233 | and 
def get_cottage_params(soup):
    """Extract cottage parameters from a cian offer page.

    Returns a 6-tuple (total_area, material, land_area, status, comforts,
    total_floors); any field that cannot be found is "Не указано".
    """
    total_area, material, land_area, status, comforts, total_floors = ["Не указано"] * 6
    try:
        main_params = [x.text.strip() for x in soup.find_all("div") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "info-title--" in x.get("class")[0]]
        main_values = [x.text.strip() for x in soup.find_all("div") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "info-text--" in x.get("class")[0]]
        for i in range(len(main_params)):
            if "Общая" in main_params[i]:
                total_area = main_values[i]
            elif "Участок" in main_params[i]:
                land_area = main_values[i]
            elif "Тип дома" in main_params[i]:
                material = main_values[i]
            elif "Этажей в доме" in main_params[i]:
                total_floors = main_values[i]

        comforts_list = [x.text.strip() for x in soup.find_all("li") if x.get("class") is not None
                         and len(x.get("class")) == 2 and "item--" in x.get("class")[0]]
        # BUG FIX: the original tested `if comforts:` which is always true here
        # (comforts holds the non-empty placeholder "Не указано"), so an empty
        # comforts_list used to overwrite the placeholder with "".
        if comforts_list:
            comforts = "; ".join(comforts_list)

        desc_params = [x.text.strip() for x in soup.find_all("span") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "name--" in x.get("class")[0]]
        desc_values = [x.text.strip() for x in soup.find_all("span") if x.get("class") is not None
                       and len(x.get("class")) == 1 and "value--" in x.get("class")[0]]
        for i in range(len(desc_params)):
            if "Статус участка" in desc_params[i]:
                status = desc_values[i]
            elif land_area == "Не указано" and "Площадь участка" in desc_params[i]:
                land_area = desc_values[i]
            elif material == "Не указано" and "Тип дома" in desc_params[i]:
                material = desc_values[i]
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " cian get_cottage_params\n")
    return total_area, material, land_area, status, comforts, total_floors
def get_apartment_data(html, url):
    """Collect all apartment fields from a cian offer page into the DB row layout.

    Note: the construction year is extracted but (as in the table schema) not
    part of the returned row.
    """
    soup = BeautifulSoup(html, "lxml")

    city, district, street, block_number = get_address(soup)
    price = get_price(soup)
    (block_type, rooms_number, total_floors, total_area, material,
     year, kitchen_area, living_area, floor) = get_apartment_params(soup)

    # split the raw selling label into rent duration vs. sale details
    selling_detail = get_selling_type(soup)
    lowered = selling_detail.lower()
    if "продажа" in lowered or "ипотека" in lowered:
        rent_info = "Не аренда"
    else:
        rent_info, selling_detail = selling_detail, "Не указано"

    description = get_description(soup)
    date = get_date(soup)
    images, phone = driver_get_phone_and_images(url)

    return [city, district, street, block_number, rent_info, price, block_type,
            rooms_number, total_area, total_floors, material, selling_detail, images,
            description, date, phone, kitchen_area, living_area, floor]
street, block_number = get_address(soup) 360 | price = get_price(soup) 361 | cottage_type = title.split(",")[0] 362 | total_area, material, land_area, status, comforts, total_floors = get_cottage_params(soup) 363 | selling_detail = get_selling_type(soup) 364 | if "продажа" in selling_detail.lower() or "ипотека" in selling_detail.lower(): 365 | rent_info = "Не аренда" 366 | else: 367 | rent_info = selling_detail 368 | selling_detail = "Не указано" 369 | description = get_description(soup) 370 | date = get_date(soup) 371 | images, phone = driver_get_phone_and_images(url) 372 | seller_name = get_seller_name(soup) 373 | 374 | return [city, district, street, block_number, rent_info, price, cottage_type, 375 | total_area, comforts, selling_detail, images, description, date, phone, material, 376 | total_floors, land_area, status, seller_name] 377 | 378 | 379 | def get_commercial_data(html, url): 380 | soup = BeautifulSoup(html, "lxml") 381 | 382 | title = get_title(soup) 383 | city, district, street, block_number = get_address(soup) 384 | price = get_price(soup) 385 | 386 | if "офис" in title.lower(): 387 | object_type = "Офисное помещение" 388 | elif "торговая площадь" in title.lower(): 389 | object_type = "Торговая площадь" 390 | elif "склад" in title.lower(): 391 | object_type = "Склад" 392 | elif "своб. назнач." 
def crawl_page(page, html, category, sell_type):
    """Parse one cian listing page; return True when crawling should stop.

    Fixes vs. original: the pagination comprehension now skips <li> tags
    without a class attribute (``len(None)`` used to raise TypeError), and
    the bare ``except:`` clauses are narrowed to ``except Exception:`` so
    KeyboardInterrupt/SystemExit are not swallowed.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    # dynamic pagination wraps back to page 1 when the listing is exhausted
    if page != 1 and "".join([x.text.strip() for x in soup.find_all("li")
                              if x.get("class") is not None and len(x.get("class")) == 2
                              and "list-item--active" in "".join(x.get("class"))]) == "1":
        print("Парсинг завершен cian")
        return True
    # pagination is dynamic and gives no page count, so check whether the page has offers
    try:
        offers = [x for x in soup.find("div", id="frontend-serp").find("div").find_all("div")
                  if x.get("class") is not None and "offer-container" in x.get("class")[0]]
    except Exception:
        offers = []
    if offers is None or not offers:
        print("Парсинг завершен cian")
        return True
    for offer in offers:
        try:
            url = offer.find("a").get("href")
            if url in visited_urls:
                print("cian not unique")
                time.sleep(random.uniform(5, 8))
                continue
            else:
                visited_urls.append(url)

            data = []
            if category == "Квартиры":
                data = get_apartment_data(get_html(url), url)
                # record the key fields so duplicates can be found later
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(get_html(url), url)
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            # position of the publication date inside the data row
            index_of_date = -1
            if category == "Квартиры" or category == "Коммерческая_недвижимость":
                index_of_date = -5
            elif category == "Дома":
                index_of_date = -7
            elif category == "Участки":
                index_of_date = -1
            if data[index_of_date] == "too old":
                print("Парсинг завершен cian")
                return True

            data.insert(4, sell_type)
            if data[0] != "Не указано":
                try:
                    db.insert_data(category, data)
                except Exception:
                    # the connection may have gone stale: reconnect and retry once
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page cian")

        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " cian crawl_page\n")

        time.sleep(random.uniform(5, 8))
get_html(url_gen), category_name, sell_type) 497 | page += 1 498 | 499 | 500 | def main(): 501 | global visited_urls 502 | url_cottages_sell = "https://saratov.cian.ru/cat.php?deal_type=sale&engine_version=2&object_type%5B0%5D=1&offer_type=suburban®ion=4609&totime=86400&page=1" 503 | parse(url_cottages_sell, "Дома", "Продажа") 504 | 505 | visited_urls = [] 506 | url_cottages_rent = "https://saratov.cian.ru/cat.php?deal_type=rent&engine_version=2&object_type%5B0%5D=1&offer_type=suburban®ion=4609&totime=86400&page=1" 507 | parse(url_cottages_rent, "Дома", "Аренда") 508 | 509 | visited_urls = [] 510 | url_commercials_sell = "https://saratov.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=offices&office_type%5B0%5D=1&office_type%5B10%5D=12&office_type%5B1%5D=2&office_type%5B2%5D=3&office_type%5B3%5D=4&office_type%5B4%5D=5&office_type%5B5%5D=6&office_type%5B6%5D=7&office_type%5B7%5D=9&office_type%5B8%5D=10&office_type%5B9%5D=11®ion=4609&totime=86400&page=1" 511 | parse(url_commercials_sell, "Коммерческая_недвижимость", "Продажа") 512 | 513 | visited_urls = [] 514 | url_commercials_rent = "https://saratov.cian.ru/cat.php?deal_type=rent&engine_version=2&offer_type=offices&office_type%5B0%5D=1&office_type%5B10%5D=12&office_type%5B1%5D=2&office_type%5B2%5D=3&office_type%5B3%5D=4&office_type%5B4%5D=5&office_type%5B5%5D=6&office_type%5B6%5D=7&office_type%5B7%5D=9&office_type%5B8%5D=10&office_type%5B9%5D=11®ion=4609&totime=86400&page=1" 515 | parse(url_commercials_rent, "Коммерческая_недвижимость", "Аренда") 516 | 517 | visited_urls = [] 518 | url_apartments_sell = "https://saratov.cian.ru/cat.php?deal_type=sale&engine_version=2&offer_type=flat®ion=4609&room1=1&room2=1&room3=1&room4=1&room5=1&room6=1&room7=1&room9=1&totime=86400&page=1" 519 | parse(url_apartments_sell, "Квартиры", "Продажа") 520 | 521 | visited_urls = [] 522 | url_apartments_rent = 
# using sensitive data placeholders, replace it with passwords
host = "host"
database = "db"
user = "user"
password = "pass"


class DataBase:
    """Small wrapper around a mysql-connector connection shared by the parsers."""

    def __init__(self):
        # NOTE(review): on connection failure the error is only printed and the
        # instance is left without conn/cursor attributes (original behaviour,
        # kept so module import never fails); close() guards against that state.
        try:
            self.conn = mysql.connector.connect(host=host, database=database, user=user, password=password)
            self.cursor = self.conn.cursor()
        except Error as error:
            print("Error while connecting to database", error)

    def close(self):
        """Release the cursor and the connection.

        Fix vs. original: tolerate a partially-initialised instance (when
        __init__ failed to connect) instead of raising AttributeError.
        """
        if getattr(self, "cursor", None) is not None:
            self.cursor.close()
        if getattr(self, "conn", None) is not None:
            self.conn.close()

    def create_table(self, category):
        """Create the table backing *category* (a known parser table name) if missing."""
        if category == "Квартиры":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Квартиры"
                                "(Город TEXT, Район TEXT, Улица TEXT, Номер_дома TEXT, "
                                "Тип_сделки TEXT, Срок_аренды TEXT, Цена TEXT, Тип_дома TEXT, Количество_комнат TEXT, "
                                "Общая_площадь TEXT, Количество_этажей TEXT, Материал_стен TEXT, Тип_продажи TEXT, "
                                "Фото TEXT, Описание TEXT, Дата TEXT, Телефон TEXT, Площадь_кухни TEXT, Жилая_площадь TEXT, "
                                "Этаж TEXT);")

        elif category == "Дома":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Дома"
                                "(Город TEXT, Район TEXT, Улица TEXT, Номер_дома TEXT, Тип_сделки TEXT, Срок_аренды TEXT, "
                                "Цена TEXT, Тип_дома TEXT, Площадь_дома TEXT, Удобства TEXT, Тип_продажи TEXT, "
                                "Фото TEXT, Описание TEXT, Дата TEXT, Телефон TEXT, Материал_стен TEXT, "
                                "Количество_этажей TEXT, Площадь_участка TEXT, Статус_участка TEXT, Имя_продавца TEXT);")

        elif category == "Коммерческая_недвижимость":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Коммерческая_недвижимость"
                                "(Город TEXT, Район TEXT, Улица TEXT, Номер_дома TEXT, Тип_сделки TEXT, Цена TEXT, "
                                "Тип_недвижимости TEXT, Класс_здания TEXT, Мебель TEXT, Вход TEXT, Общая_площадь TEXT, "
                                "Дата TEXT, Телефон TEXT, Фото TEXT, Описание TEXT, Имя_продавца TEXT);")

        elif category == "Участки":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Участки"
                                "(Город TEXT, Район TEXT, Улица TEXT, Тип_сделки TEXT, Залог TEXT, Статус_участка TEXT, "
                                "Расстояние_до_города TEXT, Площадь_участка TEXT, Цена TEXT, Право_собственности TEXT, "
                                "Фото TEXT, Описание TEXT, Имя_продавца TEXT, Телефон TEXT, Дата TEXT);")

        elif category == "Дубликаты":
            self.cursor.execute("CREATE TABLE IF NOT EXISTS Дубликаты (Заголовок TEXT, URLs TEXT);")

    def insert_data(self, table_name, data):
        """INSERT *data* (a sequence) as one row into *table_name*.

        table_name must be one of the parser-controlled table names: it is
        interpolated into the SQL text; only the values are parameterised.
        """
        data_string = ', '.join(['%s'] * len(data))
        query = "INSERT INTO %s VALUES (%s);" % (table_name, data_string)
        self.cursor.execute(query, data)
        self.conn.commit()
def get_html(url):
    """Download *url* with a random Chrome User-Agent and return the page bytes.

    Fix vs. original: requests sets Response.encoding to None when the server
    sends no charset, which made ``.encode(None)`` raise TypeError; fall back
    to UTF-8 in that case.
    """
    req = requests.get(url, headers={"User-Agent": UserAgent().chrome})
    return req.text.encode(req.encoding or "utf-8")


def get_total_pages(html):
    """Return the number of result pages (1 when no paginator is rendered)."""
    soup = BeautifulSoup(html, "lxml")
    total_pages = soup.find("div", class_="pagination__pages")
    if total_pages is not None:
        total_pages = total_pages.find_all("a", class_="pagination__pagesLink")[-1].text.strip()
    else:
        total_pages = 1
    return int(total_pages)


def get_title(soup):
    """Return the offer title, or "Не указано" on failure."""
    try:
        title = soup.find("h1", class_="productPage__title").text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_title\n")
        title = "Не указано"
    return title


def get_address(soup):
    """Return the city: the first comma-separated component of the offer address."""
    try:
        address = soup.find("div", class_="productPage__infoTextBold js-scrollToMap").text.strip()
        city = address.split(",")[0]
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_address\n")
        city = "Не указано"
    return city


def get_material(soup):
    """Return the wall material from the last building-info column, if present."""
    try:
        material = "Не указано"
        building_params = soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock")[-1].find_all("li", class_="productPage__infoColumnBlockText")
        for i in range(len(building_params)):
            info = building_params[i].text.strip()
            if "Материал стен" in info:
                material = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_material\n")
        material = "Не указано"
    return material


def get_price(soup):
    """Return (price, rent_info); rent_info classifies the rent duration by the price label."""
    try:
        price = " ".join(soup.find("div", class_="productPage__price").text.strip().split("\xa0"))
        fee = soup.find("div", class_="productPage__fee")
        if fee is not None:
            price += " (" + fee.text.strip() + ")"

        if "в месяц" in price:
            rent_info = "длительный срок"
        elif "за сутки" in price:
            rent_info = "посуточно"
        else:
            rent_info = "Не аренда"

    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_price\n")
        price, rent_info = "Не указано", "Не указано"
    return price, rent_info


def get_block_type(soup):
    """Classify the building as a new development when a seller-site link exists."""
    block_type = "Вторичка"
    try:
        seller_site = soup.find("a", class_="js-sellerSiteLink")
        if seller_site is not None:
            block_type = "Новостройка"
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_block_type\n")
    return block_type
def get_seller_info(soup):
    """Return (seller_type, seller_name).

    The seller is a company when the bold info block contains a profile link;
    otherwise it is a private person.  Both values are "Не указано" on failure.
    """
    try:
        holder = soup.find("div", class_="productPage__infoTextBold productPage__infoTextBold_inline")
        company_link = holder.find("a")
        if company_link is None:
            return "Частное лицо", holder.text.strip()
        return "Компания", company_link.text.strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_seller_info\n")
        return "Не указано", "Не указано"


def get_photos(soup):
    """Collect photo links from the gallery's <meta> tags, newline-separated."""
    try:
        gallery = soup.find("div", class_="lineGallery js-lineProductGallery")
        photo_links = [meta.get("content") for meta in gallery.find_all("meta")]
        images = "\n".join(photo_links)
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_photos\n")
        images = "Не указано"
    return images


def get_description(soup):
    """Return the offer description collapsed to single-spaced text."""
    try:
        raw = soup.find("p", class_="productPage__descriptionText js-productPageDescription").text
        description = " ".join(raw.strip().split())
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_description\n")
        description = "Не указано"
    return description
def get_seller_phone(soup):
    """Return the seller's phone number, which the page stores base64-encoded.

    Fix vs. original: the decode now happens inside the try block.  Previously
    a scraping failure set ciphered_phone = "Не указано" and then
    base64.b64decode() crashed on that non-ASCII placeholder; decode errors
    are now logged and "Не указано" is returned instead.
    """
    try:
        ciphered_phone = soup.find("input", {"class": "js-backendVar", "name": "phoneBase64"}).get("value")
        return base64.b64decode(ciphered_phone).decode("utf-8")
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_seller-phone\n")
        return "Не указано"
def get_commercial_params(soup):
    """Pull commercial-property attributes from the info columns.

    Returns (building_type, parking, ceilings, area, entrance, district,
    street, block_number); "Не указано" marks missing values.
    """
    building_type, parking, ceilings, area, entrance, district, street, block_number = ["Не указано"] * 8
    try:
        rows = []
        for column in soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock"):
            rows.extend(column.find_all("li", class_="productPage__infoColumnBlockText"))

        for row in rows:
            info = row.text.strip()
            if "Тип здания" in info:
                building_type = info.split(":")[1].strip()
            elif "Общая площадь" in info:
                area = info.split(":")[1].strip()
            elif "Парковка" in info:
                parking = "Парковка есть"
            elif "Высота потолков" in info:
                ceilings = info.split(":")[1].strip()
            elif "Вход" in info:
                entrance = info.strip()
            elif "Улица" in info:
                street = info.split(":")[1].strip()
            elif "Район города" in info:
                district = info.split(":")[1].strip()
            elif "Дом" in info:
                block_number = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_commercial_params\n")
    return building_type, parking, ceilings, area, entrance, district, street, block_number


def get_cottage_params(soup):
    """Pull cottage/land attributes from the info columns.

    Returns (house_area, material, total_floors, land_area, status, comforts,
    district, street, block_number); comforts joins all utility-related rows
    with "; ".  The elif order is load-bearing: utility keywords are matched
    before the address keys.
    """
    house_area, material, total_floors, land_area, status, district, street, block_number = ["Не указано"] * 8
    comfort_items = []
    utility_words = ["отапливаемый", "отопление", "водопровод", "канализация",
                     "свет", "газ", "вода", "интернет", "телефон"]
    try:
        rows = []
        for column in soup.find_all("div", class_="productPage__infoColumnBlock js-columnBlock"):
            rows.extend(column.find_all("li", class_="productPage__infoColumnBlockText"))

        for row in rows:
            info = row.text.strip()
            if "Площадь участка" in info:
                land_area = info.split(":")[1].strip()
            elif "Площадь строения" in info:
                house_area = info.split(":")[1].strip()
            elif "Материал стен" in info:
                material = info.split(":")[1].strip()
            elif "Количество этажей" in info:
                total_floors = info.split(":")[1].strip()
            elif "Вид разрешенного использования" in info:
                status = info.split(":")[1].strip()
            elif any(word in info.lower() for word in utility_words):
                comfort_items.append(info.strip())
            elif "Улица" in info:
                street = info.split(":")[1].strip()
            elif "Район города" in info:
                district = info.split(":")[1].strip()
            elif "Дом" in info:
                block_number = info.split(":")[1].strip()
    except Exception as e:
        with open("logs.txt", "a", encoding="utf8") as file:
            file.write(str(e) + " irr get_cottage_params\n")
    comforts = "; ".join(comfort_items) if comfort_items else "Не указано"
    return house_area, material, total_floors, land_area, status, comforts, district, street, block_number
def get_commercial_data(html):
    """Assemble the commercial-property DB row from an irr offer page."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    # classify the premises by title keywords; first match wins
    lowered_title = title.lower()
    markers = (
        ("офис", "Офисное помещение"),
        ("торг", "Торговое помещение"),
        ("гостиница", "Гостиница"),
        ("производ", "Производственное помещение"),
        ("склад", "Складское помещение"),
        ("помещение", "Помещение свободного назначения"),
    )
    object_type = next((label for marker, label in markers if marker in lowered_title), "Не указано")

    city = get_address(soup)
    building_type, parking, ceilings, area, entrance, district, street, block_number = get_commercial_params(soup)
    price, rent_info = get_price(soup)
    seller_type, seller_name = get_seller_info(soup)
    images = get_photos(soup)
    description = get_description(soup)
    phone = get_seller_phone(soup)
    date = get_date(soup)
    office_class, furniture = "Не указано", "Не указано"  # irr pages do not provide these

    return [city, district, street, block_number, price, object_type, office_class,
            furniture, entrance, area, date, phone, images, description, seller_name]
def crawl_page(first_offer, html, category, sell_type):
    """Parse one irr listing page; return True when crawling should stop.

    Stops on: an empty page, an offer dated *date_break_point* (the saved
    cut-off day), or an offer matching a breakpoint stored by the previous run.

    Fix vs. original: bare ``except:`` clauses narrowed to
    ``except Exception:`` so KeyboardInterrupt/SystemExit are not swallowed.
    """
    global visited_urls, db
    soup = BeautifulSoup(html, "lxml")
    try:
        offers = soup.find("div", class_="listing js-productGrid ").find_all("div", class_="listing__item")
    except Exception:
        offers = []
    if offers is None or not offers:
        print("Парсинг завершен irr")
        return True
    for offer in offers:
        try:
            date = offer.find("span", class_="listing__itemDate").find("div", class_="updateProduct").text.strip()
            if date == date_break_point:
                print("Парсинг завершен irr")
                return True

            url = offer.find("div", class_="listing__itemTitleWrapper").find("a", class_="listing__itemTitle").get("href")
            if url in visited_urls:
                print("irr not unique")
                time.sleep(random.uniform(5, 8))
                continue
            else:
                visited_urls.append(url)

            data = []
            if category == "Квартиры":
                data = get_apartment_data(get_html(url))
                # record the key fields so duplicates can be found later
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url))
            elif category == "Дома":
                data = get_cottage_data(get_html(url))
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url))
            elif category == "Коммерческая_недвижимость":
                data = get_commercial_data(get_html(url))
                with open("total_data.txt", "a", encoding="utf8") as file:
                    file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url))

            if first_offer:
                # save the very first record as next run's exit point
                modifier = "w" if (category == "Квартиры" and sell_type == "Продажа") else "a"
                with open("breakpoints/irr.txt", modifier, encoding="utf8") as file:
                    file.write("%s--%s\n" % (data[2], data[5]))
                first_offer = False

            key_info = (data[2], data[5])

            if any(x == key_info for x in [break_apartment_sell, break_apartment_rent, break_commercial_sell,
                                           break_commercial_rent, break_cottage_sell, break_cottage_rent]):
                print("Парсинг завершен irr")
                return True

            data.insert(4, sell_type)
            if data[0] != "Не указано":
                try:
                    db.insert_data(category, data)
                except Exception:
                    # the connection may have gone stale: reconnect and retry once
                    db.close()
                    db = DataBase()
                    db.insert_data(category, data)
                print("parsed page irr")

        except Exception as e:
            with open("logs.txt", "a", encoding="utf8") as file:
                file.write(str(e) + " irr crawl_page\n")

        time.sleep(random.uniform(5, 8))


def parse(category_url, category_name, sell_type):
    """Walk all result pages of *category_url* until crawl_page signals a stop."""
    page_part = "page"

    total_pages = get_total_pages(get_html(category_url))

    for page in range(1, total_pages + 1):
        url_gen = category_url + page_part + str(page)
        # only the very first page's first offer becomes the new breakpoint
        completed = crawl_page(page == 1, get_html(url_gen), category_name, sell_type)
        if completed:
            break
499 | 500 | 501 | def main(): 502 | global visited_urls 503 | # на сайте есть разделения продажа/аренда 504 | # сначала парсим страницу с предложениями продажи 505 | url_apartments_sell = "https://saratovskaya-obl.irr.ru/real-estate/apartments-sale/sort/date_sort:desc/" 506 | parse(url_apartments_sell, "Квартиры", "Продажа") 507 | 508 | visited_urls = [] 509 | url_apartments_rent = "https://saratovskaya-obl.irr.ru/real-estate/rent/sort/date_sort:desc/" 510 | parse(url_apartments_rent, "Квартиры", "Аренда") 511 | 512 | visited_urls = [] 513 | url_commercials_sell = "https://saratovskaya-obl.irr.ru/real-estate/commercial-sale/sort/date_sort:desc/" 514 | parse(url_commercials_sell, "Коммерческая_недвижимость", "Продажа") 515 | 516 | visited_urls = [] 517 | url_commercials_rent = "https://saratovskaya-obl.irr.ru/real-estate/commercial/sort/date_sort:desc/" 518 | parse(url_commercials_rent, "Коммерческая_недвижимость", "Аренда") 519 | 520 | visited_urls = [] 521 | url_cottages_sell = "https://saratovskaya-obl.irr.ru/real-estate/out-of-town/sort/date_sort:desc/" 522 | parse(url_cottages_sell, "Дома", "Продажа") 523 | 524 | visited_urls = [] 525 | url_cottages_rent = "https://saratovskaya-obl.irr.ru/real-estate/out-of-town-rent/sort/date_sort:desc/" 526 | parse(url_cottages_rent, "Дома", "Аренда") 527 | 528 | 529 | if __name__ == "__main__": 530 | main() 531 | db.close() 532 | -------------------------------------------------------------------------------- /kvadrat64_parsing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | import random 7 | from fake_useragent import UserAgent 8 | import datetime 9 | from selenium import webdriver 10 | from xvfbwrapper import Xvfb 11 | from selenium.webdriver.chrome.options import Options 12 | from database import DataBase 13 | 14 | 15 | # на каких записях останавливаться 16 | with 
open("breakpoints/kvadrat.txt", "r", encoding="utf8") as file: 17 | breakpoints = file.readlines() 18 | try: 19 | break_apartment_sell = tuple(breakpoints[0].strip().split("--")) 20 | except: 21 | break_apartment_sell = None 22 | try: 23 | break_apartment_rent = tuple(breakpoints[1].strip().split("--")) 24 | except: 25 | break_apartment_rent = None 26 | try: 27 | break_cottage_sell = tuple(breakpoints[2].strip().split("--")) 28 | except: 29 | break_cottage_sell = None 30 | try: 31 | break_cottage_rent = tuple(breakpoints[3].strip().split("--")) 32 | except: 33 | break_cottage_rent = None 34 | try: 35 | break_commercial_sell = tuple(breakpoints[4].strip().split("--")) 36 | except: 37 | break_commercial_sell = None 38 | try: 39 | break_commercial_rent = tuple(breakpoints[5].strip().split("--")) 40 | except: 41 | break_commercial_rent = None 42 | try: 43 | break_dacha_sell = tuple(breakpoints[6].strip().split("--")) 44 | except: 45 | break_dacha_sell = None 46 | try: 47 | break_saratov_land_sell = tuple(breakpoints[7].strip().split("--")) 48 | except: 49 | break_saratov_land_sell = None 50 | try: 51 | break_region_land_sell = tuple(breakpoints[8].strip().split("--")) 52 | except: 53 | break_region_land_sell = None 54 | 55 | # defining chrome options for selenium 56 | options = Options() 57 | options.add_argument("--no-sandbox") 58 | 59 | db = DataBase() 60 | visited_urls = [] 61 | 62 | 63 | def transform_date(date_str): 64 | """ 65 | Преобразуем дату, чтобы сравнить datetime-объекты 66 | """ 67 | day, month, year = date_str.split("-") 68 | if day[0] == "0": 69 | day = day[1] 70 | if month[0] == "0": 71 | month = month[1] 72 | 73 | date = datetime.datetime(int(year), int(month), int(day)) 74 | return date 75 | 76 | 77 | def get_html(url): 78 | # сайт использует кодировку windows-1251, поэтому меняем на utf-8 79 | req = requests.get(url, headers={"User-Agent": UserAgent().chrome}) 80 | return req.text.encode(req.encoding) 81 | 82 | 83 | def get_total_pages(html): 84 | 
soup = BeautifulSoup(html, "lxml") 85 | try: 86 | total_pages = soup.find("div", class_="a t100") 87 | if total_pages is not None: 88 | total_pages = total_pages.find_all("a", class_="phase")[-1].text.strip() 89 | else: 90 | total_pages = 0 91 | except Exception as e: 92 | total_pages = 0 93 | with open("logs.txt", "a", encoding="utf8") as file: 94 | file.write(str(e) + " kvadrat get_total_pages\n") 95 | return int(total_pages) 96 | 97 | 98 | def get_title(soup): 99 | try: 100 | title = soup.find("td", class_="hh").text.strip() 101 | except Exception as e: 102 | title = "Не указано" 103 | with open("logs.txt", "a", encoding="utf8") as file: 104 | file.write(str(e) + " kvadrat get_title\n") 105 | return title 106 | 107 | 108 | def get_price(soup): 109 | try: 110 | price = soup.find("td", class_="thprice").text.strip() 111 | except Exception as e: 112 | with open("logs.txt", "a", encoding="utf8") as file: 113 | file.write(str(e) + " kvadrat get_price\n") 114 | price = "Не указано" 115 | return price 116 | 117 | 118 | def get_commercial_price(soup): 119 | price = "Не указано" 120 | try: 121 | aggregated = [x.find_all("span", class_="d") for x in soup.find_all("td", class_="tddec2")] # список из всех ссылок из tddec2 122 | flat_aggregated = [item for sublist in aggregated for item in sublist] # из двумерного списка делаем одномерный 123 | price_params = [x.text.strip() for x in flat_aggregated] 124 | for param in price_params: 125 | if "за м²" in param: 126 | price = "м2".join(param.split("м²")) 127 | except Exception as e: 128 | with open("logs.txt", "a", encoding="utf8") as file: 129 | file.write(str(e) + " kvadrat get_price\n") 130 | return price 131 | 132 | 133 | def get_selling_type(soup): 134 | try: 135 | # если продажа, ищем тип продажи 136 | selling_type = "; ".join([x.text.strip() for x in soup.find("td", class_="tddec2").find_all("span", class_="d")]) 137 | if not selling_type: 138 | selling_type = "Не продажа" 139 | # если аренда - срок аренды 140 | 
rent_info = [x.text.strip() for x in soup.find_all("td", class_="tddec2")[-2].find_all("span", class_="d")] 141 | for info in rent_info: 142 | if "аренда" in info: 143 | rent_info = info 144 | break 145 | if not rent_info: 146 | rent_info = "Не аренда" 147 | except Exception as e: 148 | with open("logs.txt", "a", encoding="utf8") as file: 149 | file.write(str(e) + " kvadrat get_selling_type\n") 150 | selling_type = "Не указано" 151 | rent_info = "Не указано" 152 | return selling_type, rent_info 153 | 154 | 155 | def get_photos(soup): 156 | try: 157 | images = [] 158 | # список ссылок на картинки в полном размере 159 | td_images = soup.find("td", class_="tdimg").find_all("a") 160 | for image_item in td_images: 161 | link = "https://kvadrat64.ru/" + image_item.get("href") 162 | html_gallery = BeautifulSoup(get_html(link), "lxml") 163 | image = html_gallery.find("img", {"style": "cursor:pointer;"}) 164 | if image is not None: 165 | images.append("https://kvadrat64.ru/" + image.get("src")) 166 | images = "\n".join(images) 167 | # если нет картинок в галерее, пытаемся вытащить с облоджки 168 | if not images: 169 | images = "https://kvadrat64.ru/" + soup.find("div", id="mainfotoid").find("img").get("src") 170 | except Exception as e: 171 | with open("logs.txt", "a", encoding="utf8") as file: 172 | file.write(str(e) + " kvadrat get_photos\n") 173 | images = "Не указано" 174 | return images 175 | 176 | 177 | def get_description(soup): 178 | try: 179 | description = soup.find("p", class_="dinfo").text.strip().replace("\r", "") 180 | except Exception as e: 181 | with open("logs.txt", "a", encoding="utf8") as file: 182 | file.write(str(e) + " kvadrat get_description\n") 183 | description = "Не указано" 184 | return description 185 | 186 | 187 | def get_date(soup): 188 | try: 189 | date = soup.find("div", class_="tdate").text.strip().split(",")[1] 190 | if "сделать" in date: 191 | date = date.split("сделать")[0].split("создано")[1].strip() 192 | else: 193 | date = 
date.split("VIP")[0].split("создано")[1].strip() 194 | date = transform_date(date) 195 | except Exception as e: 196 | with open("logs.txt", "a", encoding="utf8") as file: 197 | file.write(str(e) + " kvadrat get_date\n") 198 | date = "Не указано" 199 | return date 200 | 201 | 202 | def get_seller_name(soup): 203 | try: 204 | name = soup.find_all("td", class_="tddec2")[-1].find("span").text.strip() 205 | except Exception as e: 206 | with open("logs.txt", "a", encoding="utf8") as file: 207 | file.write(str(e) + " kvadrat get_seller_name\n") 208 | name = "Не указано" 209 | return name 210 | 211 | 212 | def get_seller_phone(url, soup): 213 | phone = "Не указано" 214 | # телефон появляется динамически, используем selenium 215 | try: 216 | # иногда посредники указывают телефон в самом тексте; проверяем это 217 | tddec = soup.find_all("td", class_="tddec2")[-1].find_all(text=True) 218 | found = False 219 | for i in range(len(tddec)): 220 | if "Персона для контактов" in tddec[i]: 221 | phone = tddec[i + 1].split(",")[-1].strip() 222 | found = True 223 | elif "Контактный телефон" in tddec[i]: 224 | found = False 225 | 226 | if "".join(phone.split()).isalpha(): 227 | phone = "Не указано" 228 | 229 | if not found: 230 | vdisplay = Xvfb() 231 | vdisplay.start() 232 | driver = webdriver.Chrome(options=options) 233 | driver.set_window_size(1920, 1080) 234 | driver.get(url) 235 | 236 | button = driver.find_element_by_xpath('//span[@class="showphone"]') 237 | button.click() 238 | time.sleep(3) 239 | seller_info = driver.find_elements_by_xpath('//td[@class="tddec2"]')[-1].text 240 | for info in seller_info.split("\n"): 241 | if "Контактный телефон" in info: 242 | phone = info.split(":")[1].strip() 243 | driver.quit() 244 | vdisplay.stop() 245 | except Exception as e: 246 | with open("logs.txt", "a", encoding="utf8") as file: 247 | file.write(str(e) + " kvadrat get_seller_phone\n") 248 | phone = "Не указано" 249 | return phone 250 | 251 | 252 | def get_apartment_params(soup): 253 | 
block_type, total_area, kitchen_area, living_area, floor, total_floors, material = ["Не указано"] * 7 254 | try: 255 | ### 256 | # из-за кривой структуры сайта, формируем все сами в удобный формат 257 | params_raw = str(soup.find("td", class_="tddec")).split("
") 258 | params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0") 259 | for param in params_raw[1:]: 260 | params.append(BeautifulSoup(param, "lxml").text.strip()) 261 | ### 262 | new_block = False # в новостройке ли квартира 263 | add_info = "" # дата сдачи, застройщик (указываем в одноц графе) 264 | for param in params: 265 | if "Площадь общая" in param: 266 | total_area = param.split(":")[1].split("м²")[0].strip() + " м2" 267 | elif "Кухня" in param: 268 | kitchen_area = param.split(":")[1].split("м²")[0].strip() + " м2" 269 | elif "Жилая" in param: 270 | living_area = param.split(":")[1].split("м²")[0].strip() + " м2" 271 | elif "этажей в доме" in param: 272 | total_floors = param.split(":")[1].split("/")[1] 273 | floor = param.split(":")[1].split("/")[0].split()[1] 274 | elif "cтроение" in param: 275 | material = param.split(":")[1].strip() 276 | elif "Застройщик" in param or "Дата сдачи" in param or "Стадия строительства" in param: 277 | new_block = True 278 | add_info += param.split(":")[1] + ";" 279 | 280 | if new_block: 281 | block_type = "Новостройка " + add_info 282 | else: 283 | block_type = "Вторичка" 284 | except Exception as e: 285 | with open("logs.txt", "a", encoding="utf8") as file: 286 | file.write(str(e) + " kvadrat get_apartment_params\n") 287 | return block_type, total_area, kitchen_area, living_area, floor, total_floors, material 288 | 289 | 290 | def get_cottage_params(soup): 291 | total_area, material, comforts, total_floors, land_area = ["Не указано"] * 5 292 | try: 293 | ### 294 | # из-за кривой структуры сайта, формируем все сами в удобный формат 295 | params_raw = str(soup.find("td", class_="tddec")).split("
") 296 | params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0") 297 | for param in params_raw[1:]: 298 | params.append(BeautifulSoup(param, "lxml").text.strip()) 299 | ### 300 | for param in params: 301 | if "Площадь общая" in param: 302 | total_area = param.split(":")[1].split("м²")[0].strip() + " м2" 303 | elif "cтроение" in param: 304 | material = param.split(":")[1].strip() 305 | elif "Площадь участка" in param: 306 | land_area = param.split(":")[1].strip() 307 | elif "Этажей" in param: 308 | total_floors = param.split(":")[1].strip() 309 | elif "Коммуникации" in param: 310 | comforts = param.split(":")[1].strip() 311 | except Exception as e: 312 | with open("logs.txt", "a", encoding="utf8") as file: 313 | file.write(str(e) + " kvadrat get_cottage_params\n") 314 | return total_area, material, comforts, total_floors, land_area 315 | 316 | 317 | def get_commercial_params(soup): 318 | object_type, area = ["Не указано"] * 2 319 | try: 320 | ### 321 | # из-за кривой структуры сайта, формируем все сами в удобный формат 322 | params_raw = str(soup.find("td", class_="tddec")).split("
") 323 | params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0") 324 | for param in params_raw[1:]: 325 | params.append(BeautifulSoup(param, "lxml").text.strip()) 326 | ### 327 | for param in params: 328 | if "Объект" in param: 329 | object_type = param.split(":")[1].strip() 330 | elif "площадь" in param: 331 | area = param.split(":")[1].strip() 332 | except Exception as e: 333 | with open("logs.txt", "a", encoding="utf8") as file: 334 | file.write(str(e) + " kvadrat get_commercial_params\n") 335 | return object_type, area 336 | 337 | 338 | def get_dacha_params(soup): 339 | total_area = "Не указано" 340 | try: 341 | ### 342 | # из-за кривой структуры сайта, формируем все сами в удобный формат 343 | params_raw = str(soup.find("td", class_="tddec")).split("
") 344 | params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0") 345 | for param in params_raw[1:]: 346 | params.append(BeautifulSoup(param, "lxml").text.strip()) 347 | ### 348 | for param in params: 349 | if "Площадь дома" in param: 350 | total_area = param.split(":")[1].strip() 351 | break 352 | except Exception as e: 353 | with open("logs.txt", "a", encoding="utf8") as file: 354 | file.write(str(e) + " kvadrat get_dacha_params\n") 355 | return total_area 356 | 357 | 358 | def get_land_params(soup): 359 | total_area, land_type = ["Не указано"] * 2 360 | try: 361 | ### 362 | # из-за кривой структуры сайта, формируем все сами в удобный формат 363 | params_raw = str(soup.find("td", class_="tddec")).split("
") 364 | params = BeautifulSoup(params_raw[0], "lxml").find("td", class_="tddec").text.strip().split("\xa0") 365 | for param in params_raw[1:]: 366 | params.append(BeautifulSoup(param, "lxml").text.strip()) 367 | ### 368 | for param in params: 369 | if "Площадь участка" in param: 370 | total_area = param.split(":")[1].strip() 371 | elif "Тип земли" in params: 372 | land_type = param.split(":")[1].strip() 373 | except Exception as e: 374 | with open("logs.txt", "a", encoding="utf8") as file: 375 | file.write(str(e) + " kvadrat get_land_params\n") 376 | return total_area, land_type 377 | 378 | 379 | def get_apartment_data(html, url): 380 | soup = BeautifulSoup(html, "lxml") 381 | 382 | title = get_title(soup) 383 | if "сниму" not in title.lower(): 384 | address = ",".join(title.split(",")[1:]).strip() 385 | address = address[:address.rfind(" на карте")] 386 | if "сдам" in address.lower(): 387 | address = " ".join(address.split()[1:]) 388 | if "(" in address: 389 | address = address[:address.rfind("(")] 390 | 391 | city = address.split(",")[-1].strip() 392 | district = address.split(",")[-2].strip() 393 | block_number = address.split(",")[-3].strip() 394 | street = address.split(",")[-4].strip() 395 | 396 | rooms_number = title.split(",")[0] 397 | block_type, total_area, kitchen_area, living_area, floor, total_floors, material = get_apartment_params(soup) 398 | price = get_price(soup) 399 | selling_detail, rent_info = get_selling_type(soup) # чистая продажа/ипотека/без посредников; если аренда, срок аренды 400 | if not selling_detail: 401 | selling_detail = "Не продажа" 402 | images = get_photos(soup) 403 | description = get_description(soup) 404 | phone = get_seller_phone(url, soup) 405 | date = get_date(soup) 406 | 407 | return [city, district, street, block_number, rent_info, price, block_type, 408 | rooms_number, total_area, total_floors, material, selling_detail, images, 409 | description, date, phone, kitchen_area, living_area, floor] 410 | return None 411 | 412 
def get_cottage_data(html, url):
    """Parse a house offer page into the house-listing record; None for 'сниму' ads."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" not in title.lower():
        address = ",".join(title.split(",")[1:]).strip()
        address = address[:address.rfind(" на карте")]
        if "(" in address:
            address = address[:address.rfind("(")]

        if address == address.upper():
            # all-caps address: settlement-only form with no district
            # FIX: pad *and* slice, so more than 3 comma parts no longer
            # raises ValueError on unpacking (was pad-only)
            city, street, block_number = (address.split(",") + ["Не указано"] * 3)[:3]
            district = "Не указано"
        else:
            city = address.split(",")[-1].strip()
            district = address.split(",")[-2].strip()
            block_number = address.split(",")[-3].strip()
            street = address.split(",")[-4].strip()

        cottage_type = title.split(",")[0]
        if "сдам" in cottage_type.lower():
            cottage_type = " ".join(cottage_type.split()[1:])
        price = get_price(soup)
        total_area, material, comforts, total_floors, land_area = get_cottage_params(soup)
        # clean sale / mortgage / no middlemen; for rentals, the rental term
        selling_detail, rent_info = get_selling_type(soup)
        if not selling_detail:
            selling_detail = "Не продажа"
        images = get_photos(soup)
        description = get_description(soup)
        phone = get_seller_phone(url, soup)
        seller_name = get_seller_name(soup)
        date = get_date(soup)
        status = "Не указано"  # the site has no such information

        return [city, district, street, block_number, rent_info, price, cottage_type,
                total_area, comforts, selling_detail, images, description, date, phone, material,
                total_floors, land_area, status, seller_name]
    return None


def get_commercial_data(html, url):
    """Parse a commercial-property offer page; None for 'сниму' ads."""
    soup = BeautifulSoup(html, "lxml")

    title = get_title(soup)
    if "сниму" not in title.lower():
        address = ",".join(title.split(",")[1:]).strip()
        address = address[:address.rfind(" на карте")]
        if "(" in address:
            address = address[:address.rfind("(")]
462 | 463 | city = address.split(",")[-1].strip() 464 | district = address.split(",")[-2].strip() 465 | block_number = address.split(",")[-3].strip() 466 | street = address.split(",")[-4].strip() 467 | 468 | object_type, area = get_commercial_params(soup) 469 | price = get_commercial_price(soup) 470 | images = get_photos(soup) 471 | description = get_description(soup) 472 | phone = get_seller_phone(url, soup) 473 | date = get_date(soup) 474 | seller_name = get_seller_name(soup) 475 | office_class, furniture, entrance = ["Не указано"] * 3 476 | 477 | return [city, district, street, block_number, price, object_type, office_class, 478 | furniture, entrance, area, date, phone, images, description, seller_name] 479 | return None 480 | 481 | 482 | def get_land_data(html, url): 483 | soup = BeautifulSoup(html, "lxml") 484 | 485 | title = get_title(soup) 486 | if "сниму" not in title.lower(): 487 | address = ",".join(title.split(",")[1:]).strip() 488 | address = address[:address.rfind("(")].strip() 489 | 490 | city = address.split(",")[0] 491 | if len(address.split(",")) > 1: 492 | district = address.split(",")[1].strip() 493 | else: 494 | district = "Не указано" 495 | street = "Не указано" 496 | 497 | if city.lower() == "саратов": 498 | distance = "В черте города" 499 | else: 500 | distance = title[title.find("(") + 1:title.find(")")] 501 | 502 | area, land_type = get_land_params(soup) 503 | price = get_price(soup) 504 | images = get_photos(soup) 505 | description = get_description(soup) 506 | phone = get_seller_phone(url, soup) 507 | date = get_date(soup) 508 | seller_name = get_seller_name(soup) 509 | sell_type = "Продажа" 510 | deposit, seller_type = ["Не указано"] * 2 511 | 512 | return [city, district, street, sell_type, deposit, land_type, distance, area, price, seller_type, images, 513 | description, seller_name, phone, date] 514 | return None 515 | 516 | 517 | def crawl_page(first_offer, html, category, sell_type): 518 | global visited_urls, db 519 | soup = 
BeautifulSoup(html, "lxml") 520 | try: 521 | #offers = soup.find_all("a", class_="site3adv") + soup.find_all("a", class_="site3") 522 | offers = soup.find_all("a", class_="site3") 523 | except: 524 | offers = [] 525 | if offers is None or not offers: 526 | print("Парсинг завершен kvadrat") 527 | return True 528 | for offer in offers: 529 | try: 530 | url = "http://kvadrat64.ru/" + offer.get("href") 531 | if url in visited_urls: 532 | print("kvadrat not unique") 533 | time.sleep(random.uniform(5, 8)) 534 | continue 535 | else: 536 | visited_urls.append(url) 537 | #print(url) 538 | 539 | data = [] 540 | if category == "Квартиры": 541 | data = get_apartment_data(get_html(url), url) 542 | # записываем ключевую информацию, чтобы потом найти дубликаты 543 | with open("total_data.txt", "a", encoding="utf8") as file: 544 | file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url)) 545 | elif category == "Дома": 546 | data = get_cottage_data(get_html(url), url) 547 | with open("total_data.txt", "a", encoding="utf8") as file: 548 | file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url)) 549 | elif category == "Участки": 550 | data = get_land_data(get_html(url), url) 551 | with open("total_data.txt", "a", encoding="utf8") as file: 552 | file.write("%s--%s--%s--%s\n" % (data[2], data[5], data[7], url)) 553 | elif category == "Коммерческая_недвижимость": 554 | data = get_commercial_data(get_html(url), url) 555 | with open("total_data.txt", "a", encoding="utf8") as file: 556 | file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url)) 557 | 558 | if first_offer: 559 | # сохраняем самую первую запись как точку выхода 560 | modifier = "w" if (category == "Квартиры" and sell_type == "Продажа") else "a" 561 | with open("breakpoints/kvadrat.txt", modifier, encoding="utf8") as file: 562 | file.write("%s--%s\n" % (data[2], data[5])) 563 | first_offer = False 564 | 565 | key_info = (data[2], data[5]) 566 | 567 
| if any(x == key_info for x in [break_apartment_sell, break_apartment_rent, break_cottage_sell, 568 | break_cottage_rent, break_commercial_sell, break_commercial_rent, 569 | break_dacha_sell, break_saratov_land_sell, break_region_land_sell]): 570 | print("Парсинг завершен kvadrat") 571 | return True 572 | 573 | data.insert(4, sell_type) 574 | 575 | # на каком месте находится дата объявления 576 | index_of_date = -1 577 | if category == "Квартиры" or category == "Коммерческая_недвижимость": 578 | index_of_date = -5 579 | elif category == "Дома": 580 | index_of_date = -7 581 | elif category == "Участки": 582 | index_of_date = -1 583 | 584 | if data[index_of_date] != "Не указано" and data[index_of_date] < datetime.datetime.today() - datetime.timedelta(days=1): 585 | # сраниваем форматы datetime, чтобы знать, когда закончить парсинг 586 | print("Парсинг завершен kvadrat") 587 | return True 588 | else: 589 | # переводим в строковый формат 590 | data[index_of_date] = str(data[index_of_date]).split()[0] 591 | 592 | if data[0] != "Не указано" and data is not None: 593 | try: 594 | db.insert_data(category, data) 595 | except: 596 | db.close() 597 | db = DataBase() 598 | db.insert_data(category, data) 599 | print("parsed page kvadrat") 600 | 601 | #print(data) 602 | 603 | except Exception as e: 604 | with open("logs.txt", "a", encoding="utf8") as file: 605 | file.write(str(e) + " kvadrat crawl_page\n") 606 | 607 | time.sleep(random.uniform(5, 8)) 608 | 609 | 610 | def parse(category_url, category_name, sell_type): 611 | 612 | total_pages = get_total_pages(get_html(category_url)) 613 | 614 | for page in range(1, total_pages + 1): 615 | if (category_name == "Дома" and sell_type == "Продажа" and "sellzagbank" not in category_url) or category_name == "Участки": 616 | url = category_url.split("-") 617 | url_gen = "-".join(url[:2]) + "-" + str(page) + "-" + url[3] 618 | else: 619 | url_gen = category_url[:category_url.rfind("-") + 1] + str(page) + ".html" 620 | 621 | if page == 
1: 622 | completed = crawl_page(True, get_html(url_gen), category_name, sell_type) 623 | else: 624 | completed = crawl_page(False, get_html(url_gen), category_name, sell_type) 625 | if completed: 626 | break 627 | 628 | 629 | def main(): 630 | global visited_urls 631 | url_apartments_sell = "http://kvadrat64.ru/sellflatbank-50-1.html" 632 | parse(url_apartments_sell, "Квартиры", "Продажа") 633 | 634 | visited_urls = [] 635 | url_apartments_rent = "https://kvadrat64.ru/giveflatbank-50-1.html" 636 | parse(url_apartments_rent, "Квартиры", "Аренда") 637 | 638 | visited_urls = [] 639 | url_cottages_sell = "https://kvadrat64.ru/search-103-1-50664.html" 640 | parse(url_cottages_sell, "Дома", "Продажа") 641 | 642 | visited_urls = [] 643 | url_cottages_rent = "https://kvadrat64.ru/giveflatbank-9-1.html" 644 | parse(url_cottages_rent, "Дома", "Аренда") 645 | 646 | visited_urls = [] 647 | url_commercials_sell = "https://kvadrat64.ru/sellcombank-1000-1.html" 648 | parse(url_commercials_sell, "Коммерческая_недвижимость", "Продажа") 649 | 650 | visited_urls = [] 651 | url_commercials_rent = "https://kvadrat64.ru/givecombank-1000-1.html" 652 | parse(url_commercials_rent, "Коммерческая_недвижимость", "Аренда") 653 | 654 | visited_urls = [] 655 | url_dachas_sell = "https://kvadrat64.ru/sellzagbank-1000-1.html" 656 | parse(url_dachas_sell, "Дома", "Продажа") 657 | 658 | visited_urls = [] 659 | url_saratov_lands_sell = "https://kvadrat64.ru/search-41-1-24435.html" 660 | parse(url_saratov_lands_sell, "Участки", "Продажа") 661 | 662 | visited_urls = [] 663 | url_region_lands_sell = "https://kvadrat64.ru/search-412-1-24450.html" 664 | parse(url_region_lands_sell, "Участки", "Продажа") 665 | 666 | 667 | if __name__ == "__main__": 668 | main() 669 | db.close() 670 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import gc 3 | from 
multiprocessing import Process 4 | import os 5 | import datetime 6 | from database import DataBase 7 | 8 | t1, t2, t3, t4, t5, t6 = [None] * 6 9 | 10 | 11 | def cls(): 12 | os.system('cls' if os.name == 'nt' else 'clear') 13 | 14 | 15 | def main(): 16 | global t1, t2, t3, t4, t5, t6 17 | if all(p is not None for p in [t1, t2, t3, t4, t5, t6]): 18 | for p in [t1, t2, t3, t4, t5, t6]: 19 | if p.is_alive(): 20 | p.terminate() 21 | p.join() 22 | 23 | import avito_parsing 24 | import irr_parsing 25 | import kvadrat64_parsing 26 | import ya_realty_parsing 27 | import cian_parsing 28 | import youla_parsing 29 | 30 | cls() 31 | print("Job started", datetime.datetime.today()) 32 | 33 | db = DataBase() 34 | db.create_table("Квартиры") 35 | db.create_table("Дома") 36 | db.create_table("Коммерческая_недвижимость") 37 | db.create_table("Участки") 38 | db.create_table("Дубликаты") 39 | 40 | if os.path.isfile("logs.txt"): 41 | os.remove("logs.txt") 42 | 43 | total_data = {} 44 | try: 45 | if os.path.isfile("total_data.txt"): 46 | with open("total_data.txt", "r", encoding="utf8") as file: 47 | for line in file.readlines(): 48 | data = line.strip().split("--") 49 | params = tuple(data[:-1]) 50 | url = data[-1] 51 | total_data[params] = list(set(total_data.get(params, []) + [url])) 52 | 53 | for data in total_data: 54 | if all(x != "Не указано" for x in data): # avoid writing dummy records 55 | if len(total_data[data]) > 1: 56 | db.insert_data("Дубликаты", [", ".join(data), "\n".join(total_data[data])]) 57 | except Exception as e: 58 | print(e) 59 | 60 | if os.path.isfile("total_data.txt"): 61 | os.remove("total_data.txt") 62 | 63 | t1 = Process(target=ya_realty_parsing.main) 64 | t2 = Process(target=irr_parsing.main) 65 | t3 = Process(target=youla_parsing.main) 66 | t1.start() 67 | t2.start() 68 | t3.start() 69 | t1.join() 70 | t2.join() 71 | t3.join() 72 | 73 | t4 = Process(target=kvadrat64_parsing.main) 74 | t5 = Process(target=cian_parsing.main) 75 | t6 = 
Process(target=avito_parsing.main) 76 | t4.start() 77 | t5.start() 78 | t6.start() 79 | t4.join() 80 | t5.join() 81 | t6.join() 82 | 83 | db.close() 84 | gc.collect() 85 | print("Job finished", datetime.datetime.today()) 86 | 87 | 88 | if __name__ == '__main__': 89 | import schedule 90 | import time 91 | 92 | schedule.every().day.at("10:00").do(main) 93 | 94 | while True: 95 | schedule.run_pending() 96 | time.sleep(1) 97 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asn1crypto==0.24.0 2 | beautifulsoup4==4.6.1 3 | certifi==2018.4.16 4 | cffi==1.11.5 5 | chardet==3.0.4 6 | cryptography==3.2 7 | EasyProcess==0.2.3 8 | httmock==1.2.6 9 | http-request-randomizer==1.2.3 10 | idna==2.7 11 | mysql-connector-python==8.0.12 12 | Pillow==8.1.1 13 | protobuf==3.6.1 14 | psutil==5.6.6 15 | pycparser==2.18 16 | pyOpenSSL==18.0.0 17 | python-dateutil==2.7.3 18 | PyVirtualDisplay==0.2.1 19 | requests==2.20.0 20 | selenium==3.14.0 21 | six==1.11.0 22 | urllib3==1.24.2 23 | xvfbwrapper==0.2.9 -------------------------------------------------------------------------------- /ya_realty_parsing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | import random 7 | from fake_useragent import UserAgent 8 | import datetime 9 | from selenium import webdriver 10 | from xvfbwrapper import Xvfb 11 | from selenium.webdriver.chrome.options import Options 12 | from database import DataBase 13 | 14 | # на каких записях останавливаться 15 | with open("breakpoints/ya.txt", "r", encoding="utf8") as file: 16 | breakpoints = file.readlines() 17 | try: 18 | break_apartment_sell = tuple(breakpoints[0].strip().split("--")) 19 | except: 20 | break_apartment_sell = None 21 | try: 22 | break_apartment_rent = 
tuple(breakpoints[1].strip().split("--")) 23 | except: 24 | break_apartment_rent = None 25 | try: 26 | break_cottage_sell = tuple(breakpoints[2].strip().split("--")) 27 | except: 28 | break_cottage_sell = None 29 | try: 30 | break_cottage_rent = tuple(breakpoints[3].strip().split("--")) 31 | except: 32 | break_cottage_rent = None 33 | try: 34 | break_commercial_sell = tuple(breakpoints[4].strip().split("--")) 35 | except: 36 | break_commercial_sell = None 37 | try: 38 | break_commercial_rent = tuple(breakpoints[5].strip().split("--")) 39 | except: 40 | break_commercial_rent = None 41 | 42 | # defining chrome options for selenium 43 | options = Options() 44 | options.add_argument("--no-sandbox") 45 | 46 | db = DataBase() 47 | visited_urls = [] 48 | 49 | 50 | def transform_date(date): 51 | """ 52 | Преобразуем дату, чтобы сравнить datetime-объекты 53 | """ 54 | day, month, year = date.split() 55 | months = { 56 | "января": 1, 57 | "февраля": 2, 58 | "марта": 3, 59 | "апреля": 4, 60 | "мая": 5, 61 | "июня": 6, 62 | "июля": 7, 63 | "августа": 8, 64 | "сентября": 9, 65 | "октября": 10, 66 | "ноября": 11, 67 | "декабря": 12 68 | } 69 | 70 | date = datetime.datetime(int(year), months[month], int(day)) 71 | return date 72 | 73 | 74 | def get_html(url): 75 | req = requests.get(url, headers={"User-Agent": UserAgent().chrome, "Referer": url, 76 | "Accept-Language": "ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7", 77 | "Connection": "keep-alive", "Origin": "https://realty.yandex.ru", 78 | "DNT": "1"}) 79 | return req.text.encode(req.encoding) 80 | 81 | 82 | def get_title(soup): 83 | try: 84 | title = soup.find("h1", class_="offer-card__header-text").text.strip() 85 | except Exception as e: 86 | with open("logs.txt", "a", encoding="utf8") as file: 87 | file.write(str(e) + " ya get_title\n") 88 | title = "Не указано" 89 | return title 90 | 91 | 92 | def get_address(soup): 93 | try: 94 | address = soup.find("h2", class_="offer-card__address ellipsis").text.strip() 95 | # separating data 
from the address string 96 | district, street = "Не указано", "Не указано" 97 | city = address.split(",")[0] 98 | block_number = address.split(",")[-1].strip() 99 | if "ул " in block_number.lower() or "ул." in block_number.lower() or "улица" in block_number.lower() \ 100 | or " пер" in block_number.lower() or "проспект" in block_number.lower() or "проезд" in block_number.lower(): 101 | street = block_number 102 | block_number = "Не указано" 103 | 104 | for param in address.split(",")[1:-1]: 105 | if "ул " in param.lower() or "ул." in param.lower() or "улица" in param.lower() \ 106 | or " пер" in param.lower() or "проспект" in param.lower() or "проезд" in param.lower(): 107 | street = param.strip() 108 | elif "район" in param.lower() or "р-н" in param.lower(): 109 | district = param.strip() 110 | 111 | if street.split()[-1].strip().isdigit(): 112 | block_number = street.split()[-1].strip() 113 | street = " ".join(street.split()[:-1]).strip() 114 | 115 | return city, district, street, block_number 116 | 117 | except Exception as e: 118 | with open("logs.txt", "a", encoding="utf8") as file: 119 | file.write(str(e) + " ya get_address\n") 120 | return ["Не указано"] * 4 121 | 122 | 123 | def get_block_type(soup): 124 | try: 125 | block_type = soup.find("div", class_="offer-card__building-type") 126 | if block_type is None: 127 | block_type = "Вторичка" 128 | else: 129 | block_type = block_type.text.strip() 130 | except Exception as e: 131 | with open("logs.txt", "a", encoding="utf8") as file: 132 | file.write(str(e) + " ya get_block_type\n") 133 | block_type = "Не указано" 134 | return block_type 135 | 136 | 137 | def get_price(soup): 138 | try: 139 | price = soup.find("h3", class_="offer-price offer-card__price offer-card__price").text.strip() 140 | except Exception as e: 141 | with open("logs.txt", "a", encoding="utf8") as file: 142 | file.write(str(e) + " ya get_price\n") 143 | price = "Не указано" 144 | return price 145 | 146 | 147 | def get_selling_type(soup): 148 
| try: 149 | selling_type = soup.find("div", class_="offer-card__terms").text.strip() 150 | except Exception as e: 151 | with open("logs.txt", "a", encoding="utf8") as file: 152 | file.write(str(e) + " ya get_selling_type\n") 153 | selling_type = "Не указано" 154 | return selling_type 155 | 156 | 157 | def get_seller_type(soup): 158 | try: 159 | seller_type = soup.find("div", class_="offer-card__author-note").text.strip() 160 | except Exception as e: 161 | with open("logs.txt", "a", encoding="utf8") as file: 162 | file.write(str(e) + " ya get_seller_type\n") 163 | seller_type = "Не указано" 164 | return seller_type 165 | 166 | 167 | def get_seller_name(soup): 168 | try: 169 | name = soup.find("div", class_="offer-card__author-name").text.strip() 170 | except: 171 | name = "Не указано" 172 | return name 173 | 174 | 175 | def get_photos(soup): 176 | try: 177 | images = [] 178 | images_list = soup.find("div", class_="offer-card__photos-wrapper").find_all("a") 179 | for image in images_list: 180 | link = "https://realty.yandex.ru" + image.get("href") 181 | images.append(link) 182 | images = "\n".join(images) 183 | except Exception as e: 184 | with open("logs.txt", "a", encoding="utf8") as file: 185 | file.write(str(e) + " ya get_photos\n") 186 | images = "Не указано" 187 | return images 188 | 189 | 190 | def get_description(soup): 191 | try: 192 | description = soup.find("div", class_="offer-card__desc-text").text.strip() 193 | except Exception as e: 194 | with open("logs.txt", "a", encoding="utf8") as file: 195 | file.write(str(e) + " ya get_description\n") 196 | description = "Не указано" 197 | return description 198 | 199 | 200 | def get_date(soup, which_page): 201 | # 0 - page with offers, 1 - offer itself 202 | try: 203 | if which_page == 0: 204 | date = soup.find("div", class_="OffersSerpItem__publish-date").text.strip() 205 | else: 206 | date = soup.find("div", class_="offer-card__lot-date").text.strip() 207 | if "назад" in date: 208 | time_passed = 
int(date.split()[0]) 209 | if "минут" in date: 210 | date = str(datetime.datetime.today() - datetime.timedelta(minutes=time_passed)).split()[0] 211 | elif "часов" in date or "часа" in date or "час" in date: 212 | date = str(datetime.datetime.today() - datetime.timedelta(hours=time_passed)).split()[0] 213 | elif "сейчас" in date: 214 | date = str(datetime.datetime.today()).split()[0] 215 | elif date == "вчера": 216 | date = str(datetime.datetime.today() - datetime.timedelta(days=1)).split()[0] 217 | elif len(date.split()) >= 3: 218 | transformed_date = transform_date(date) 219 | days_passed = str(datetime.datetime.today() - transformed_date).split()[0] 220 | if int(days_passed) > 1: 221 | date = "too old" 222 | except Exception as e: 223 | with open("logs.txt", "a", encoding="utf8") as file: 224 | file.write(str(e) + " ya get_date\n") 225 | date = "Не указано" 226 | return date 227 | 228 | 229 | def get_seller_phone(url): 230 | phone = "Не указано" 231 | try: 232 | vdisplay = Xvfb() 233 | vdisplay.start() 234 | driver = webdriver.Chrome(options=options) 235 | driver.set_window_size(1920, 1080) 236 | driver.get(url) 237 | 238 | button = driver.find_element_by_xpath("/html/body/div[1]/div[2]/div/div[2]/div[2]/div[2]/div/div[1]/div[3]/div[1]/span/button") 239 | button.click() 240 | time.sleep(2) 241 | phone = driver.find_element_by_xpath('//div[@class="helpful-info__contact-phones-string"]').text 242 | driver.quit() 243 | vdisplay.stop() 244 | except Exception as e: 245 | with open("logs.txt", "a", encoding="utf8") as file: 246 | file.write(str(e) + " ya get_seller_phone\n") 247 | return phone 248 | 249 | 250 | def get_apartment_params(soup): 251 | rooms_number, floor, total_floors, total_area, material, year, kitchen_area, living_area = ["Не указано"] * 8 252 | try: 253 | params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-name")] 254 | values = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-value")] 255 | for 
i in range(len(params)): 256 | if "Количество комнат" in params[i]: 257 | rooms_number = values[i] 258 | elif "Год постройки" in params[i]: 259 | year = values[i] 260 | elif "Этаж" in params[i]: 261 | floor, total_floors = values[i].split(" из ") 262 | elif "Общая площадь" in params[i]: 263 | total_area = values[i] 264 | elif "Кухня" in params[i]: 265 | total_area = values[i] 266 | elif "Жилая" in params[i]: 267 | total_area = values[i] 268 | elif "Тип здания" in params[i]: 269 | material = values[i] 270 | 271 | if year == "Не указано": 272 | new_block_params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__site-subtitle-item")] 273 | for param in new_block_params: 274 | if "строится" in param: 275 | year = param 276 | break 277 | 278 | if year == "Не указано": 279 | new_params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-note")] 280 | values = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-title")] 281 | for i in range(len(new_params)): 282 | if "год постройки" in new_params[i]: 283 | year = values[i] 284 | break 285 | 286 | except Exception as e: 287 | with open("logs.txt", "a", encoding="utf8") as file: 288 | file.write(str(e) + " ya get_apartment_params\n") 289 | return rooms_number, floor, total_floors, total_area, material, year, kitchen_area, living_area 290 | 291 | 292 | def get_cottage_params(soup): 293 | total_area, land_area, comforts, year, material, total_floors, land_status = ["Не указано"] * 7 294 | try: 295 | params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-name")] 296 | values = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-value")] 297 | for i in range(len(params)): 298 | if "Год постройки" in params[i]: 299 | year = values[i] 300 | elif "Общая площадь" in params[i]: 301 | total_area = values[i] 302 | elif "Площадь участка" in params[i]: 303 | land_area = values[i] 304 | elif "Тип дома" in params[i]: 
305 | material = values[i] 306 | elif "Количество этажей" in params[i]: 307 | total_floors = values[i] 308 | elif "Тип участка" in params[i]: 309 | land_status = values[i] 310 | elif any(x in params[i].lower() for x in ["отапливаемый", "отопление", "водопровод", "канализация", 311 | "электроснабжение", "свет", "газ", "вода", "интернет", 312 | "телефон", "душ"]): 313 | if comforts == "Не указано": 314 | comforts = params[i].strip() 315 | else: 316 | comforts += "; " + params[i].strip() 317 | 318 | if year == "Не указано": 319 | new_block_params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__site-subtitle-item")] 320 | for param in new_block_params: 321 | if "строится" in param: 322 | year = param 323 | except Exception as e: 324 | with open("logs.txt", "a", encoding="utf8") as file: 325 | file.write(str(e) + " ya get_cottage_params\n") 326 | return total_area, land_area, comforts, year, material, total_floors, land_status 327 | 328 | 329 | def get_commercial_params(soup): 330 | entrance, furniture, additions, area = ["Не указано"] * 4 331 | try: 332 | params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-name")] 333 | values = [x.text.strip() for x in soup.find_all("div", class_="offer-card__feature-value")] 334 | for i in range(len(params)): 335 | if "Мебель" in params[i]: 336 | furniture = values[i] 337 | elif "Вход" in params[i]: 338 | entrance = values[i] 339 | elif any(x in params[i].lower() for x in ["кондиционер", "интернет", "пожарная сигнализация", 340 | "вентиляция", "охраняемая парковка", "сигнализация", "лифт"])\ 341 | and values[i].strip() == "да": 342 | if additions == "Не указано": 343 | additions = params[i].strip() 344 | else: 345 | additions += "; " + params[i].strip() 346 | 347 | new_params = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-note")] 348 | values = [x.text.strip() for x in soup.find_all("div", class_="offer-card__main-feature-title")] 349 | for j in 
range(len(new_params)): 350 | if "общая" in new_params[j]: 351 | area = values[j] 352 | break 353 | except Exception as e: 354 | with open("logs.txt", "a", encoding="utf8") as file: 355 | file.write(str(e) + " ya get_commercial_params\n") 356 | return entrance, furniture, additions, area 357 | 358 | 359 | def get_apartment_data(html, url): 360 | soup = BeautifulSoup(html, "lxml") 361 | 362 | #title = get_title(soup) 363 | city, district, street, block_number = get_address(soup) 364 | block_type = get_block_type(soup) 365 | price = get_price(soup) 366 | rooms_number, floor, total_floors, total_area, material, year, kitchen_area, living_area = get_apartment_params(soup) 367 | selling_detail = get_selling_type(soup) 368 | if "продажа" in selling_detail.lower() or "ипотека" in selling_detail.lower(): 369 | rent_info = "Не аренда" 370 | else: 371 | rent_info = selling_detail 372 | selling_detail = "Не указано" 373 | 374 | #seller_type = get_seller_type(soup) 375 | images = get_photos(soup) 376 | description = get_description(soup) 377 | phone = get_seller_phone(url) 378 | date = get_date(soup, 1) 379 | 380 | return [city, district, street, block_number, rent_info, price, block_type, 381 | rooms_number, total_area, total_floors, material, selling_detail, images, 382 | description, date, phone, kitchen_area, living_area, floor] 383 | 384 | 385 | def get_cottage_data(html, url): 386 | soup = BeautifulSoup(html, "lxml") 387 | 388 | title = get_title(soup) 389 | city, district, street, block_number = get_address(soup) 390 | cottage_type = title.split(",")[0] 391 | price = get_price(soup) 392 | total_area, land_area, comforts, year, material, total_floors, land_status = get_cottage_params(soup) 393 | selling_detail = get_selling_type(soup) 394 | if "продажа" in selling_detail.lower() or "ипотека" in selling_detail.lower(): 395 | rent_info = "Не аренда" 396 | else: 397 | rent_info = selling_detail 398 | selling_detail = "Не указано" 399 | 400 | images = get_photos(soup) 401 | 
description = get_description(soup) 402 | phone = get_seller_phone(url) 403 | date = get_date(soup, 1) 404 | seller_name = get_seller_name(soup) 405 | 406 | return [city, district, street, block_number, rent_info, price, cottage_type, 407 | total_area, comforts, selling_detail, images, description, date, phone, material, 408 | total_floors, land_area, land_status, seller_name] 409 | 410 | 411 | def get_commercial_data(html, url): 412 | soup = BeautifulSoup(html, "lxml") 413 | 414 | title = get_title(soup) 415 | city, district, street, block_number = get_address(soup) 416 | price = get_price(soup) 417 | object_type = title.split(",")[0] 418 | entrance, furniture, additions, area = get_commercial_params(soup) 419 | phone = get_seller_phone(url) 420 | images = get_photos(soup) 421 | description = get_description(soup) 422 | seller_name = get_seller_name(soup) 423 | date = get_date(soup, 1) 424 | office_class = "Не указано" 425 | 426 | return [city, district, street, block_number, price, object_type, office_class, 427 | furniture, entrance, area, date, phone, images, description, seller_name] 428 | 429 | 430 | def crawl_page(first_offer, html, category, sell_type): 431 | global visited_urls, db 432 | soup = BeautifulSoup(html, "lxml") 433 | # так как пагинация динамическая и мы не можем получить число страниц, проверяем, есть ли на странице объявления 434 | try: 435 | offers = soup.find("ol", class_="OffersSerp__list").find_all("li", class_="OffersSerp__list-item_type_offer") 436 | except: 437 | offers = [] 438 | if offers is None or not offers: 439 | print("Парсинг завершен ya") 440 | return True 441 | k = 0 442 | for offer in offers: 443 | try: 444 | date = get_date(soup, 0) 445 | if date == "too old": 446 | print("Парсинг завершен ya") 447 | return True 448 | 449 | url = "https://realty.yandex.ru" + offer.find("a", class_="OffersSerpItem__link").get("href") 450 | if url in visited_urls: 451 | print("ya not unique") 452 | time.sleep(random.uniform(10, 15)) 453 | 
continue 454 | else: 455 | visited_urls.append(url) 456 | #print(url) 457 | 458 | data = [] 459 | if category == "Квартиры": 460 | data = get_apartment_data(get_html(url), url) 461 | # записываем ключевую информацию, чтобы потом найти дубликаты 462 | with open("total_data.txt", "a", encoding="utf8") as file: 463 | file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url)) 464 | elif category == "Дома": 465 | data = get_cottage_data(get_html(url), url) 466 | with open("total_data.txt", "a", encoding="utf8") as file: 467 | file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url)) 468 | elif category == "Коммерческая_недвижимость": 469 | data = get_commercial_data(get_html(url), url) 470 | with open("total_data.txt", "a", encoding="utf8") as file: 471 | file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[6], data[10], url)) 472 | 473 | if first_offer: 474 | # сохраняем самую первую запись как точку выхода 475 | modifier = "w" if (category == "Квартиры" and sell_type == "Продажа") else "a" 476 | with open("breakpoints/ya.txt", modifier, encoding="utf8") as file: 477 | file.write("%s--%s\n" % (data[2], data[5])) 478 | first_offer = False 479 | 480 | key_info = (data[2], data[5]) 481 | 482 | if any(x == key_info for x in [break_apartment_sell, break_apartment_rent, break_cottage_sell, 483 | break_cottage_rent, break_commercial_sell, break_commercial_rent]): 484 | print("Парсинг завершен ya") 485 | return True 486 | 487 | data.insert(4, sell_type) 488 | #print(*data, sep="\n") 489 | #print("--------------------------------------") 490 | if data[0] != "Не указано": 491 | try: 492 | db.insert_data(category, data) 493 | except: 494 | db.close() 495 | db = DataBase() 496 | db.insert_data(category, data) 497 | print("parsed page ya") 498 | 499 | 500 | except Exception as e: 501 | with open("logs.txt", "a", encoding="utf8") as file: 502 | file.write(str(e) + " ya crawl_page\n") 503 | #print(e) 504 | #print("Ошибка в 
crawl_page") 505 | 506 | k += 1 507 | if k % 5 == 0: # после каждого пятого запроса, делаем паузу побольше 508 | time.sleep(100) 509 | else: 510 | time.sleep(random.uniform(10, 15)) 511 | 512 | 513 | def parse(category_url, category_name, sell_type): 514 | completed = False 515 | page = 0 516 | while not completed: 517 | url_gen = category_url[:category_url.rfind("=") + 1] + str(page) 518 | if page == 0: 519 | completed = crawl_page(True, get_html(url_gen), category_name, sell_type) 520 | else: 521 | completed = crawl_page(False, get_html(url_gen), category_name, sell_type) 522 | page += 1 523 | 524 | 525 | def main(): 526 | global visited_urls 527 | url_apartments_sell = "https://realty.yandex.ru/saratovskaya_oblast/kupit/kvartira/?sort=DATE_DESC&page=0" 528 | parse(url_apartments_sell, "Квартиры", "Продажа") 529 | 530 | visited_urls = [] 531 | url_apartments_rent = "https://realty.yandex.ru/saratovskaya_oblast/snyat/kvartira/?sort=DATE_DESC&page=0" 532 | parse(url_apartments_rent, "Квартиры", "Аренда") 533 | 534 | visited_urls = [] 535 | url_cottages_sell = "https://realty.yandex.ru/saratovskaya_oblast/kupit/dom/?sort=DATE_DESC&page=0" 536 | parse(url_cottages_sell, "Дома", "Продажа") 537 | 538 | visited_urls = [] 539 | url_cottages_rent = "https://realty.yandex.ru/saratovskaya_oblast/snyat/dom/?sort=DATE_DESC&page=0" 540 | parse(url_cottages_rent, "Дома", "Аренда") 541 | 542 | visited_urls = [] 543 | url_commercials_sell = "https://realty.yandex.ru/saratovskaya_oblast/kupit/kommercheskaya-nedvizhimost/?sort=DATE_DESC&page=0" 544 | parse(url_commercials_sell, "Коммерческая_недвижимость", "Продажа") 545 | 546 | visited_urls = [] 547 | url_commercials_rent = "https://realty.yandex.ru/saratovskaya_oblast/snyat/kommercheskaya-nedvizhimost/?sort=DATE_DESC&page=0" 548 | parse(url_commercials_rent, "Коммерческая_недвижимость", "Аренда") 549 | 550 | 551 | if __name__ == "__main__": 552 | main() 553 | db.close() 554 | 
-------------------------------------------------------------------------------- /youla_parsing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | from bs4 import BeautifulSoup 5 | import time 6 | import random 7 | from fake_useragent import UserAgent 8 | import datetime 9 | from selenium import webdriver 10 | from xvfbwrapper import Xvfb 11 | from selenium.webdriver.chrome.options import Options 12 | from database import DataBase 13 | 14 | db = DataBase() 15 | visited_urls = [] 16 | 17 | # defining chrome options for selenium 18 | options = Options() 19 | options.add_argument("--no-sandbox") 20 | 21 | 22 | def get_html(url): 23 | req = requests.get(url, headers={"User-Agent": UserAgent().chrome}) 24 | return req.text.encode(req.encoding) 25 | 26 | 27 | def get_date(html, k): 28 | soup = BeautifulSoup(html, "lxml") 29 | 30 | try: 31 | date = soup.find_all("span", class_="hidden-xs")[k].text.strip() 32 | if "сегодня" in date: 33 | return str(datetime.datetime.today()).split()[0] 34 | elif "вчера" in date: 35 | return str(datetime.datetime.today() - datetime.timedelta(days=1)).split()[0] 36 | else: 37 | return "too old" 38 | except Exception as e: 39 | date = "Не указано" 40 | with open("logs.txt", "a", encoding="utf8") as file: 41 | file.write(str(e) + " youla get_date\n") 42 | return date 43 | 44 | 45 | def get_category(html, k): 46 | soup = BeautifulSoup(html, "lxml") 47 | 48 | try: 49 | title = soup.find_all("div", class_="product_item__title")[k].text.split(",")[0].strip() 50 | if "Квартира" in title: 51 | return "Квартира" 52 | elif "Дом" in title: 53 | return "Дом" 54 | elif "Коттедж" in title: 55 | return "Коттедж" 56 | elif "Таунхаус" in title: 57 | return "Таунхаус" 58 | elif "Дача" in title: 59 | return "Дача" 60 | elif "Участок" in title: 61 | return "Участок" 62 | except Exception as e: 63 | with open("logs.txt", "a", encoding="utf8") as file: 64 | file.write(str(e) 
+ " youla get_category\n") 65 | return None 66 | 67 | 68 | def get_address(driver): 69 | try: 70 | address = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[0].find_elements_by_tag_name("span")[0].text.strip() 71 | # separating data from the address string 72 | district, street = "Не указано", "Не указано" 73 | city = address.split(",")[0] 74 | block_number = address.split(",")[-1].strip() 75 | if "ул " in block_number.lower() or "ул." in block_number.lower() or "улица" in block_number.lower() \ 76 | or " пер" in block_number.lower() or "проспект" in block_number.lower() or "проезд" in block_number.lower(): 77 | street = block_number 78 | block_number = "Не указано" 79 | 80 | for param in address.split(",")[1:-1]: 81 | if "ул " in param.lower() or "ул." in param.lower() or "улица" in param.lower() \ 82 | or " пер" in param.lower() or "проспект" in param.lower() or "проезд" in param.lower(): 83 | street = param.strip() 84 | elif "район" in param.lower() or "р-н" in param.lower(): 85 | district = param.strip() 86 | 87 | if street.split()[-1].strip().isdigit(): 88 | block_number = street.split()[-1].strip() 89 | if block_number == "unnamed road": 90 | block_number = "Не указано" 91 | street = " ".join(street.split()[:-1]).strip() 92 | 93 | return city, district, street, block_number 94 | except Exception as e: 95 | with open("logs.txt", "a", encoding="utf8") as file: 96 | file.write(str(e) + " youla get_address\n") 97 | return ["Не указано"] * 4 98 | 99 | 100 | def get_selling_type(url): 101 | sell_type, rent_info = "Не указано", "Не указано" 102 | if "prodaja" in url: 103 | sell_type = "Продажа" 104 | elif "arenda" in url: 105 | if "posutochno" in url: 106 | sell_type = "Аренда" 107 | rent_info = "посуточно" 108 | else: 109 | sell_type = "Аренда" 110 | rent_info = "длительный срок" 111 | return sell_type, rent_info 112 | 113 | 114 | def get_price(driver): 115 | try: 116 | price = 
driver.find_element_by_css_selector("div[class='sticky-inner-wrapper']").find_element_by_tag_name("span").text.strip() 117 | except Exception as e: 118 | with open("logs.txt", "a", encoding="utf8") as file: 119 | file.write(str(e) + " youla get_price\n") 120 | price = "Не указано" 121 | return price 122 | 123 | 124 | def get_seller_info(driver): 125 | seller_type, seller_name = "Не указано", "Не указано" 126 | try: 127 | block = driver.find_element_by_css_selector("div[data-test-component='ProductOwner']").find_element_by_tag_name("div") 128 | seller_name = block.find_element_by_tag_name("a").text.strip() 129 | seller_name = seller_name[:seller_name.rfind("(")] 130 | seller_type = block.find_element_by_tag_name("div").text.strip() 131 | except Exception as e: 132 | with open("logs.txt", "a", encoding="utf8") as file: 133 | file.write(str(e) + " youla get_seller_info\n") 134 | return seller_type, seller_name 135 | 136 | 137 | def get_photos(driver): 138 | try: 139 | images = "\n".join([x.get_attribute("src") for x in driver.find_elements_by_tag_name("div") 140 | if x.get_attribute("src") is not None]) 141 | if not images: 142 | images = driver.find_element_by_css_selector("div[data-test-component='ProductGallery']").find_element_by_tag_name("img").get_attribute("src") 143 | except Exception as e: 144 | images = "Не указано" 145 | with open("logs.txt", "a", encoding="utf8") as file: 146 | file.write(str(e) + " youla get_photos\n") 147 | return images 148 | 149 | 150 | def get_description(driver): 151 | try: 152 | description = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[1].find_element_by_tag_name("td").text.strip() 153 | except Exception as e: 154 | description = "Не указано" 155 | with open("logs.txt", "a", encoding="utf8") as file: 156 | file.write(str(e) + " youla get_description\n") 157 | return description 158 | 159 | 160 | def get_seller_phone(driver): 161 | try: 162 | button = 
driver.find_element_by_css_selector("button[data-test-action='PhoheNumberClick']") 163 | button.click() 164 | time.sleep(3) 165 | phone = driver.find_element_by_xpath('//*[@id="app"]/div[2]/div[10]/div/div/div/div[2]/div[2]/div/a').text.strip() 166 | except Exception as e: 167 | phone = "Не указано" 168 | with open("logs.txt", "a", encoding="utf8") as file: 169 | file.write(str(e) + " youla get_seller_phone\n") 170 | return phone 171 | 172 | 173 | def get_apartment_params(driver): 174 | material, lift, year, rooms_number, floor, total_floors, total_area, kitchen_area, repair = ["Не указано"] * 9 175 | try: 176 | expand = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_element_by_tag_name("div") 177 | expand.click() 178 | params = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("th") 179 | values = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("td") 180 | for i in range(len(params)): 181 | if "Комнат в квартире" in params[i].text.strip(): 182 | rooms_number = values[i].text.strip() 183 | elif "Общая площадь" in params[i].text.strip(): 184 | total_area = values[i].text.strip() 185 | elif "Этаж" in params[i].text.strip(): 186 | floor = values[i].text.strip().split()[0] 187 | elif "Этажность дома" in params[i].text.strip(): 188 | total_floors = values[i].text.strip() 189 | elif "Площадь кухни" in params[i].text.strip(): 190 | kitchen_area = values[i].text.strip() 191 | elif "Ремонт" in params[i].text.strip(): 192 | repair = values[i].text.strip() 193 | elif "Лифт" in params[i].text.strip(): 194 | lift = values[i].text.strip() 195 | elif "Тип дома" in params[i].text.strip(): 196 | material = values[i].text.strip() 197 | elif "Год постройки" in params[i].text.strip(): 198 | year = values[i].text.strip() 199 | except Exception as e: 200 | with open("logs.txt", "a", encoding="utf8") as file: 201 | file.write(str(e) + " 
youla get_apartment_params\n") 202 | return material, lift, year, rooms_number, floor, total_floors, total_area, kitchen_area, repair 203 | 204 | 205 | def get_cottage_params(driver): 206 | total_area, material, total_floors, bedrooms, land_area, status, comforts = ["Не указано"] * 7 207 | try: 208 | expand = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_element_by_tag_name("div") 209 | expand.click() 210 | params = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("th") 211 | values = driver.find_element_by_tag_name("table").find_elements_by_tag_name("tbody")[2].find_elements_by_tag_name("td") 212 | for i in range(len(params)): 213 | if "Площадь дома" in params[i].text.strip(): 214 | total_area = values[i].text.strip() 215 | elif "Материал дома" in params[i].text.strip(): 216 | material = values[i].text.strip() 217 | elif "Количество спален" in params[i].text.strip(): 218 | bedrooms = values[i].text.strip() 219 | elif "Площадь участка" in params[i].text.strip(): 220 | land_area = values[i].text.strip() 221 | elif "Этажей" in params[i].text.strip(): 222 | total_floors = values[i].text.strip() 223 | elif "Тип участка" in params[i].text.strip(): 224 | status = values[i].text.strip() 225 | elif any(x in params[i].text.strip() for x in ["Электричество", "Газ", "Водоснабжение", "Отопление", "Гараж", "Санузлы"]): 226 | if comforts == "Не указано": 227 | comforts = params[i].text.strip() + " - " + values[i].text.strip().lower() + "; " 228 | else: 229 | comforts += params[i].text.strip() + " - " + values[i].text.strip().lower() + "; " 230 | except Exception as e: 231 | with open("logs.txt", "a", encoding="utf8") as file: 232 | file.write(str(e) + " youla get_cottage_params\n") 233 | return total_area, material, total_floors, bedrooms, land_area, status, comforts 234 | 235 | 236 | def get_apartment_data(url): 237 | vdisplay = Xvfb() 238 | vdisplay.start() 239 | driver = 
webdriver.Chrome(options=options) 240 | driver.set_window_size(1920, 1080) 241 | driver.get(url) 242 | 243 | city, district, street, block_number = get_address(driver) 244 | sell_type, rent_info = get_selling_type(url) 245 | if "продажа" in sell_type.lower(): 246 | rent_info = "Не аренда" 247 | material, lift, year, rooms_number, floor, total_floors, total_area, kitchen_area, repair = get_apartment_params(driver) 248 | block_type = "Вторичка" 249 | living_area = "Не указано" 250 | price = get_price(driver) 251 | if "Аренда" in sell_type: 252 | if "posutochno" in url: 253 | price += "/день" 254 | else: 255 | price += "/мес." 256 | #seller_type, seller_name = get_seller_info(driver) 257 | images = get_photos(driver) 258 | description = get_description(driver) 259 | phone = get_seller_phone(driver) 260 | selling_detail = "Не указано" 261 | 262 | driver.quit() 263 | vdisplay.stop() 264 | 265 | return [city, district, street, block_number, sell_type, rent_info, price, block_type, 266 | rooms_number, total_area, total_floors, material, selling_detail, images, 267 | description, phone, kitchen_area, living_area, floor] 268 | 269 | 270 | def get_cottage_data(url, category): 271 | vdisplay = Xvfb() 272 | vdisplay.start() 273 | driver = webdriver.Chrome(options=options) 274 | driver.set_window_size(1920, 1080) 275 | driver.get(url) 276 | 277 | if "doma" in url: 278 | cottage_type = "Дом" 279 | elif "uchastka" in url: 280 | cottage_type = "Участок" 281 | else: 282 | cottage_type = "Не указано" 283 | 284 | city, district, street, block_number = get_address(driver) 285 | sell_type, rent_info = get_selling_type(url) 286 | if "продажа" in sell_type.lower(): 287 | rent_info = "Не аренда" 288 | price = get_price(driver) 289 | if "Аренда" in sell_type: 290 | if "posutochno" in url: 291 | price += "/день" 292 | else: 293 | price += "/мес." 
294 | total_area, material, total_floors, bedrooms, land_area, status, comforts = get_cottage_params(driver) 295 | _, seller_name = get_seller_info(driver) 296 | images = get_photos(driver) 297 | description = get_description(driver) 298 | phone = get_seller_phone(driver) 299 | selling_detail = "Не указано" 300 | 301 | driver.quit() 302 | vdisplay.stop() 303 | 304 | if category == "Участок": 305 | material, total_floors = "Участок", "Участок" 306 | 307 | return [city, district, street, block_number, sell_type, rent_info, price, cottage_type, 308 | total_area, comforts, selling_detail, images, description, phone, material, 309 | total_floors, land_area, status, seller_name] 310 | 311 | 312 | def crawl_page(html): 313 | global visited_urls, db 314 | soup = BeautifulSoup(html, "lxml") 315 | # так как пагинация динамическая и мы не можем получить число страниц, проверяем, есть ли на странице объявления 316 | offers = soup.find_all("li", class_="product_item") 317 | if offers is None or not offers: 318 | print("Парсинг завершен youla") 319 | return True 320 | k = 0 321 | for offer in offers: 322 | try: 323 | category = get_category(html, k) 324 | date = get_date(html, k) 325 | if date == "too old" and len(offer.get("class")) == 1: 326 | print("Парсинг завершен youla") 327 | return True 328 | elif date == "too old": 329 | date = str(datetime.datetime.today() - datetime.timedelta(days=2)).split()[0] 330 | k += 1 331 | url = "https://youla.ru" + offer.find("a").get("href") 332 | if url in visited_urls: 333 | print("youla not unique") 334 | time.sleep(random.uniform(10, 15)) 335 | continue 336 | else: 337 | visited_urls.append(url) 338 | #print(url) 339 | 340 | if category is None or "saratov" not in url: 341 | time.sleep(random.uniform(5, 8)) 342 | continue 343 | 344 | data = [] 345 | if category == "Квартира": 346 | data = get_apartment_data(url) 347 | data.insert(15, date) 348 | if data[0] != "Не указано": 349 | try: 350 | db.insert_data("Квартиры", data) 351 | except: 
352 | db.close() 353 | db = DataBase() 354 | db.insert_data("Квартиры", data) 355 | with open("total_data.txt", "a", encoding="utf8") as file: 356 | file.write("%s--%s--%s--%s--%s--%s\n" % (data[2], data[3], data[4], data[8], data[-1], url)) 357 | elif any(x in category for x in ["Дом", "Коттедж", "Таунхаус", "Дача", "Участок"]): 358 | data = get_cottage_data(url, category) 359 | data.insert(13, date) 360 | if data[0] != "Не указано": 361 | try: 362 | db.insert_data("Дома", data) 363 | except: 364 | db.close() 365 | db = DataBase() 366 | db.insert_data("Дома", data) 367 | with open("total_data.txt", "a", encoding="utf8") as file: 368 | file.write("%s--%s--%s--%s--%s\n" % (data[2], data[3], data[7], data[8], url)) 369 | 370 | #print(*data, sep="\n") 371 | #print("--------------------------------------") 372 | print("parsed page youla") 373 | 374 | except Exception as e: 375 | with open("logs.txt", "a", encoding="utf8") as file: 376 | file.write(str(e) + " youla crawl_page\n") 377 | #print(e) 378 | #print("Ошибка в crawl_page") 379 | 380 | 381 | def parse(url): 382 | completed = False 383 | page = 1 384 | while not completed: 385 | url_gen = url[:url.rfind("=") + 1] + str(page) 386 | completed = crawl_page(get_html(url_gen)) 387 | page += 1 388 | 389 | 390 | def main(): 391 | url = "https://youla.ru/saratov/nedvijimost?attributes[sort_field]=date_published&attributes[term_of_placement][from]=-1%20day&attributes[term_of_placement][to]=now&page=1" 392 | parse(url) 393 | 394 | 395 | if __name__ == "__main__": 396 | main() 397 | db.close() 398 | --------------------------------------------------------------------------------