├── .gitignore ├── README.md ├── seloger_scraper.py └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # seloger_scraper 2 | Parser for SeLoger.com apartment information 3 | 4 | ## Usage: 5 | 6 | **seloger_scraper.py [-h] [-o OUTPUT] [-p PAGES] [-b APARTMENT_BASE_URL] URLs [URLs ...]** 7 | 8 | Positional arguments: 9 | 10 | **URLs**: List of base search URLs with all desired parameters from SeLoger.com (go to SeLoger.com, enter a search query, open the second page of the results, and copy its URL without the trailing 2 — the scraper appends the page number itself) 11 | 12 | Optional arguments: 13 | 14 | **-h, --help** show help message and exit 15 | 16 | **-o OUTPUT, --output OUTPUT** output file path (default: apartments.csv) 17 | 18 | **-p PAGES, --pages PAGES** number of pages to go through for each base URL (default: 100). 19 | 20 | **-b APARTMENT_BASE_URL, --apartment_base_url APARTMENT_BASE_URL** Base URL for constructing apartment URLs. Default value should work fine. 
import traceback
from argparse import ArgumentParser

import re
import csv
import argparse

from multiprocessing import Pool

# NOTE: `requests` and `bs4` are imported inside the two functions that fetch
# pages, so the pure parsing helpers stay importable without them installed.

# Seconds to wait for the server on a single HTTP request before giving up.
REQUEST_TIMEOUT = 30

# Column order of the generated CSV file.
FIELDNAMES = ["floor_size", "price", "furnished", "balcony", "floor", "rooms_count", "neighborhood",
              "separate_toilet", "floors_total", "name", "description", "url"]


class SeLogerScrapper:
    """Collect apartment listings from SeLoger.com search result pages.

    The work is done in two parallel passes: first every search result page is
    fetched to collect listing IDs, then every listing page is fetched and
    parsed into a flat dict suitable for ``csv.DictWriter``.
    """

    def __init__(self, thread_number=8):
        """Store the number of worker processes used by each pool."""
        self.thread_number = thread_number

    @staticmethod
    def get_list_link(base_urls, pages_count):
        """Yield the URL of pages 1..pages_count for every base search URL.

        The page number is appended directly to the base URL, so each base URL
        must end right where the page number belongs.
        """
        for link in base_urls:
            for page in range(1, pages_count + 1):
                yield link + str(page)

    @staticmethod
    def _unique_listing_ids(articles):
        """Return the distinct ``data-listing-id`` values of *articles*.

        Elements without the attribute (or with an empty one) are skipped so
        that no ``None`` reaches URL construction; first-seen order is kept.
        """
        ids = (article.attrs.get('data-listing-id') for article in articles)
        return list(dict.fromkeys(listing_id for listing_id in ids if listing_id))

    @staticmethod
    def get_apartment_links_from_url(url):
        """Fetch one search result page and return the listing IDs found on it.

        A failed request is reported on stdout and yields an empty list, so a
        single unreachable page does not abort the whole run.
        """
        import requests
        import bs4

        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            print(url)
            print(traceback.format_exc())
            return []

        soup = bs4.BeautifulSoup(response.text, "lxml")
        return SeLogerScrapper._unique_listing_ids(soup.select('article.listing.life_annuity'))

    def get_apartment_links(self, base_urls, pages_count):
        """Return the distinct listing IDs found on all requested result pages."""
        # The context manager shuts the workers down once map() has returned.
        with Pool(self.thread_number) as pool:
            results_by_page = pool.map(SeLogerScrapper.get_apartment_links_from_url,
                                       self.get_list_link(base_urls, pages_count))

        # The same listing can show up on several pages or under several base
        # URLs; keep the first occurrence only.
        return list(dict.fromkeys(listing_id
                                  for page_ids in results_by_page
                                  for listing_id in page_ids))

    @staticmethod
    def get_apartment_url(apartment_id_list, base_apartment_url):
        """Yield the listing page URL for every ID in *apartment_id_list*."""
        for apartment_id in apartment_id_list:
            yield base_apartment_url + apartment_id + '.htm'

    @staticmethod
    def _parse_price(price_string):
        """Return the whole-number part of a displayed price as an int.

        Everything from the first comma on is dropped, then all non-digit
        characters (spaces, currency sign) are ignored.
        Raises ValueError when no digit is left.
        """
        whole_part = price_string.split(',', 1)[0]
        return int(''.join(filter(str.isdigit, whole_part)))

    @staticmethod
    def get_apartment_info_from_url(url):
        """Fetch one listing page and return its parsed fields as a dict.

        Returns an empty dict when the page cannot be fetched or does not have
        the expected structure; the URL and traceback are printed in that case.
        """
        import requests
        import bs4

        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            soup = bs4.BeautifulSoup(response.text, "lxml")
            apartment_info = {"url": url}
            title_tag = soup.find("h1", class_="detail-title")
            apartment_info["name"] = next(title_tag.stripped_strings)

            resume_info = soup.find("div", class_="resume__infos")
            price_string = next(resume_info.find(id="price").stripped_strings)
            apartment_info["price"] = SeLogerScrapper._parse_price(price_string)

            description = soup.find(class_="detail__description")
            apartment_info["neighborhood"] = \
                SeLogerScrapper.get_string_number(description.find(class_="detail-subtitle").find("span").string)
            apartment_info["description"] = str(description.find("p", class_="description").string)

            parameter_list = description.find("ol", class_="description-liste")
            apartment_info.update(SeLogerScrapper.process_criteria(parameter_list.find_all('li')))
        except Exception:
            # Worker boundary: one malformed listing must not kill the pool.
            print(url)
            print(traceback.format_exc())
            return {}

        return apartment_info

    @staticmethod
    def process_criteria(criteria):
        """Turn the listing's criteria items into CSV fields.

        *criteria* is an iterable of objects exposing a ``string`` attribute.
        Items without text are ignored. A numeric criterion whose text has no
        usable number is skipped instead of failing the whole listing.
        """
        processed_criteria = {"furnished": 0, "balcony": 0, "separate_toilet": 0}
        for criterion in criteria:
            raw = criterion.string
            if not raw:
                continue
            text = raw.lower()

            # "etages" must be tested before "etage": the latter is a
            # substring of the former.
            if "m²" in raw:
                numeric_key = "floor_size"
            elif "etages" in text:
                numeric_key = "floors_total"
            elif "etage" in text:
                numeric_key = "floor"
            elif "pièce" in text:
                numeric_key = "rooms_count"
            else:
                if "meublé" in text:
                    processed_criteria["furnished"] = 1
                elif "balcon" in text or "terrasse" in text:
                    processed_criteria["balcony"] = 1
                elif "toilettes séparées" in text:
                    processed_criteria["separate_toilet"] = 1
                continue

            try:
                processed_criteria[numeric_key] = SeLogerScrapper.get_string_number(raw)
            except ValueError:
                continue

        return processed_criteria

    @staticmethod
    def get_string_number(string):
        """Return the first integer found in *string*.

        Text containing "rdc" (case-insensitive) is mapped to 1.
        Raises ValueError when *string* is None or contains no digits.
        """
        if string is None:
            raise ValueError("no text to extract a number from")
        if 'rdc' in string.lower():
            return 1
        match = re.search(r'\d+', string)
        if match is None:
            raise ValueError("no number found in {!r}".format(string))
        return int(match.group())

    def get_apartments_info(self, base_urls, apartment_base_url, pages_count=100):
        """Scrape every listing reachable from *base_urls*.

        Returns one dict per listing, in listing order; a listing that could
        not be parsed is represented by an empty dict.
        """
        apartment_ids = self.get_apartment_links(base_urls, pages_count)

        with Pool(self.thread_number) as pool:
            return pool.map(SeLogerScrapper.get_apartment_info_from_url,
                            self.get_apartment_url(apartment_ids, apartment_base_url))


def main():
    """Parse the command line, scrape the listings and write them to a CSV file."""
    parser = ArgumentParser(description="Scrapper for SeLoger.com")

    parser.add_argument('urls', metavar='URLs', type=str, nargs='+', help='List of base search URLs')
    parser.add_argument('-o', '--output', default='apartments.csv', type=str,
                        help='output file path')
    parser.add_argument('-p', '--pages', default=100, type=int,
                        help='number of pages to go through for each base URL')
    # In the author's experience the last part of this URL doesn't matter - the
    # apartments are returned by id - but it has to be there for the request to
    # work. One of the Paris districts is used as the default; it works all
    # the same for other districts/cities.
    parser.add_argument('-b', '--apartment_base_url',
                        default="http://www.seloger.com/annonces/locations/appartement/paris-17eme-75/", type=str,
                        help='base URL for constructing apartment URLs')

    arguments = parser.parse_args()

    # Scrape before opening the output so that a failed run does not truncate
    # an existing file.
    apartments = SeLogerScrapper().get_apartments_info(arguments.urls,
                                                       arguments.apartment_base_url, arguments.pages)
    # Listings that failed to parse come back as {} and would be blank rows.
    rows = [apartment for apartment in apartments if apartment]

    # newline='' lets the csv module control line endings itself.
    with open(arguments.output, mode='w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=FIELDNAMES)

        writer.writeheader()
        writer.writerows(rows)


if __name__ == '__main__':
    main()
26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. 
You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 
108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 
145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | 167 | --------------------------------------------------------------------------------