├── .gitignore
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── REQUIREMENTS.txt
├── VERSION.txt
├── craigslist
│   ├── __init__.py
│   ├── base.py
│   ├── craigslist.py
│   └── utils.py
├── setup.cfg
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]
*.db
*~
*.egg
*.egg-info
dist
build
eggs
sdist
develop-eggs
.installed.cfg
pip-log.txt
.DS_Store
.venv

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
The MIT-Zero License

Copyright (c) 2015 Julio M Alegria

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include craigslist *
include *.rst
include *.txt

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
python-craigslist
=================

A simple `Craigslist <https://www.craigslist.org>`__ wrapper.

License: `MIT-Zero <https://romanrm.net/mit-zero>`__.

Disclaimer
----------

* I don't work for or have any affiliation with Craigslist.
* This module was implemented for educational purposes. It should not be used for crawling or downloading data from Craigslist.

Installation
------------

::

    pip install python-craigslist

Classes
-------

Base class:

* ``CraigslistBase``

Subclasses:

* ``CraigslistCommunity`` (craigslist.org > community)
* ``CraigslistHousing`` (craigslist.org > housing)
* ``CraigslistJobs`` (craigslist.org > jobs)
* ``CraigslistForSale`` (craigslist.org > for sale)
* ``CraigslistEvents`` (craigslist.org > event calendar)
* ``CraigslistServices`` (craigslist.org > services)
* ``CraigslistGigs`` (craigslist.org > gigs)
* ``CraigslistResumes`` (craigslist.org > resumes)
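
All subclasses share the same constructor arguments (``site``, ``area``,
``category`` and ``filters``), so switching sections is mostly a matter of
importing a different class. A minimal sketch (the site and category values
here are just placeholders):

.. code:: python

    from craigslist import CraigslistForSale

    cl = CraigslistForSale(site='sfbay', category='cta')
    for result in cl.get_results(limit=3):
        print(result['name'])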

Examples
--------

Looking for a room in San Francisco?

.. code:: python

    from craigslist import CraigslistHousing
    cl_h = CraigslistHousing(site='sfbay', area='sfc', category='roo',
                             filters={'max_price': 1200, 'private_room': True})

    # You can get an approximate count of results with the following call:
    print(cl_h.get_results_approx_count())

    992

    for result in cl_h.get_results(sort_by='newest', geotagged=True):
        print(result)

    {
        'id': u'4851150747',
        'name': u'Near SFSU, UCSF and NEWLY FURNISHED - CLEAN, CONVENIENT and CLEAN!',
        'url': u'http://sfbay.craigslist.org/sfc/roo/4851150747.html',
        'datetime': u'2015-01-27 23:44',
        'price': u'$1100',
        'where': u'inner sunset / UCSF',
        'has_image': False,
        'has_map': True,
        'geotag': (37.738473, -122.494721)
    }
    # ...

Maybe a software engineering internship in Silicon Valley?

.. code:: python

    from craigslist import CraigslistJobs
    cl_j = CraigslistJobs(site='sfbay', area='sby', category='sof',
                          filters={'is_internship': True,
                                   'employment_type': ['full-time', 'part-time']})

    for result in cl_j.get_results():
        print(result)

    {
        'id': u'5708651182',
        'name': u'GAME DEVELOPER INTERNSHIP AT TYNKER - AVAILABLE NOW!',
        'url': u'http://sfbay.craigslist.org/pen/eng/5708651182.html',
        'datetime': u'2016-07-30 13:30',
        'price': None,
        'where': u'mountain view',
        'has_image': True,
        'has_map': True,
        'geotag': None
    }
    # ...

Events with free food in New York?

.. code:: python

    from craigslist import CraigslistEvents
    cl_e = CraigslistEvents(site='newyork', filters={'free': True, 'food': True})

    for result in cl_e.get_results(sort_by='newest', limit=5):
        print(result)

    {
        'id': u'4866178242',
        'name': u'Lituation Thursdays @ Le Reve',
        'url': u'http://newyork.craigslist.org/mnh/eve/4866178242.html',
        'datetime': u'1/29',
        'price': None,
        'where': u'Midtown East',
        'has_image': True,
        'has_map': True,
        'geotag': None
    }
    # ...

Where to get ``filters`` from?
------------------------------

Every subclass has its own set of filters. To get a list of all the filters
supported by a specific subclass, use the ``.show_filters()`` class method:

.. code:: python

    >>> from craigslist import CraigslistJobs, CraigslistForSale
    >>> CraigslistJobs.show_filters()

    Base filters:
    * query = ...
    * search_titles = True/False
    * has_image = True/False
    * posted_today = True/False
    * bundle_duplicates = True/False
    * search_distance = ...
    * zip_code = ...

    CraigslistJobs filters:
    * is_internship = True/False
    * is_nonprofit = True/False
    * is_telecommuting = True/False
    * employment_type = u'full-time', u'part-time', u'contract', u"employee's choice"
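
The printed names map directly onto the ``filters`` dict. For instance (the
filter values below are illustrative):

.. code:: python

    from craigslist import CraigslistJobs

    cl_j = CraigslistJobs(site='sfbay',
                          filters={'posted_today': True,
                                   'employment_type': ['full-time', 'contract']})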

Filters may also depend on the ``category``, which you can pass to
``.show_filters()`` explicitly:

.. code:: python

    >>> CraigslistForSale.show_filters(category='cta')

    Base filters:
    * query = ...
    * search_titles = True/False
    * has_image = True/False
    * posted_today = True/False
    * bundle_duplicates = True/False
    * search_distance = ...
    * zip_code = ...

    CraigslistForSale filters with category 'cta':
    * min_price = ...
    * max_price = ...
    * make = ...
    * model = ...
    * min_year = ...
    * max_year = ...
    * min_miles = ...
    * max_miles = ...
    * min_engine_displacement = ...
    * max_engine_displacement = ...
    * condition = u'new', u'like new', u'excellent', u'good', u'fair', u'salvage'
    * auto_cylinders = u'3 cylinders', u'4 cylinders', u'5 cylinders', u'6 cylinders', u'8 cylinders', u'10 cylinders', u'12 cylinders', u'other'
    * auto_drivetrain = u'fwd', u'rwd', u'4wd'
    * auto_fuel_type = u'gas', u'diesel', u'hybrid', u'electric', u'other'
    * auto_paint = u'black', u'blue', u'brown', u'green', u'grey', u'orange', u'purple', u'red', u'silver', u'white', u'yellow', u'custom'
    * auto_size = u'compact', u'full-size', u'mid-size', u'sub-compact'
    * auto_title_status = u'clean', u'salvage', u'rebuilt', u'parts only', u'lien', u'missing'
    * auto_transmission = u'manual', u'automatic', u'other'
    * auto_bodytype = u'bus', u'convertible', u'coupe', u'hatchback', u'mini-van', u'offroad', u'pickup', u'sedan', u'truck', u'SUV', u'wagon', u'van', u'other'

Where to get ``site`` and ``area`` from?
----------------------------------------

When initializing any of the subclasses, you'll need to provide the ``site``, and optionally the ``area``, from which you want to query data.

To get the correct ``site``, follow these steps:

1. Go to `craigslist.org/about/sites <https://www.craigslist.org/about/sites>`__.
2. Find the country or city you're interested in, and click on it.
3. You'll be directed to ``<site>.craigslist.org``. The value of ``<site>`` in the URL is the one you should use.

Not all sites have areas. To check whether your site has areas, look for links next to the title of the Craigslist page, at the top center. For example, for New York you'll see:

.. image:: https://user-images.githubusercontent.com/1008637/45307206-bb404d80-b51e-11e8-8e6d-edfbdbd0a6fa.png

Click on the one you're interested in, and you'll be redirected to ``<site>.craigslist.org/<area>``. The value of ``<area>`` in the URL is the one you should use. If there are no areas next to the title, your site has no areas and you can leave that argument unset.

Where to get ``category`` from?
-------------------------------

You can additionally provide a ``category`` when initializing any of the
subclasses. To get a list of all the categories supported by a specific
subclass, use the ``.show_categories()`` class method:

.. code:: python

    >>> from craigslist import CraigslistServices
    >>> CraigslistServices.show_categories()

    CraigslistServices categories:
    * aos = automotive services
    * bts = beauty services
    * cms = cell phone / mobile services
    * cps = computer services
    * crs = creative services
    * cys = cycle services
    * evs = event services
    * fgs = farm & garden services
    * fns = financial services
    * hws = health/wellness services
    * hss = household services
    * lbs = labor / hauling / moving
    * lgs = legal services
    * lss = lessons & tutoring
    * mas = marine services
    * pas = pet services
    * rts = real estate services
    * sks = skilled trade services
    * biz = small biz ads
    * trv = travel/vacation services
    * wet = writing / editing / translation

Is there a limit for the number of results?
-------------------------------------------

Yes, Craigslist caps the results for any search at 3000.
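
Putting it all together
-----------------------

A minimal end-to-end sketch combining ``site``, ``area``, ``category`` and
``filters`` (all values below are placeholders; pick your own as described in
the previous sections):

.. code:: python

    from craigslist import CraigslistForSale

    cl_fs = CraigslistForSale(site='sfbay', area='sfc', category='cta',
                              filters={'min_price': 2000, 'max_price': 10000,
                                       'auto_transmission': 'manual'})

    # 'limit' bounds how many results are fetched; Craigslist itself caps
    # any search at 3000 results.
    for result in cl_fs.get_results(sort_by='price_asc', limit=20):
        print(result['name'], result['price'])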

Support
-------

If you find any bug or want to propose a new feature, please use the `issues tracker <https://github.com/juliomalegria/python-craigslist/issues>`__. I'll be happy to help you! :-)

--------------------------------------------------------------------------------
/REQUIREMENTS.txt:
--------------------------------------------------------------------------------
requests>=2.25.0
urllib3>=1.26.0
beautifulsoup4>=4.9.0
six

--------------------------------------------------------------------------------
/VERSION.txt:
--------------------------------------------------------------------------------
1.1.4

--------------------------------------------------------------------------------
/craigslist/__init__.py:
--------------------------------------------------------------------------------
from .craigslist import (
    CraigslistCommunity, CraigslistEvents, CraigslistForSale, CraigslistGigs,
    CraigslistHousing, CraigslistJobs, CraigslistResumes, CraigslistServices)

__all__ = [
    'CraigslistCommunity', 'CraigslistEvents', 'CraigslistForSale',
    'CraigslistGigs', 'CraigslistHousing', 'CraigslistJobs',
    'CraigslistResumes', 'CraigslistServices']

--------------------------------------------------------------------------------
/craigslist/base.py:
--------------------------------------------------------------------------------
import logging
try:
    from Queue import Queue  # PY2
except ImportError:
    from queue import Queue  # PY3
from threading import Thread
try:
    from urlparse import urljoin  # PY2
except ImportError:
    from urllib.parse import urljoin  # PY3

from six import iteritems
from six.moves import range

from . import utils

ALL_SITES = utils.get_all_sites()  # All the Craigslist sites
RESULTS_PER_REQUEST = 100  # Craigslist returns 100 results per request


class CraigslistBase(object):
    """ Base class for all Craigslist wrappers. """

    url_templates = {
        'base': 'http://%(site)s.craigslist.org',
        'no_area': 'http://%(site)s.craigslist.org/search/%(category)s',
        'area': 'http://%(site)s.craigslist.org/search/%(area)s/%(category)s'
    }

    default_site = 'sfbay'
    default_category = None

    base_filters = {
        'query': {'url_key': 'query', 'value': None},
        'search_titles': {'url_key': 'srchType', 'value': 'T'},
        'has_image': {'url_key': 'hasPic', 'value': 1},
        'posted_today': {'url_key': 'postedToday', 'value': 1},
        'bundle_duplicates': {'url_key': 'bundleDuplicates', 'value': 1},
        'search_distance': {'url_key': 'search_distance', 'value': None},
        'zip_code': {'url_key': 'postal', 'value': None},
    }
    extra_filters = {}
    __list_filters = {}  # Cache for list filters, keyed by URL

    # Set to True if the subclass defines a customize_result() method.
    custom_result_fields = False

    sort_by_options = {
        'newest': 'date',
        'price_asc': 'priceasc',
        'price_desc': 'pricedsc',
    }

    def __init__(self, site=None, area=None, category=None, filters=None,
                 log_level=logging.WARNING):
        # Logging
        self.set_logger(log_level, init=True)

        self.site = site or self.default_site
        if self.site not in ALL_SITES:
            msg = "'%s' is not a valid site" % self.site
            self.logger.error(msg)
            raise ValueError(msg)

        if area:
            if not self.is_valid_area(area):
                msg = ("'%s' is not a valid area for site '%s'"
                       % (area, self.site))
                self.logger.error(msg)
                raise ValueError(msg)
        self.area = area

        self.category = category or self.default_category

        url_template = self.url_templates['area' if area else 'no_area']
        self.url = url_template % {'site': self.site, 'area': self.area,
                                   'category': self.category}

        self.filters = self.get_filters(filters)

    def get_filters(self, filters):
        """Parses filters passed by the user into GET parameters."""

        list_filters = self.get_list_filters(self.url)

        # If a search has few results, results for "similar listings" will
        # be included. The solution is a bit counter-intuitive: to force
        # this not to happen, we set searchNearby=1 but don't pass any
        # nearbyArea=X, which prevents similar listings from showing up.
        parsed_filters = {'searchNearby': 1}

        for key, value in iteritems(filters or {}):
            try:
                filter_ = (self.base_filters.get(key) or
                           self.extra_filters.get(key) or
                           list_filters[key])
                if filter_['value'] is None:
                    parsed_filters[filter_['url_key']] = value
                elif isinstance(filter_['value'], dict):
                    valid_options = filter_['value']
                    if not utils.isiterable(value) or isinstance(value, str):
                        value = [value]  # Force into a list
                    options = []
                    for opt in value:
                        try:
                            options.append(valid_options[opt])
                        except KeyError:
                            self.logger.warning(
                                "'%s' is not a valid option for %s"
                                % (opt, key)
                            )
                    parsed_filters[filter_['url_key']] = options
                elif value:  # Don't add the filter if ...=False
                    parsed_filters[filter_['url_key']] = filter_['value']
            except KeyError:
                self.logger.warning("'%s' is not a valid filter", key)

        return parsed_filters

    def set_logger(self, log_level, init=False):
        if init:
            self.logger = logging.getLogger('python-craigslist')
            self.handler = logging.StreamHandler()
            self.logger.addHandler(self.handler)
        self.logger.setLevel(log_level)
        self.handler.setLevel(log_level)

    def is_valid_area(self, area):
        base_url = self.url_templates['base']
        response = utils.requests_get(base_url % {'site': self.site},
                                      logger=self.logger)
        soup = utils.bs(response.content)
        sublinks = soup.find('ul', {'class': 'sublinks'})
        return sublinks and sublinks.find('a', text=area) is not None

    def get_results_approx_count(self, soup=None):
        """
        Gets the approximate number of results to be returned by
        `get_results`.

        Note that this number may not exactly match the actual number of
        results returned (although in my tests it's usually within +/-10).
        Also note that this makes an extra request to Craigslist (unless
        `soup` is provided).
        """

        if soup is None:
            response = utils.requests_get(self.url, params=self.filters,
                                          logger=self.logger)
            self.logger.info('GET %s', response.url)
            self.logger.info('Response code: %s', response.status_code)
            response.raise_for_status()  # Something failed?
            soup = utils.bs(response.content)

        totalcount = soup.find('span', {'class': 'totalcount'})
        return int(totalcount.text) if totalcount else None
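
    # How pagination works: get_results() below re-submits the search with
    # the 's' parameter set to 0, 100, 200, ... (RESULTS_PER_REQUEST is
    # 100), and stops as soon as a page comes back short or `limit` results
    # have been yielded.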
    def get_results(self, limit=None, start=0, sort_by=None, geotagged=False,
                    include_details=False):
        """
        Gets results from Craigslist based on the specified filters.

        If geotagged=True, the results will include the (lat, lng) in the
        'geotag' attrib (this will make the process a little bit longer).
        """

        if sort_by:
            try:
                self.filters['sort'] = self.sort_by_options[sort_by]
            except KeyError:
                msg = ("'%s' is not a valid sort_by option, "
                       "use: 'newest', 'price_asc' or 'price_desc'" % sort_by)
                self.logger.error(msg)
                raise ValueError(msg)

        total_so_far = start
        results_yielded = 0
        total = 0

        while True:
            self.filters['s'] = start
            response = utils.requests_get(self.url, params=self.filters,
                                          logger=self.logger)
            self.logger.info('GET %s', response.url)
            self.logger.info('Response code: %s', response.status_code)
            response.raise_for_status()  # Something failed?

            soup = utils.bs(response.content)
            if not total:
                total = self.get_results_approx_count(soup=soup)

            rows = soup.find('ul', {'class': 'rows'})
            for row in rows.find_all('li', {'class': 'result-row'},
                                     recursive=False):
                if limit is not None and results_yielded >= limit:
                    break
                self.logger.debug('Processing %s of %s results ...',
                                  total_so_far + 1, total or '(undefined)')

                yield self.process_row(row, geotagged, include_details)

                results_yielded += 1
                total_so_far += 1

            if results_yielded == limit:
                break
            if (total_so_far - start) < RESULTS_PER_REQUEST:
                break
            start = total_so_far

    def process_row(self, row, geotagged=False, include_details=False):
        id = row.attrs['data-pid']
        repost_of = row.attrs.get('data-repost-of')

        link = row.find('a', {'class': 'hdrlnk'})
        name = link.text
        url = urljoin(self.url, link.attrs['href'])

        time = row.find('time')
        if time:
            datetime = time.attrs['datetime']
        else:
            pl = row.find('span', {'class': 'pl'})
            datetime = pl.text.split(':')[0].strip() if pl else None
        price = row.find('span', {'class': 'result-price'})
        where = row.find('span', {'class': 'result-hood'})
        if where:
            where = where.text.strip()[1:-1]  # Remove the surrounding ()
        tags_span = row.find('span', {'class': 'result-tags'})
        tags = tags_span.text if tags_span else ''

        result = {'id': id,
                  'repost_of': repost_of,
                  'name': name,
                  'url': url,
                  # NOTE: 'datetime' is kept for backwards compatibility;
                  # use 'last_updated' instead.
                  'datetime': datetime,
                  'last_updated': datetime,
                  'price': price.text if price else None,
                  'where': where,
                  'has_image': 'pic' in tags,
                  'geotag': None,
                  # In very few cases, a posting will be included in the
                  # result list but has already been deleted (or is deleted
                  # after the list was retrieved). In that case, this field
                  # is set to True. If you want to be extra careful, always
                  # check that this field is False before using a result.
                  'deleted': False}

        if geotagged or include_details:
            detail_soup = self.fetch_content(result['url'])
            if detail_soup:
                if geotagged:
                    self.geotag_result(result, detail_soup)
                if include_details:
                    self.include_details(result, detail_soup)

        if self.custom_result_fields:
            self.customize_result(result)

        return result

    def customize_result(self, result):
        """ Adds, deletes or alters fields in the result. """
        # Override in a subclass to add category-specific fields.
        # FYI: `attrs` will only be present if include_details was True.
        pass
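
    # A minimal override sketch (hypothetical subclass; the category code
    # and the 'condition: ' attribute prefix are assumptions, for
    # illustration only):
    #
    #     class CraigslistAntiques(CraigslistBase):
    #         default_category = 'ata'  # hypothetical category code
    #         custom_result_fields = True
    #
    #         def customize_result(self, result):
    #             for attr in result.get('attrs', []):
    #                 if attr.lower().startswith('condition: '):
    #                     result['condition'] = attr[len('condition: '):]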

    def geotag_result(self, result, soup=None):
        """ Adds (lat, lng) to the result. """

        self.logger.debug('Geotagging result ...')

        # Fetch the detail page if a parsed soup wasn't provided (e.g. when
        # called from geotag_results()).
        if soup is None:
            soup = self.fetch_content(result['url'])
        if soup is None:
            return result

        map_ = soup.find('div', {'id': 'map'})
        if map_:
            result['geotag'] = (float(map_.attrs['data-latitude']),
                                float(map_.attrs['data-longitude']))

        return result

    def include_details(self, result, soup):
        """ Adds description and images to the result. """

        self.logger.debug('Adding details to result ...')

        body = soup.find('section', id='postingbody')

        if not body:
            # This should only happen when the posting has been deleted by
            # its author.
            result['deleted'] = True
            return

        # We need to massage the data a little bit because it might include
        # some inner elements that we want to ignore.
        body_text = (getattr(e, 'text', e) for e in body
                     if not getattr(e, 'attrs', None))
        result['body'] = ''.join(body_text).strip()

        # Add the created time (in case it differs from last_updated).
        postinginfos = soup.find('div', {'class': 'postinginfos'})
        for p in postinginfos.find_all('p'):
            if 'posted' in p.text:
                time = p.find('time')
                if time:
                    # This date is in ISO format. Remove the T literal and
                    # the timezone to make it the same format as
                    # 'last_updated'.
                    created = time.attrs['datetime'].replace('T', ' ')
                    result['created'] = created.rsplit(':', 1)[0]

        # Add the images' URLs.
        image_tags = soup.find_all('img')
        # If there's more than one picture, the first one will be repeated.
        image_tags = image_tags[1:] if len(image_tags) > 1 else image_tags
        images = []
        for img in image_tags:
            try:
                img_link = img['src'].replace('50x50c', '600x450')
                images.append(img_link)
            except KeyError:
                continue  # Some posts contain empty <img> tags.
        result['images'] = images

        # Add the list of attributes as unparsed strings. These values are
        # then processed by `parse_attrs`, and are available to be
        # post-processed by subclasses.
        attrgroups = soup.find_all('p', {'class': 'attrgroup'})
        attrs = []
        for attrgroup in attrgroups:
            for attr in attrgroup.find_all('span'):
                attr_text = attr.text.strip()
                if attr_text:
                    attrs.append(attr_text)
        result['attrs'] = attrs
        if attrs:
            self.parse_attrs(result)

        # If an address is included, add it to `address`.
        mapaddress = soup.find('div', {'class': 'mapaddress'})
        if mapaddress:
            result['address'] = mapaddress.text

    def parse_attrs(self, result):
        """Parses raw attributes into structured fields in the result dict."""

        # Parse binary fields first by checking their presence.
        attrs = set(attr.lower() for attr in result['attrs'])
        for key, options in iteritems(self.extra_filters):
            if options['value'] != 1:
                continue  # Filter is not binary
            if options.get('attr', '') in attrs:
                result[key] = True
        # Values from list filters are sometimes shown as {filter}: {value},
        # e.g. "transmission: automatic", although usually they are shown
        # with the {value} only, e.g. "laundry in bldg". By stripping the
        # content before the colon (if any) we reduce this to a single case.
        attrs_after_colon = set(
            attr.split(': ', 1)[-1] for attr in result['attrs'])
        for key, options in iteritems(self.get_list_filters(self.url)):
            for option in options['value'].keys():
                if option in attrs_after_colon:
                    result[key] = option
                    break
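
    # For illustration: with CraigslistHousing, a posting whose attributes
    # include 'furnished' gets result['is_furnished'] = True through the
    # 'attr' key declared in extra_filters, while a for-sale posting with
    # the attribute 'transmission: automatic' gets
    # result['auto_transmission'] = 'automatic' through the list filters
    # (assuming 'auto_transmission' is the list-filter key on that site).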

    def fetch_content(self, url):
        response = utils.requests_get(url, logger=self.logger)
        self.logger.info('GET %s', response.url)
        self.logger.info('Response code: %s', response.status_code)

        if response.ok:
            return utils.bs(response.content)

        self.logger.warning("GET %s returned a not-OK response code: %s "
                            "(skipping)", url, response.status_code)
        return None

    def geotag_results(self, results, workers=8):
        """
        Adds (lat, lng) to each result. This process is done using N
        threads, where N is the number of workers defined (default: 8).
        """

        results = list(results)
        queue = Queue()

        for result in results:
            queue.put(result)

        def geotagger():
            while not queue.empty():
                self.logger.debug('%s results left to geotag ...',
                                  queue.qsize())
                self.geotag_result(queue.get())
                queue.task_done()

        threads = []
        for _ in range(workers):
            thread = Thread(target=geotagger)
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()
        return results

    @classmethod
    def get_list_filters(cls, url):
        if cls.__list_filters.get(url) is None:
            cls.__list_filters[url] = utils.get_list_filters(url)
        return cls.__list_filters[url]

    @classmethod
    def show_categories(cls):
        url = cls.url_templates['no_area'] % {
            'site': cls.default_site,
            'category': cls.default_category,
        }
        response = utils.requests_get(url)
        soup = utils.bs(response.content)

        cat_html = soup.find_all('input', {'class': 'catcheck multi_checkbox'})
        cat_ids = [html.get('data-abb') for html in cat_html]
        cat_html = soup.find_all('a', {'class': 'category'})
        cat_names = [html.contents[0] for html in cat_html]

        print('%s categories:' % cls.__name__)
        for cat_name, cat_id in sorted(zip(cat_names, cat_ids)):
            print('* %s = %s' % (cat_id, cat_name))

    @classmethod
    def show_filters(cls, category=None):
        print('Base filters:')
        for key, options in iteritems(cls.base_filters):
            value_as_str = '...' if options['value'] is None else 'True/False'
            print('* %s = %s' % (key, value_as_str))

        if category is None:
            print('\n%s filters:' % cls.__name__)
        else:
            print("\n%s filters with category '%s':" % (cls.__name__,
                                                        category))
        for key, options in iteritems(cls.extra_filters):
            value_as_str = '...' if options['value'] is None else 'True/False'
            print('* %s = %s' % (key, value_as_str))
        url = cls.url_templates['no_area'] % {
            'site': cls.default_site,
            'category': category or cls.default_category,
        }
        list_filters = cls.get_list_filters(url)
        for key, options in iteritems(list_filters):
            value_as_str = ', '.join(
                repr(opt) for opt in options['value'].keys())
            print('* %s = %s' % (key, value_as_str))

--------------------------------------------------------------------------------
/craigslist/craigslist.py:
--------------------------------------------------------------------------------
from .base import CraigslistBase


class CraigslistCommunity(CraigslistBase):
    """ Craigslist community wrapper. """

    default_category = 'ccc'


class CraigslistEvents(CraigslistBase):
    """ Craigslist events wrapper. """

    default_category = 'eee'
    custom_result_fields = True

    extra_filters = {
        # art/film
        'art': {'url_key': 'event_art', 'value': 1, 'attr': 'art/film'},
        'film': {'url_key': 'event_art', 'value': 1, 'attr': 'art/film'},
        # career
        'career': {'url_key': 'event_career', 'value': 1, 'attr': 'career'},
        # charitable
        'charitable': {
            'url_key': 'event_fundraiser_vol', 'value': 1,
            'attr': 'charitable'},
        'fundraiser': {
            'url_key': 'event_fundraiser_vol', 'value': 1,
            'attr': 'charitable'},
        # competition
        'athletics': {
            'url_key': 'event_athletics', 'value': 1, 'attr': 'competition'},
        'competition': {
            'url_key': 'event_athletics', 'value': 1, 'attr': 'competition'},
        # dance
        'dance': {'url_key': 'event_dance', 'value': 1, 'attr': 'dance'},
        # fest/fair
        'festival': {
            'url_key': 'event_festival', 'value': 1, 'attr': 'fest/fair'},
        'fair': {'url_key': 'event_festival', 'value': 1, 'attr': 'fest/fair'},
        # fitness/health
        'fitness': {
            'url_key': 'event_fitness_wellness', 'value': 1,
            'attr': 'fitness/health'},
        'health': {
            'url_key': 'event_fitness_wellness', 'value': 1,
            'attr': 'fitness/health'},
        # food/drink
        'food': {'url_key': 'event_food', 'value': 1, 'attr': 'food/drink'},
        'drink': {'url_key': 'event_food', 'value': 1, 'attr': 'food/drink'},
        # free
        'free': {'url_key': 'event_free', 'value': 1, 'attr': 'free'},
        # kid friendly
        'kid_friendly': {
            'url_key': 'event_kidfriendly', 'value': 1,
            'attr': 'kid friendly'},
        # literary
        'literary': {
            'url_key': 'event_literary', 'value': 1, 'attr': 'literary'},
        # music
        'music': {'url_key': 'event_music', 'value': 1, 'attr': 'music'},
        # outdoor
        'outdoor': {'url_key': 'event_outdoor', 'value': 1, 'attr': 'outdoor'},
        # sale
        'sale': {'url_key': 'event_sale', 'value': 1, 'attr': 'sale'},
        # singles
        'singles': {'url_key': 'event_singles', 'value': 1, 'attr': 'singles'},
        # tech
        'tech': {'url_key': 'event_geek', 'value': 1, 'attr': 'tech'},
    }

    def customize_result(self, result):
        for attr in result.get('attrs', []):
            # Get the venue.
            if attr.lower().startswith('venue: '):
                result['venue'] = attr[7:]


class CraigslistForSale(CraigslistBase):
    """ Craigslist for sale wrapper. """

    default_category = 'sss'
    custom_result_fields = True

    extra_filters = {
        # price
        'min_price': {'url_key': 'min_price', 'value': None},
        'max_price': {'url_key': 'max_price', 'value': None},
        # make and model
        'make': {'url_key': 'auto_make_model', 'value': None},
        'model': {'url_key': 'auto_make_model', 'value': None},
        # model year
        'min_year': {'url_key': 'min_auto_year', 'value': None},
        'max_year': {'url_key': 'max_auto_year', 'value': None},
        # odometer
        'min_miles': {'url_key': 'min_auto_miles', 'value': None},
        'max_miles': {'url_key': 'max_auto_miles', 'value': None},
        # engine displacement (cc)
        'min_engine_displacement': {
            'url_key': 'min_engine_displacement_cc', 'value': None},
        'max_engine_displacement': {
            'url_key': 'max_engine_displacement_cc', 'value': None},
    }

    def customize_result(self, result):
        for attr in result.get('attrs', []):
            attr_lower = attr.lower()
            # Get the miles.
            if attr_lower.startswith('odometer: '):
                result['miles'] = attr[10:]
            # Get the engine displacement.
            if attr_lower.startswith('engine displacement (cc): '):
                result['engine_displacement'] = attr[26:]


class CraigslistGigs(CraigslistBase):
    """ Craigslist gigs wrapper. """

    default_category = 'ggg'
    custom_result_fields = True

    extra_filters = {
        # paid/unpaid
        'is_paid': {'url_key': 'is_paid', 'value': None},
    }

    def __init__(self, *args, **kwargs):
        try:
            is_paid = kwargs['filters']['is_paid']
            kwargs['filters']['is_paid'] = 'yes' if is_paid else 'no'
        except KeyError:
            pass
        super(CraigslistGigs, self).__init__(*args, **kwargs)

    def customize_result(self, result):
        for attr in result.get('attrs', []):
            # Get the compensation.
            if attr.lower().startswith('compensation: '):
                result['compensation'] = attr[14:]
        result['is_paid'] = 'compensation' in result


class CraigslistHousing(CraigslistBase):
    """ Craigslist housing wrapper. """

    default_category = 'hhh'
    custom_result_fields = True

    extra_filters = {
        # price
        'min_price': {'url_key': 'min_price', 'value': None},
        'max_price': {'url_key': 'max_price', 'value': None},
        # bedrooms
        'min_bedrooms': {'url_key': 'min_bedrooms', 'value': None},
        'max_bedrooms': {'url_key': 'max_bedrooms', 'value': None},
        # bathrooms
        'min_bathrooms': {'url_key': 'min_bathrooms', 'value': None},
        'max_bathrooms': {'url_key': 'max_bathrooms', 'value': None},
        # ft2
        'min_ft2': {'url_key': 'minSqft', 'value': None},
        'max_ft2': {'url_key': 'maxSqft', 'value': None},
        # private room
        'private_room': {
            'url_key': 'private_room', 'value': 1, 'attr': 'private room'},
        # private bath
        'private_bath': {
            'url_key': 'private_bath', 'value': 1, 'attr': 'private bath'},
        # cats ok
        'cats_ok': {
            'url_key': 'pets_cat', 'value': 1, 'attr': 'cats are ok - purrr'},
        # dogs ok
        'dogs_ok': {
            'url_key': 'pets_dog', 'value': 1, 'attr': 'dogs are ok - wooof'},
        # furnished
        'is_furnished': {
            'url_key': 'is_furnished', 'value': 1, 'attr': 'furnished'},
        # no smoking
        'no_smoking': {
            'url_key': 'no_smoking', 'value': 1, 'attr': 'no smoking'},
        # wheelchair access
        'wheelchair_acccess': {
            'url_key': 'wheelchaccess', 'value': 1,
            'attr': 'wheelchair accessible'},
        # EV charging
        'ev_charging': {
            'url_key': 'ev_charging', 'value': 1, 'attr': 'ev charging'},
        # no application fee
        'no_application_fee': {'url_key': 'application_fee', 'value': 1},
        # no broker fee
        'no_broker_fee': {'url_key': 'broker_fee', 'value': 1},
    }

    def customize_result(self, result):
        for attr in result.get('attrs', []):
            attr_lower = attr.lower()
            # Get the bedrooms and bathrooms.
            if attr_lower.endswith('br') or attr_lower.endswith('ba'):
                for elem in attr_lower.split(' / '):
                    if elem.endswith('br'):
                        # Don't convert to int, too risky
                        result['bedrooms'] = elem[:-2]
                    elif elem.endswith('ba'):
                        # Don't convert to int, too risky
                        result['bathrooms'] = elem[:-2]
            # Get the area.
            elif attr_lower.endswith('ft2') or attr_lower.endswith('m2'):
                result['area'] = attr_lower
            # Get the availability.
            elif attr_lower.startswith('available '):
                result['available'] = attr[10:]


class CraigslistJobs(CraigslistBase):
    """ Craigslist jobs wrapper. """

    default_category = 'jjj'
    custom_result_fields = True

    extra_filters = {
        # internship
        'is_internship': {
            'url_key': 'is_internship', 'value': 1, 'attr': 'internship'},
        # non-profit
        'is_nonprofit': {
            'url_key': 'is_nonprofit', 'value': 1,
            'attr': 'non-profit organization'},
        # telecommute
        'is_telecommuting': {
            'url_key': 'is_telecommuting', 'value': 1,
            'attr': 'telecommuting okay'},
    }

    def customize_result(self, result):
        for attr in result.get('attrs', []):
            # Get the compensation.
            if attr.lower().startswith('compensation: '):
                result['compensation'] = attr[14:]


class CraigslistResumes(CraigslistBase):
    """ Craigslist resumes wrapper. """

    default_category = 'rrr'

    extra_filters = {
        # TODO: Please create an issue or PR if interested in this category.
    }


class CraigslistServices(CraigslistBase):
    """ Craigslist services wrapper. """

    default_category = 'bbb'

--------------------------------------------------------------------------------
/craigslist/utils.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException

ALL_SITES_URL = 'http://www.craigslist.org/about/sites'
SITE_URL = 'http://%s.craigslist.org'
USER_AGENT = 'Mozilla/5.0'


def bs(content):
    return BeautifulSoup(content, 'html.parser')


def isiterable(var):
    try:
        return iter(var) and True
    except TypeError:
        return False


def requests_get(*args, **kwargs):
    """
    Retries once if a RequestException is raised (which could be a
    connection error or a timeout).
    """

    logger = kwargs.pop('logger', None)
    # Set a default User-Agent header if none is defined.
    kwargs.setdefault('headers', {}).setdefault('User-Agent', USER_AGENT)

    try:
        return requests.get(*args, **kwargs)
    except RequestException as exc:
        if logger:
            logger.warning('Request failed (%s). Retrying ...', exc)
        return requests.get(*args, **kwargs)


def get_all_sites():
    response = requests.get(ALL_SITES_URL)
    response.raise_for_status()  # Something failed?
    soup = BeautifulSoup(response.content, 'html.parser')
    sites = set()

    for box in soup.findAll('div', {'class': 'box'}):
        for a in box.findAll('a'):
            # Remove the protocol and get the subdomain.
            site = a.attrs['href'].rsplit('//', 1)[1].split('.')[0]
            sites.add(site)

    return sites


def get_all_areas(site):
    response = requests.get(SITE_URL % site)
    response.raise_for_status()  # Something failed?
    soup = BeautifulSoup(response.content, 'html.parser')
    raw = soup.select('ul.sublinks li a')
    areas = set(a.attrs['href'].rsplit('/')[1] for a in raw)
    return areas
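

# Illustrative shape of the value returned by get_list_filters() below (the
# option names and form values are made up; real ones vary by site and
# category):
#
#     {'auto_transmission': {'url_key': 'auto_transmission',
#                            'value': {'manual': '1', 'automatic': '2'}}}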

def get_list_filters(url):
    list_filters = {}
    response = requests_get(url)
    soup = bs(response.content)
    for list_filter in soup.find_all('div', class_='search-attribute'):
        filter_key = list_filter.attrs['data-attr']
        filter_labels = list_filter.find_all('label')
        options = {opt.text.strip(): opt.find('input').get('value')
                   for opt in filter_labels}
        list_filters[filter_key] = {'url_key': filter_key, 'value': options}
    return list_filters

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description-file = README.rst

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

with open('VERSION.txt', 'r') as v:
    version = v.read().strip()

with open('REQUIREMENTS.txt', 'r') as r:
    requires = r.read().split()

with open('README.rst', 'r') as r:
    readme = r.read()

download_url = (
    'https://github.com/juliomalegria/python-craigslist/tarball/%s'
)


setup(
    name='python-craigslist',
    packages=['craigslist'],
    version=version,
    description='Simple Craigslist wrapper.',
    long_description=readme,
    author='Julio M Alegria',
    author_email='juliomalegria@gmail.com',
    url='https://github.com/juliomalegria/python-craigslist',
    download_url=download_url % version,
    install_requires=requires,
    license='MIT-Zero'
)
--------------------------------------------------------------------------------