├── .gitignore
├── LICENSE.txt
├── MANIFEST.in
├── README.rst
├── REQUIREMENTS.txt
├── VERSION.txt
├── craigslist
│   ├── __init__.py
│   ├── base.py
│   ├── craigslist.py
│   └── utils.py
├── setup.cfg
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]
*.db
*~
*.egg
*.egg-info
dist
build
eggs
sdist
develop-eggs
.installed.cfg
pip-log.txt
.DS_Store
.venv

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
The MIT-Zero License

Copyright (c) 2015 Julio M Alegria

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include craigslist *
include *.rst
include *.txt

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
python-craigslist
=================

A simple `Craigslist <https://www.craigslist.org>`__ wrapper.

License: `MIT-Zero <https://romanrm.net/mit-zero>`__.

Disclaimer
----------

* I don't work for or have any affiliation with Craigslist.
* This module was implemented for educational purposes. It should not be used for crawling or downloading data from Craigslist.

Installation
------------

::

    pip install python-craigslist

Classes
-------

Base class:

* ``CraigslistBase``

Subclasses:

* ``CraigslistCommunity`` (craigslist.org > community)
* ``CraigslistHousing`` (craigslist.org > housing)
* ``CraigslistJobs`` (craigslist.org > jobs)
* ``CraigslistForSale`` (craigslist.org > for sale)
* ``CraigslistEvents`` (craigslist.org > event calendar)
* ``CraigslistServices`` (craigslist.org > services)
* ``CraigslistGigs`` (craigslist.org > gigs)
* ``CraigslistResumes`` (craigslist.org > resumes)
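
All subclasses share the same constructor arguments (``site``, ``area``,
``category`` and ``filters``), so switching sections is mostly a matter of
importing a different class. A minimal sketch (the site and category values
here are just placeholders):

.. code:: python

    from craigslist import CraigslistForSale

    cl = CraigslistForSale(site='sfbay', category='cta')
    for result in cl.get_results(limit=3):
        print(result['name'])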

Examples
--------

Looking for a room in San Francisco?

.. code:: python

    from craigslist import CraigslistHousing
    cl_h = CraigslistHousing(site='sfbay', area='sfc', category='roo',
                             filters={'max_price': 1200, 'private_room': True})

    # You can get an approximate count of results with the following call:
    print(cl_h.get_results_approx_count())

    992

    for result in cl_h.get_results(sort_by='newest', geotagged=True):
        print(result)

    {
        'id': u'4851150747',
        'name': u'Near SFSU, UCSF and NEWLY FURNISHED - CLEAN, CONVENIENT and CLEAN!',
        'url': u'http://sfbay.craigslist.org/sfc/roo/4851150747.html',
        'datetime': u'2015-01-27 23:44',
        'price': u'$1100',
        'where': u'inner sunset / UCSF',
        'has_image': False,
        'has_map': True,
        'geotag': (37.738473, -122.494721)
    }
    # ...

Maybe a software engineering internship in Silicon Valley?

.. code:: python

    from craigslist import CraigslistJobs
    cl_j = CraigslistJobs(site='sfbay', area='sby', category='sof',
                          filters={'is_internship': True,
                                   'employment_type': ['full-time', 'part-time']})

    for result in cl_j.get_results():
        print(result)

    {
        'id': u'5708651182',
        'name': u'GAME DEVELOPER INTERNSHIP AT TYNKER - AVAILABLE NOW!',
        'url': u'http://sfbay.craigslist.org/pen/eng/5708651182.html',
        'datetime': u'2016-07-30 13:30',
        'price': None,
        'where': u'mountain view',
        'has_image': True,
        'has_map': True,
        'geotag': None
    }
    # ...

Events with free food in New York?

.. code:: python

    from craigslist import CraigslistEvents
    cl_e = CraigslistEvents(site='newyork', filters={'free': True, 'food': True})

    for result in cl_e.get_results(sort_by='newest', limit=5):
        print(result)

    {
        'id': u'4866178242',
        'name': u'Lituation Thursdays @ Le Reve',
        'url': u'http://newyork.craigslist.org/mnh/eve/4866178242.html',
        'datetime': u'1/29',
        'price': None,
        'where': u'Midtown East',
        'has_image': True,
        'has_map': True,
        'geotag': None
    }
    # ...

Where to get ``filters`` from?
------------------------------

Every subclass has its own set of filters. To get a list of all the filters
supported by a specific subclass, use the ``.show_filters()`` class method:

.. code:: python

    >>> from craigslist import CraigslistJobs, CraigslistForSale
    >>> CraigslistJobs.show_filters()

    Base filters:
    * query = ...
    * search_titles = True/False
    * has_image = True/False
    * posted_today = True/False
    * bundle_duplicates = True/False
    * search_distance = ...
    * zip_code = ...

    CraigslistJobs filters:
    * is_internship = True/False
    * is_nonprofit = True/False
    * is_telecommuting = True/False
    * employment_type = u'full-time', u'part-time', u'contract', u"employee's choice"
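
The printed names map directly onto the ``filters`` dict. For instance (the
filter values below are illustrative):

.. code:: python

    from craigslist import CraigslistJobs

    cl_j = CraigslistJobs(site='sfbay',
                          filters={'posted_today': True,
                                   'employment_type': ['full-time', 'contract']})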

Filters may also depend on the ``category``, which you can pass to
``.show_filters()`` explicitly:

.. code:: python

    >>> CraigslistForSale.show_filters(category='cta')

    Base filters:
    * query = ...
    * search_titles = True/False
    * has_image = True/False
    * posted_today = True/False
    * bundle_duplicates = True/False
    * search_distance = ...
    * zip_code = ...

    CraigslistForSale filters with category 'cta':
    * min_price = ...
    * max_price = ...
    * make = ...
    * model = ...
    * min_year = ...
    * max_year = ...
    * min_miles = ...
    * max_miles = ...
    * min_engine_displacement = ...
    * max_engine_displacement = ...
    * condition = u'new', u'like new', u'excellent', u'good', u'fair', u'salvage'
    * auto_cylinders = u'3 cylinders', u'4 cylinders', u'5 cylinders', u'6 cylinders', u'8 cylinders', u'10 cylinders', u'12 cylinders', u'other'
    * auto_drivetrain = u'fwd', u'rwd', u'4wd'
    * auto_fuel_type = u'gas', u'diesel', u'hybrid', u'electric', u'other'
    * auto_paint = u'black', u'blue', u'brown', u'green', u'grey', u'orange', u'purple', u'red', u'silver', u'white', u'yellow', u'custom'
    * auto_size = u'compact', u'full-size', u'mid-size', u'sub-compact'
    * auto_title_status = u'clean', u'salvage', u'rebuilt', u'parts only', u'lien', u'missing'
    * auto_transmission = u'manual', u'automatic', u'other'
    * auto_bodytype = u'bus', u'convertible', u'coupe', u'hatchback', u'mini-van', u'offroad', u'pickup', u'sedan', u'truck', u'SUV', u'wagon', u'van', u'other'

Where to get ``site`` and ``area`` from?
----------------------------------------

When initializing any of the subclasses, you'll need to provide the ``site``, and optionally the ``area``, from which you want to query data.

To get the correct ``site``, follow these steps:

1. Go to `craigslist.org/about/sites <https://www.craigslist.org/about/sites>`__.
2. Find the country or city you're interested in, and click on it.
3. You'll be directed to ``<site>.craigslist.org``. The value of ``<site>`` in the URL is the one you should use.

Not all sites have areas. To check whether your site has areas, look for links next to the title of the Craigslist page, at the top center. For example, for New York you'll see:

.. image:: https://user-images.githubusercontent.com/1008637/45307206-bb404d80-b51e-11e8-8e6d-edfbdbd0a6fa.png

Click on the one you're interested in, and you'll be redirected to ``<site>.craigslist.org/<area>``. The value of ``<area>`` in the URL is the one you should use. If there are no areas next to the title, your site has no areas and you can leave that argument unset.

Where to get ``category`` from?
-------------------------------

You can additionally provide a ``category`` when initializing any of the
subclasses. To get a list of all the categories supported by a specific
subclass, use the ``.show_categories()`` class method:

.. code:: python

    >>> from craigslist import CraigslistServices
    >>> CraigslistServices.show_categories()

    CraigslistServices categories:
    * aos = automotive services
    * bts = beauty services
    * cms = cell phone / mobile services
    * cps = computer services
    * crs = creative services
    * cys = cycle services
    * evs = event services
    * fgs = farm & garden services
    * fns = financial services
    * hws = health/wellness services
    * hss = household services
    * lbs = labor / hauling / moving
    * lgs = legal services
    * lss = lessons & tutoring
    * mas = marine services
    * pas = pet services
    * rts = real estate services
    * sks = skilled trade services
    * biz = small biz ads
    * trv = travel/vacation services
    * wet = writing / editing / translation

Is there a limit for the number of results?
-------------------------------------------

Yes, Craigslist caps the results for any search at 3000.
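
Putting it all together
-----------------------

A minimal end-to-end sketch combining ``site``, ``area``, ``category`` and
``filters`` (all values below are placeholders; pick your own as described in
the previous sections):

.. code:: python

    from craigslist import CraigslistForSale

    cl_fs = CraigslistForSale(site='sfbay', area='sfc', category='cta',
                              filters={'min_price': 2000, 'max_price': 10000,
                                       'auto_transmission': 'manual'})

    # 'limit' bounds how many results are fetched; Craigslist itself caps
    # any search at 3000 results.
    for result in cl_fs.get_results(sort_by='price_asc', limit=20):
        print(result['name'], result['price'])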

Support
-------

If you find any bug or want to propose a new feature, please use the `issues tracker <https://github.com/juliomalegria/python-craigslist/issues>`__. I'll be happy to help you! :-)

--------------------------------------------------------------------------------
/REQUIREMENTS.txt:
--------------------------------------------------------------------------------
requests>=2.25.0
urllib3>=1.26.0
beautifulsoup4>=4.9.0
six

--------------------------------------------------------------------------------
/VERSION.txt:
--------------------------------------------------------------------------------
1.1.4

--------------------------------------------------------------------------------
/craigslist/__init__.py:
--------------------------------------------------------------------------------
from .craigslist import (
    CraigslistCommunity, CraigslistEvents, CraigslistForSale, CraigslistGigs,
    CraigslistHousing, CraigslistJobs, CraigslistResumes, CraigslistServices)

__all__ = [
    'CraigslistCommunity', 'CraigslistEvents', 'CraigslistForSale',
    'CraigslistGigs', 'CraigslistHousing', 'CraigslistJobs',
    'CraigslistResumes', 'CraigslistServices']

--------------------------------------------------------------------------------
/craigslist/base.py:
--------------------------------------------------------------------------------
import logging
try:
    from Queue import Queue  # PY2
except ImportError:
    from queue import Queue  # PY3
from threading import Thread
try:
    from urlparse import urljoin  # PY2
except ImportError:
    from urllib.parse import urljoin  # PY3

from six import iteritems
from six.moves import range

from . import utils

ALL_SITES = utils.get_all_sites()  # All the Craigslist sites
RESULTS_PER_REQUEST = 100  # Craigslist returns 100 results per request


class CraigslistBase(object):
    """ Base class for all Craigslist wrappers. """

    url_templates = {
        'base': 'http://%(site)s.craigslist.org',
        'no_area': 'http://%(site)s.craigslist.org/search/%(category)s',
        'area': 'http://%(site)s.craigslist.org/search/%(area)s/%(category)s'
    }

    default_site = 'sfbay'
    default_category = None

    base_filters = {
        'query': {'url_key': 'query', 'value': None},
        'search_titles': {'url_key': 'srchType', 'value': 'T'},
        'has_image': {'url_key': 'hasPic', 'value': 1},
        'posted_today': {'url_key': 'postedToday', 'value': 1},
        'bundle_duplicates': {'url_key': 'bundleDuplicates', 'value': 1},
        'search_distance': {'url_key': 'search_distance', 'value': None},
        'zip_code': {'url_key': 'postal', 'value': None},
    }
    extra_filters = {}
    __list_filters = {}  # Cache for list filters, keyed by URL

    # Set to True if the subclass defines a customize_result() method.
    custom_result_fields = False

    sort_by_options = {
        'newest': 'date',
        'price_asc': 'priceasc',
        'price_desc': 'pricedsc',
    }

    def __init__(self, site=None, area=None, category=None, filters=None,
                 log_level=logging.WARNING):
        # Logging
        self.set_logger(log_level, init=True)

        self.site = site or self.default_site
        if self.site not in ALL_SITES:
            msg = "'%s' is not a valid site" % self.site
            self.logger.error(msg)
            raise ValueError(msg)

        if area:
            if not self.is_valid_area(area):
                msg = ("'%s' is not a valid area for site '%s'"
                       % (area, self.site))
                self.logger.error(msg)
                raise ValueError(msg)
        self.area = area

        self.category = category or self.default_category

        url_template = self.url_templates['area' if area else 'no_area']
        self.url = url_template % {'site': self.site, 'area': self.area,
                                   'category': self.category}

        self.filters = self.get_filters(filters)

    def get_filters(self, filters):
        """Parses filters passed by the user into GET parameters."""

        list_filters = self.get_list_filters(self.url)

        # If a search has few results, results for "similar listings" will
        # be included. The solution is a bit counter-intuitive: to force
        # this not to happen, we set searchNearby=1 but don't pass any
        # nearbyArea=X, which prevents similar listings from showing up.
        parsed_filters = {'searchNearby': 1}

        for key, value in iteritems(filters or {}):
            try:
                filter_ = (self.base_filters.get(key) or
                           self.extra_filters.get(key) or
                           list_filters[key])
                if filter_['value'] is None:
                    parsed_filters[filter_['url_key']] = value
                elif isinstance(filter_['value'], dict):
                    valid_options = filter_['value']
                    if not utils.isiterable(value) or isinstance(value, str):
                        value = [value]  # Force into a list
                    options = []
                    for opt in value:
                        try:
                            options.append(valid_options[opt])
                        except KeyError:
                            self.logger.warning(
                                "'%s' is not a valid option for %s"
                                % (opt, key)
                            )
                    parsed_filters[filter_['url_key']] = options
                elif value:  # Don't add the filter if ...=False
                    parsed_filters[filter_['url_key']] = filter_['value']
            except KeyError:
                self.logger.warning("'%s' is not a valid filter", key)

        return parsed_filters

    def set_logger(self, log_level, init=False):
        if init:
            self.logger = logging.getLogger('python-craigslist')
            self.handler = logging.StreamHandler()
            self.logger.addHandler(self.handler)
        self.logger.setLevel(log_level)
        self.handler.setLevel(log_level)

    def is_valid_area(self, area):
        base_url = self.url_templates['base']
        response = utils.requests_get(base_url % {'site': self.site},
                                      logger=self.logger)
        soup = utils.bs(response.content)
        sublinks = soup.find('ul', {'class': 'sublinks'})
        return sublinks and sublinks.find('a', text=area) is not None

    def get_results_approx_count(self, soup=None):
        """
        Gets the approximate number of results to be returned by
        `get_results`.

        Note that this number may not exactly match the actual number of
        results returned (although in my tests it's usually within +/-10).
        Also note that this makes an extra request to Craigslist (unless
        `soup` is provided).
        """

        if soup is None:
            response = utils.requests_get(self.url, params=self.filters,
                                          logger=self.logger)
            self.logger.info('GET %s', response.url)
            self.logger.info('Response code: %s', response.status_code)
            response.raise_for_status()  # Something failed?
            soup = utils.bs(response.content)

        totalcount = soup.find('span', {'class': 'totalcount'})
        return int(totalcount.text) if totalcount else None
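
    # How pagination works: get_results() below re-submits the search with
    # the 's' parameter set to 0, 100, 200, ... (RESULTS_PER_REQUEST is
    # 100), and stops as soon as a page comes back short or `limit` results
    # have been yielded.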
    def get_results(self, limit=None, start=0, sort_by=None, geotagged=False,
                    include_details=False):
        """
        Gets results from Craigslist based on the specified filters.

        If geotagged=True, the results will include the (lat, lng) in the
        'geotag' attrib (this will make the process a little bit longer).
        """

        if sort_by:
            try:
                self.filters['sort'] = self.sort_by_options[sort_by]
            except KeyError:
                msg = ("'%s' is not a valid sort_by option, "
                       "use: 'newest', 'price_asc' or 'price_desc'" % sort_by)
                self.logger.error(msg)
                raise ValueError(msg)

        total_so_far = start
        results_yielded = 0
        total = 0

        while True:
            self.filters['s'] = start
            response = utils.requests_get(self.url, params=self.filters,
                                          logger=self.logger)
            self.logger.info('GET %s', response.url)
            self.logger.info('Response code: %s', response.status_code)
            response.raise_for_status()  # Something failed?

            soup = utils.bs(response.content)
            if not total:
                total = self.get_results_approx_count(soup=soup)

            rows = soup.find('ul', {'class': 'rows'})
            for row in rows.find_all('li', {'class': 'result-row'},
                                     recursive=False):
                if limit is not None and results_yielded >= limit:
                    break
                self.logger.debug('Processing %s of %s results ...',
                                  total_so_far + 1, total or '(undefined)')

                yield self.process_row(row, geotagged, include_details)

                results_yielded += 1
                total_so_far += 1

            if results_yielded == limit:
                break
            if (total_so_far - start) < RESULTS_PER_REQUEST:
                break
            start = total_so_far

    def process_row(self, row, geotagged=False, include_details=False):
        id = row.attrs['data-pid']
        repost_of = row.attrs.get('data-repost-of')

        link = row.find('a', {'class': 'hdrlnk'})
        name = link.text
        url = urljoin(self.url, link.attrs['href'])

        time = row.find('time')
        if time:
            datetime = time.attrs['datetime']
        else:
            pl = row.find('span', {'class': 'pl'})
            datetime = pl.text.split(':')[0].strip() if pl else None
        price = row.find('span', {'class': 'result-price'})
        where = row.find('span', {'class': 'result-hood'})
        if where:
            where = where.text.strip()[1:-1]  # Remove the surrounding ()
        tags_span = row.find('span', {'class': 'result-tags'})
        tags = tags_span.text if tags_span else ''

        result = {'id': id,
                  'repost_of': repost_of,
                  'name': name,
                  'url': url,
                  # NOTE: 'datetime' is kept for backwards compatibility;
                  # use 'last_updated' instead.
                  'datetime': datetime,
                  'last_updated': datetime,
                  'price': price.text if price else None,
                  'where': where,
                  'has_image': 'pic' in tags,
                  'geotag': None,
                  # In very few cases, a posting will be included in the
                  # result list but has already been deleted (or is deleted
                  # after the list was retrieved). In that case, this field
                  # is set to True. If you want to be extra careful, always
                  # check that this field is False before using a result.
                  'deleted': False}

        if geotagged or include_details:
            detail_soup = self.fetch_content(result['url'])
            if detail_soup:
                if geotagged:
                    self.geotag_result(result, detail_soup)
                if include_details:
                    self.include_details(result, detail_soup)

        if self.custom_result_fields:
            self.customize_result(result)

        return result

    def customize_result(self, result):
        """ Adds, deletes or alters fields in the result. """
        # Override in a subclass to add category-specific fields.
        # FYI: `attrs` will only be present if include_details was True.
        pass
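
    # A minimal override sketch (hypothetical subclass; the category code
    # and the 'condition: ' attribute prefix are assumptions, for
    # illustration only):
    #
    #     class CraigslistAntiques(CraigslistBase):
    #         default_category = 'ata'  # hypothetical category code
    #         custom_result_fields = True
    #
    #         def customize_result(self, result):
    #             for attr in result.get('attrs', []):
    #                 if attr.lower().startswith('condition: '):
    #                     result['condition'] = attr[len('condition: '):]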

    def geotag_result(self, result, soup=None):
        """ Adds (lat, lng) to the result. """

        self.logger.debug('Geotagging result ...')

        # Fetch the detail page if a parsed soup wasn't provided (e.g. when
        # called from geotag_results()).
        if soup is None:
            soup = self.fetch_content(result['url'])
        if soup is None:
            return result

        map_ = soup.find('div', {'id': 'map'})
        if map_:
            result['geotag'] = (float(map_.attrs['data-latitude']),
                                float(map_.attrs['data-longitude']))

        return result

    def include_details(self, result, soup):
        """ Adds description and images to the result. """

        self.logger.debug('Adding details to result ...')

        body = soup.find('section', id='postingbody')

        if not body:
            # This should only happen when the posting has been deleted by
            # its author.
            result['deleted'] = True
            return

        # We need to massage the data a little bit because it might include
        # some inner elements that we want to ignore.
        body_text = (getattr(e, 'text', e) for e in body
                     if not getattr(e, 'attrs', None))
        result['body'] = ''.join(body_text).strip()

        # Add the created time (in case it differs from last_updated).
        postinginfos = soup.find('div', {'class': 'postinginfos'})
        for p in postinginfos.find_all('p'):
            if 'posted' in p.text:
                time = p.find('time')
                if time:
                    # This date is in ISO format. Remove the T literal and
                    # the timezone to make it the same format as
                    # 'last_updated'.
                    created = time.attrs['datetime'].replace('T', ' ')
                    result['created'] = created.rsplit(':', 1)[0]

        # Add the images' URLs.
        image_tags = soup.find_all('img')
        # If there's more than one picture, the first one will be repeated.
        image_tags = image_tags[1:] if len(image_tags) > 1 else image_tags
        images = []
        for img in image_tags:
            try:
                img_link = img['src'].replace('50x50c', '600x450')
                images.append(img_link)
            except KeyError:
                continue  # Some posts contain empty <img> tags.
        result['images'] = images

        # Add the list of attributes as unparsed strings. These values are
        # then processed by `parse_attrs`, and are available to be
        # post-processed by subclasses.
        attrgroups = soup.find_all('p', {'class': 'attrgroup'})
        attrs = []
        for attrgroup in attrgroups:
            for attr in attrgroup.find_all('span'):
                attr_text = attr.text.strip()
                if attr_text:
                    attrs.append(attr_text)
        result['attrs'] = attrs
        if attrs:
            self.parse_attrs(result)

        # If an address is included, add it to `address`.
        mapaddress = soup.find('div', {'class': 'mapaddress'})
        if mapaddress:
            result['address'] = mapaddress.text

    def parse_attrs(self, result):
        """Parses raw attributes into structured fields in the result dict."""

        # Parse binary fields first by checking their presence.
        attrs = set(attr.lower() for attr in result['attrs'])
        for key, options in iteritems(self.extra_filters):
            if options['value'] != 1:
                continue  # Filter is not binary
            if options.get('attr', '') in attrs:
                result[key] = True
        # Values from list filters are sometimes shown as {filter}: {value},
        # e.g. "transmission: automatic", although usually they are shown
        # with the {value} only, e.g. "laundry in bldg". By stripping the
        # content before the colon (if any) we reduce this to a single case.
        attrs_after_colon = set(
            attr.split(': ', 1)[-1] for attr in result['attrs'])
        for key, options in iteritems(self.get_list_filters(self.url)):
            for option in options['value'].keys():
                if option in attrs_after_colon:
                    result[key] = option
                    break
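
    # For illustration: with CraigslistHousing, a posting whose attributes
    # include 'furnished' gets result['is_furnished'] = True through the
    # 'attr' key declared in extra_filters, while a for-sale posting with
    # the attribute 'transmission: automatic' gets
    # result['auto_transmission'] = 'automatic' through the list filters
    # (assuming 'auto_transmission' is the list-filter key on that site).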

    def fetch_content(self, url):
        response = utils.requests_get(url, logger=self.logger)
        self.logger.info('GET %s', response.url)
        self.logger.info('Response code: %s', response.status_code)

        if response.ok:
            return utils.bs(response.content)

        self.logger.warning("GET %s returned a not-OK response code: %s "
                            "(skipping)", url, response.status_code)
        return None

    def geotag_results(self, results, workers=8):
        """
        Adds (lat, lng) to each result. This process is done using N
        threads, where N is the number of workers defined (default: 8).
        """

        results = list(results)
        queue = Queue()

        for result in results:
            queue.put(result)

        def geotagger():
            while not queue.empty():
                self.logger.debug('%s results left to geotag ...',
                                  queue.qsize())
                self.geotag_result(queue.get())
                queue.task_done()

        threads = []
        for _ in range(workers):
            thread = Thread(target=geotagger)
            thread.start()
            threads.append(thread)

        for thread in threads:
            thread.join()
        return results

    @classmethod
    def get_list_filters(cls, url):
        if cls.__list_filters.get(url) is None:
            cls.__list_filters[url] = utils.get_list_filters(url)
        return cls.__list_filters[url]

    @classmethod
    def show_categories(cls):
        url = cls.url_templates['no_area'] % {
            'site': cls.default_site,
            'category': cls.default_category,
        }
        response = utils.requests_get(url)
        soup = utils.bs(response.content)

        cat_html = soup.find_all('input', {'class': 'catcheck multi_checkbox'})
        cat_ids = [html.get('data-abb') for html in cat_html]
        cat_html = soup.find_all('a', {'class': 'category'})
        cat_names = [html.contents[0] for html in cat_html]

        print('%s categories:' % cls.__name__)
        for cat_name, cat_id in sorted(zip(cat_names, cat_ids)):
            print('* %s = %s' % (cat_id, cat_name))

    @classmethod
    def show_filters(cls, category=None):
        print('Base filters:')
        for key, options in iteritems(cls.base_filters):
            value_as_str = '...' if options['value'] is None else 'True/False'
            print('* %s = %s' % (key, value_as_str))

        if category is None:
            print('\n%s filters:' % cls.__name__)
        else:
            print("\n%s filters with category '%s':" % (cls.__name__,
                                                        category))
        for key, options in iteritems(cls.extra_filters):
            value_as_str = '...' if options['value'] is None else 'True/False'
            print('* %s = %s' % (key, value_as_str))
        url = cls.url_templates['no_area'] % {
            'site': cls.default_site,
            'category': category or cls.default_category,
        }
        list_filters = cls.get_list_filters(url)
        for key, options in iteritems(list_filters):
            value_as_str = ', '.join(
                repr(opt) for opt in options['value'].keys())
            print('* %s = %s' % (key, value_as_str))

--------------------------------------------------------------------------------
/craigslist/craigslist.py:
--------------------------------------------------------------------------------
from .base import CraigslistBase


class CraigslistCommunity(CraigslistBase):
    """ Craigslist community wrapper. """

    default_category = 'ccc'


class CraigslistEvents(CraigslistBase):
    """ Craigslist events wrapper. """

    default_category = 'eee'
    custom_result_fields = True

    extra_filters = {
        # art/film
        'art': {'url_key': 'event_art', 'value': 1, 'attr': 'art/film'},
        'film': {'url_key': 'event_art', 'value': 1, 'attr': 'art/film'},
        # career
        'career': {'url_key': 'event_career', 'value': 1, 'attr': 'career'},
        # charitable
        'charitable': {
            'url_key': 'event_fundraiser_vol', 'value': 1,
            'attr': 'charitable'},
        'fundraiser': {
            'url_key': 'event_fundraiser_vol', 'value': 1,
            'attr': 'charitable'},
        # competition
        'athletics': {
            'url_key': 'event_athletics', 'value': 1, 'attr': 'competition'},
        'competition': {
            'url_key': 'event_athletics', 'value': 1, 'attr': 'competition'},
        # dance
        'dance': {'url_key': 'event_dance', 'value': 1, 'attr': 'dance'},
        # fest/fair
        'festival': {
            'url_key': 'event_festival', 'value': 1, 'attr': 'fest/fair'},
        'fair': {'url_key': 'event_festival', 'value': 1, 'attr': 'fest/fair'},
        # fitness/health
        'fitness': {
            'url_key': 'event_fitness_wellness', 'value': 1,
            'attr': 'fitness/health'},
        'health': {
            'url_key': 'event_fitness_wellness', 'value': 1,
            'attr': 'fitness/health'},
        # food/drink
        'food': {'url_key': 'event_food', 'value': 1, 'attr': 'food/drink'},
        'drink': {'url_key': 'event_food', 'value': 1, 'attr': 'food/drink'},
        # free
        'free': {'url_key': 'event_free', 'value': 1, 'attr': 'free'},
        # kid friendly
        'kid_friendly': {
            'url_key': 'event_kidfriendly', 'value': 1,
            'attr': 'kid friendly'},
        # literary
        'literary': {
            'url_key': 'event_literary', 'value': 1, 'attr': 'literary'},
        # music
        'music': {'url_key': 'event_music', 'value': 1, 'attr': 'music'},
        # outdoor
        'outdoor': {'url_key': 'event_outdoor', 'value': 1, 'attr': 'outdoor'},
        # sale
        'sale': {'url_key': 'event_sale', 'value': 1, 'attr': 'sale'},
        # singles
        'singles': {'url_key': 'event_singles', 'value': 1, 'attr': 'singles'},
        # tech
        'tech': {'url_key': 'event_geek', 'value': 1, 'attr': 'tech'},
    }

    def customize_result(self, result):
        for attr in result.get('attrs', []):
            # Get the venue.
            if attr.lower().startswith('venue: '):
                result['venue'] = attr[7:]


class CraigslistForSale(CraigslistBase):
    """ Craigslist for sale wrapper. """

    default_category = 'sss'
    custom_result_fields = True

    extra_filters = {
        # price
        'min_price': {'url_key': 'min_price', 'value': None},
        'max_price': {'url_key': 'max_price', 'value': None},
        # make and model
        'make': {'url_key': 'auto_make_model', 'value': None},
        'model': {'url_key': 'auto_make_model', 'value': None},
        # model year
        'min_year': {'url_key': 'min_auto_year', 'value': None},
        'max_year': {'url_key': 'max_auto_year', 'value': None},
        # odometer
        'min_miles': {'url_key': 'min_auto_miles', 'value': None},
        'max_miles': {'url_key': 'max_auto_miles', 'value': None},
        # engine displacement (cc)
        'min_engine_displacement': {
            'url_key': 'min_engine_displacement_cc', 'value': None},
        'max_engine_displacement': {
            'url_key': 'max_engine_displacement_cc', 'value': None},
    }

    def customize_result(self, result):
        for attr in result.get('attrs', []):
            attr_lower = attr.lower()
            # Get the miles.
            if attr_lower.startswith('odometer: '):
                result['miles'] = attr[10:]
            # Get the engine displacement.
            if attr_lower.startswith('engine displacement (cc): '):
                result['engine_displacement'] = attr[26:]


class CraigslistGigs(CraigslistBase):
    """ Craigslist gigs wrapper. """

    default_category = 'ggg'
    custom_result_fields = True

    extra_filters = {
        # paid/unpaid
        'is_paid': {'url_key': 'is_paid', 'value': None},
    }

    def __init__(self, *args, **kwargs):
        try:
            is_paid = kwargs['filters']['is_paid']
            kwargs['filters']['is_paid'] = 'yes' if is_paid else 'no'
        except KeyError:
            pass
        super(CraigslistGigs, self).__init__(*args, **kwargs)

    def customize_result(self, result):
        for attr in result.get('attrs', []):
            # Get the compensation.
            if attr.lower().startswith('compensation: '):
                result['compensation'] = attr[14:]
        result['is_paid'] = 'compensation' in result


class CraigslistHousing(CraigslistBase):
    """ Craigslist housing wrapper. """

    default_category = 'hhh'
    custom_result_fields = True

    extra_filters = {
        # price
        'min_price': {'url_key': 'min_price', 'value': None},
        'max_price': {'url_key': 'max_price', 'value': None},
        # bedrooms
        'min_bedrooms': {'url_key': 'min_bedrooms', 'value': None},
        'max_bedrooms': {'url_key': 'max_bedrooms', 'value': None},
        # bathrooms
        'min_bathrooms': {'url_key': 'min_bathrooms', 'value': None},
        'max_bathrooms': {'url_key': 'max_bathrooms', 'value': None},
        # ft2
        'min_ft2': {'url_key': 'minSqft', 'value': None},
        'max_ft2': {'url_key': 'maxSqft', 'value': None},
        # private room
        'private_room': {
            'url_key': 'private_room', 'value': 1, 'attr': 'private room'},
        # private bath
        'private_bath': {
            'url_key': 'private_bath', 'value': 1, 'attr': 'private bath'},
        # cats ok
        'cats_ok': {
            'url_key': 'pets_cat', 'value': 1, 'attr': 'cats are ok - purrr'},
        # dogs ok
        'dogs_ok': {
            'url_key': 'pets_dog', 'value': 1, 'attr': 'dogs are ok - wooof'},
        # furnished
        'is_furnished': {
            'url_key': 'is_furnished', 'value': 1, 'attr': 'furnished'},
        # no smoking
        'no_smoking': {
            'url_key': 'no_smoking', 'value': 1, 'attr': 'no smoking'},
        # wheelchair access
        'wheelchair_acccess': {
            'url_key': 'wheelchaccess', 'value': 1,
            'attr': 'wheelchair accessible'},
        # EV charging
        'ev_charging': {
            'url_key': 'ev_charging', 'value': 1, 'attr': 'ev charging'},
        # no application fee
        'no_application_fee': {'url_key': 'application_fee', 'value': 1},
        # no broker fee
        'no_broker_fee': {'url_key': 'broker_fee', 'value': 1},
    }

    def customize_result(self, result):
        for attr in result.get('attrs', []):
            attr_lower = attr.lower()
            # Get the bedrooms and bathrooms.
            if attr_lower.endswith('br') or attr_lower.endswith('ba'):
                for elem in attr_lower.split(' / '):
                    if elem.endswith('br'):
                        # Don't convert to int, too risky
                        result['bedrooms'] = elem[:-2]
                    elif elem.endswith('ba'):
                        # Don't convert to int, too risky
                        result['bathrooms'] = elem[:-2]
            # Get the area.
            elif attr_lower.endswith('ft2') or attr_lower.endswith('m2'):
                result['area'] = attr_lower
            # Get the availability.
            elif attr_lower.startswith('available '):
                result['available'] = attr[10:]


class CraigslistJobs(CraigslistBase):
    """ Craigslist jobs wrapper. """

    default_category = 'jjj'
    custom_result_fields = True

    extra_filters = {
        # internship
        'is_internship': {
            'url_key': 'is_internship', 'value': 1, 'attr': 'internship'},
        # non-profit
        'is_nonprofit': {
            'url_key': 'is_nonprofit', 'value': 1,
            'attr': 'non-profit organization'},
        # telecommute
        'is_telecommuting': {
            'url_key': 'is_telecommuting', 'value': 1,
            'attr': 'telecommuting okay'},
    }

    def customize_result(self, result):
        for attr in result.get('attrs', []):
            # Get the compensation.
            if attr.lower().startswith('compensation: '):
                result['compensation'] = attr[14:]


class CraigslistResumes(CraigslistBase):
    """ Craigslist resumes wrapper. """

    default_category = 'rrr'

    extra_filters = {
        # TODO: Please create an issue or PR if interested in this category.
    }


class CraigslistServices(CraigslistBase):
    """ Craigslist services wrapper. """

    default_category = 'bbb'

--------------------------------------------------------------------------------
/craigslist/utils.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException

ALL_SITES_URL = 'http://www.craigslist.org/about/sites'
SITE_URL = 'http://%s.craigslist.org'
USER_AGENT = 'Mozilla/5.0'


def bs(content):
    return BeautifulSoup(content, 'html.parser')


def isiterable(var):
    try:
        return iter(var) and True
    except TypeError:
        return False


def requests_get(*args, **kwargs):
    """
    Retries once if a RequestException is raised (which could be a
    connection error or a timeout).
    """

    logger = kwargs.pop('logger', None)
    # Set a default User-Agent header if none is defined.
    kwargs.setdefault('headers', {}).setdefault('User-Agent', USER_AGENT)

    try:
        return requests.get(*args, **kwargs)
    except RequestException as exc:
        if logger:
            logger.warning('Request failed (%s). Retrying ...', exc)
        return requests.get(*args, **kwargs)


def get_all_sites():
    response = requests.get(ALL_SITES_URL)
    response.raise_for_status()  # Something failed?
    soup = BeautifulSoup(response.content, 'html.parser')
    sites = set()

    for box in soup.findAll('div', {'class': 'box'}):
        for a in box.findAll('a'):
            # Remove the protocol and get the subdomain.
            site = a.attrs['href'].rsplit('//', 1)[1].split('.')[0]
            sites.add(site)

    return sites


def get_all_areas(site):
    response = requests.get(SITE_URL % site)
    response.raise_for_status()  # Something failed?
    soup = BeautifulSoup(response.content, 'html.parser')
    raw = soup.select('ul.sublinks li a')
    areas = set(a.attrs['href'].rsplit('/')[1] for a in raw)
    return areas
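

# Illustrative shape of the value returned by get_list_filters() below (the
# option names and form values are made up; real ones vary by site and
# category):
#
#     {'auto_transmission': {'url_key': 'auto_transmission',
#                            'value': {'manual': '1', 'automatic': '2'}}}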

def get_list_filters(url):
    list_filters = {}
    response = requests_get(url)
    soup = bs(response.content)
    for list_filter in soup.find_all('div', class_='search-attribute'):
        filter_key = list_filter.attrs['data-attr']
        filter_labels = list_filter.find_all('label')
        options = {opt.text.strip(): opt.find('input').get('value')
                   for opt in filter_labels}
        list_filters[filter_key] = {'url_key': filter_key, 'value': options}
    return list_filters

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
description-file = README.rst

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

with open('VERSION.txt', 'r') as v:
    version = v.read().strip()

with open('REQUIREMENTS.txt', 'r') as r:
    requires = r.read().split()

with open('README.rst', 'r') as r:
    readme = r.read()

download_url = (
    'https://github.com/juliomalegria/python-craigslist/tarball/%s'
)


setup(
    name='python-craigslist',
    packages=['craigslist'],
    version=version,
    description='Simple Craigslist wrapper.',
    long_description=readme,
    author='Julio M Alegria',
    author_email='juliomalegria@gmail.com',
    url='https://github.com/juliomalegria/python-craigslist',
    download_url=download_url % version,
    install_requires=requires,
    license='MIT-Zero'
)
--------------------------------------------------------------------------------