├── __init__.py
├── setup.cfg
├── mappings.json
├── tests
    ├── __init__.py
    └── test_scrapy_model.py
├── requirements.txt
├── MANIFEST.in
├── .travis.yml
├── .gitignore
├── Makefile
├── LICENSE
├── setup.py
├── example.py
├── scrapy_model.py
└── README.md


/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [wheel]
2 | universal = 1


--------------------------------------------------------------------------------
/mappings.json:
--------------------------------------------------------------------------------
1 | {"test": {"css": "div"}}
2 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flake8
2 | wheel==0.23.0
3 | cffi==0.8.2
4 | Scrapy==0.22.2
5 | redis==2.8.0
6 | requests==2.20.0
7 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include Makefile
3 | include README.md
4 | include mappings.json
5 | recursive-include tests *


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "2.7"
 4 | # command to install dependencies
 5 | install:
 6 |   - "pip install -r requirements.txt"
 7 | # command to run tests
 8 | script:
 9 |   - make lint
10 |   - make test
11 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.py[cod]
 2 | 
 3 | # C extensions
 4 | *.so
 5 | 
 6 | # Packages
 7 | *.egg
 8 | *.egg-info
 9 | dist
10 | build
11 | eggs
12 | parts
13 | bin
14 | var
15 | sdist
16 | develop-eggs
17 | .installed.cfg
18 | lib
19 | lib64
20 | 
21 | # Installer logs
22 | pip-log.txt
23 | 
24 | # Unit test / coverage reports
25 | .coverage
26 | .tox
27 | nosetests.xml
28 | htmlcov
29 | 
30 | # Translations
31 | *.mo
32 | 
33 | # Mr Developer
34 | .mr.developer.cfg
35 | .project
36 | .pydevproject
37 | 
38 | # Complexity
39 | output/*.html
40 | output/*/index.html
41 | 
42 | # Sphinx
43 | docs/_build
44 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: clean-pyc clean-build docs clean
 2 | 
 3 | help:
 4 | 	@echo "clean-build - remove build artifacts"
 5 | 	@echo "clean-pyc - remove Python file artifacts"
 6 | 	@echo "lint - check style with flake8"
 7 | 	@echo "test - run tests quickly with the default Python"
 8 | 	@echo "test-all - run tests on every Python version with tox"
 9 | 	@echo "coverage - check code coverage quickly with the default Python"
10 | 	@echo "docs - generate Sphinx HTML documentation, including API docs"
11 | 	@echo "release - package and upload a release"
12 | 	@echo "dist - package"
13 | 
14 | clean: clean-build clean-pyc
15 | 	rm -fr htmlcov/
16 | 
17 | clean-build:
18 | 	rm -fr build/
19 | 	rm -fr dist/
20 | 	rm -fr *.egg-info
21 | 
22 | clean-pyc:
23 | 	find . -name '*.pyc' -exec rm -f {} +
24 | 	find . -name '*.pyo' -exec rm -f {} +
25 | 	find . -name '*~' -exec rm -f {} +
26 | 
27 | lint:
28 | 	flake8 scrapy_model.py tests
29 | 
30 | test:
31 | 	python setup.py test
32 | 
33 | coverage:
34 | 	coverage run --source scrapy_model.py setup.py test
35 | 	coverage report -m
36 | 	coverage html
37 | 	open htmlcov/index.html
38 | 
39 | release: clean
40 | 	python setup.py sdist upload
41 | 	python setup.py bdist_wheel upload
42 | 
43 | dist: clean
44 | 	python setup.py sdist
45 | 	python setup.py bdist_wheel
46 | 	ls -l dist
47 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2014, Bruno Rocha
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 5 | 
 6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 7 | 
 8 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 9 | 
10 | * Neither the name of Scrapy Model nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
11 | 
12 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | try:
 5 |     from setuptools import setup
 6 | except ImportError:
 7 |     from distutils.core import setup
 8 | 
 9 | 
10 | readme = open('README.md').read()
11 | 
12 | requirements = [
13 |     'wheel==0.23.0',
14 |     'cffi==0.8.2',
15 |     'Scrapy==0.22.2',
16 |     'redis==2.8.0',
17 |     'requests==2.20.0',
18 | ]
19 | 
20 | test_requirements = [
21 | ]
22 | 
23 | setup(
24 |     name='scrapy_model',
25 |     version='0.1.6',
26 |     description='Scrapy helper to create scrapers from models',
27 |     long_description=readme,
28 |     author='Bruno Rocha',
29 |     author_email='rochacbruno@gmail.com',
30 |     url='https://github.com/rochacbruno/scrapy_model',
31 |     py_modules=['scrapy_model'],
32 |     include_package_data=True,
33 |     install_requires=requirements,
34 |     license="BSD",
35 |     zip_safe=False,
36 |     keywords='scrapy_model',
37 |     classifiers=[
38 |         'Development Status :: 2 - Pre-Alpha',
39 |         'Intended Audience :: Developers',
40 |         'License :: OSI Approved :: BSD License',
41 |         'Natural Language :: English',
42 |         "Programming Language :: Python :: 2",
43 |         'Programming Language :: Python :: 2.6',
44 |         'Programming Language :: Python :: 2.7',
45 |         'Programming Language :: Python :: 3',
46 |         'Programming Language :: Python :: 3.3',
47 |     ],
48 |     test_suite='tests',
49 |     tests_require=test_requirements
50 | )
51 | 


--------------------------------------------------------------------------------
/tests/test_scrapy_model.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """
 5 | test_scrapy_model
 6 | ----------------------------------
 7 | 
 8 | Tests for `scrapy_model` module.
 9 | """
10 | 
11 | import unittest
12 | 
13 | from example import TestFetcher, DummyModel
14 | 
15 | 
16 | class TestScrapy_model(unittest.TestCase):
17 | 
18 |     def setUp(self):
19 |         fetcher = TestFetcher(cache_fetch=True)
20 |         fetcher.url = "http://en.m.wikipedia.org/wiki/Guido_van_Rossum"
21 | 
22 |         fetcher.mappings['name'] = {
23 |             "css": ("#section_0::text")
24 |         }
25 | 
26 |         fetcher.parse()
27 |         self.fetcher = fetcher
28 |         self.model = DummyModel()
29 | 
30 |     def test_fetched_correct_name(self):
31 |         self.assertEquals(self.fetcher.name.value, u'Guido van Rossum')
32 | 
33 |     def test_name_in_data_is_the_same_in_fields(self):
34 |         self.assertEquals(self.fetcher.name.value, self.fetcher._data.name)
35 | 
36 |     def test_load_mappings_from_json_string(self):
37 |         js = '{"test": {"css": "div"}}'
38 |         self.fetcher.load_mappings_from_file(js)
39 |         self.assertEquals(self.fetcher.mappings["test"], {"css": "div"})
40 | 
41 |     def test_load_mappings_from_json_path(self):
42 |         self.fetcher.load_mappings_from_file('mappings.json')
43 |         self.assertEquals(self.fetcher.mappings["test"], {"css": "div"})
44 | 
45 |     def test_load_mappings_from_json_file(self):
46 |         with open('mappings.json') as jsonfile:
47 |             self.fetcher.load_mappings_from_file(jsonfile)
48 |             self.assertEquals(
49 |                 self.fetcher.mappings["test"], {"css": "div"}
50 |             )
51 | 
52 | if __name__ == '__main__':
53 |     unittest.main()
54 | 


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
 1 | # coding: utf-8
 2 | 
 3 | from scrapy_model import BaseFetcherModel, CSSField, XPathField, RedisCache
 4 | 
 5 | 
 6 | class TestFetcher(BaseFetcherModel):
 7 |     photo_url = XPathField('//*[@id="content"]/div[1]/table/tr[2]/td/a')
 8 | 
 9 |     nationality = CSSField(
10 |         '#content > div:nth-child(1) > table > tr:nth-child(4) > td > a::text',
11 |         takes_first=True,
12 |         processor=lambda value: value.upper()  # it could be a list of funcs
13 |     )
14 | 
15 |     links = CSSField(
16 |         '#content > div:nth-child(11) > ul > li > a.external::attr(href)',
17 |         auto_extract=True
18 |     )
19 | 
20 |     def parse_photo_url(self, selector):
21 |         return "http://en.m.wikipedia.org/{}".format(
22 |             selector.xpath("@href").extract()[0]
23 |         )
24 | 
25 |     def parse_name(self, selector):
26 |         return selector.extract()[0]
27 | 
28 |     def post_parse(self):
29 |         # executed after all parsers
30 |         # you can load any data on to self._data
31 |         # access self._data and self._fields for current data
32 |         # self.selector contains original page
33 |         # self.fetch() returns original html
34 |         self._data.url = self.url
35 | 
36 | 
37 | class DummyModel(object):
38 |     """
39 |     For tests only, it can be a model in your database ORM
40 |     """
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     from pprint import pprint
45 | 
46 |     fetcher = TestFetcher(cache_fetch=True,
47 |                           cache=RedisCache,
48 |                           cache_expire=1800)
49 | 
50 |     fetcher.url = "http://en.m.wikipedia.org/wiki/Guido_van_Rossum"
51 | 
52 |     # Mappings can be loaded from a json file
53 |     # fetcher.load_mappings_from_file('path/to/file')
54 |     fetcher.mappings['name'] = {
55 |         "css": ("#section_0::text")
56 |     }
57 | 
58 |     fetcher.parse()
59 | 
60 |     print "Fetcher holds the data"
61 |     print fetcher._data.name
62 |     pprint(fetcher._data)
63 | 
64 |     # How to populate an object
65 |     print "Populating an object"
66 |     dummy = DummyModel()
67 | 
68 |     fetcher.populate(dummy, fields=["name", "nationality"])
69 |     # fields attr is optional
70 |     print dummy.nationality
71 |     pprint(dummy.__dict__)
72 | 


--------------------------------------------------------------------------------
/scrapy_model.py:
--------------------------------------------------------------------------------
  1 | # coding: utf-8
  2 | 
  3 | __all__ = ['BaseFetcherModel', 'CSSField', 'XPathField', 'RedisCache']
  4 | 
  5 | import json
  6 | import logging
  7 | import requests
  8 | from collections import Sequence
  9 | from redis import Redis
 10 | from redis.exceptions import ConnectionError
 11 | from scrapy.selector import Selector
 12 | 
 13 | logger = logging.getLogger(__name__)
 14 | 
 15 | 
 16 | class NoCache(object):
 17 |     def __init__(self, *args, **kwargs):
 18 |         pass
 19 | 
 20 |     def get(self, key):
 21 |         return None
 22 | 
 23 |     def set(self, key, value, expire=None):
 24 |         pass
 25 | 
 26 | 
 27 | class RedisCache(object):
 28 |     def __init__(self, *args, **kwargs):
 29 |         self.cache = Redis(*args, **kwargs)
 30 | 
 31 |     def get(self, key):
 32 |         try:
 33 |             return self.cache.get(key)
 34 |         except ConnectionError as e:
 35 |             logger.error("Cant connect to Redis server %s", e)
 36 |             return None
 37 | 
 38 |     def set(self, key, value, expire=None):
 39 |         try:
 40 |             self.cache.set(key, value, expire)
 41 |         except ConnectionError as e:
 42 |             logger.error("Cant connect to Redis server %s", e)
 43 | 
 44 | 
 45 | class Storage(dict):
 46 |     """
 47 |     A dict that accepts [keys] or .attributes
 48 |     >>> obj = Storage()
 49 |     >>> obj["name"] = "Bruno"
 50 |     >>> obj.company = "ACME"
 51 |     >>> obj.name == obj["name]
 52 |     >>> obj["company] == obj.company
 53 |     """
 54 | 
 55 |     def __getattr__(self, attr):
 56 |         return self.get(attr)
 57 | 
 58 |     def __setattr__(self, attr, value):
 59 |         self[attr] = value
 60 | 
 61 | 
 62 | class BaseField(object):
 63 |     """
 64 |     Base for other selector fields
 65 |     """
 66 | 
 67 |     def __init__(self,
 68 |                  query,
 69 |                  auto_extract=False,
 70 |                  takes_first=False,
 71 |                  processor=None,
 72 |                  query_validator=None,
 73 |                  default=None):
 74 |         self.query = [query] if isinstance(query, basestring) else query
 75 |         self.query_validator = query_validator or (lambda data: True)
 76 |         self.default = default
 77 |         self.auto_extract = auto_extract
 78 |         self.takes_first = takes_first
 79 |         self.processor = processor or (lambda untouched_data: untouched_data)
 80 |         self._data = self.selector = self._raw_data = None
 81 | 
 82 |     @property
 83 |     def value(self):
 84 |         return self._data
 85 | 
 86 |     def _parse(self, selector):
 87 |         parsed = self.parse(selector)
 88 |         if hasattr(parsed, 'extract'):
 89 |             extracted = parsed.extract()
 90 |             if self.takes_first and len(extracted) > 0:
 91 |                 for value in extracted:
 92 |                     if value is not None and value != '':
 93 |                         return self._processor(value)
 94 |             elif self.auto_extract:
 95 |                 return self._processor(extracted)
 96 |         return self._processor(parsed)
 97 | 
 98 |     def _processor(self, data):
 99 |         """
100 |         runs the processor if defined
101 |         processor can be a list of functions to be chained
102 |         or a single function
103 |         """
104 |         if isinstance(self.processor, Sequence):
105 |             for function in self.processor:
106 |                 data = function(data)
107 |         else:
108 |             data = self.processor(data)
109 |         return data
110 | 
111 |     def parse(self, selector):
112 |         raise NotImplementedError("Must be implemented in child class")
113 | 
114 |     def get_identifier(self):
115 |         return getattr(self, 'identifier', "")
116 | 
117 |     def __repr__(self):
118 |         return u"<{} - {} - {}>".format(
119 |             self.__class__.__name__, self.get_identifier(), self._data
120 |         )
121 | 
122 |     def __str__(self):
123 |         return unicode(self._data)
124 | 
125 |     def __unicode__(self):
126 |         return unicode(self._data)
127 | 
128 | 
129 | class GenericField(BaseField):
130 |     def __init__(self, identifier=None, value=None):
131 |         super(GenericField, self).__init__("")
132 |         self._data = value
133 |         self.identifier = identifier
134 | 
135 |     def parse(self, selector):
136 |         return None
137 | 
138 | 
139 | class CSSField(BaseField):
140 |     def parse(self, selector):
141 |         for query in self.query:
142 |             res = selector.css(query)
143 |             if len(res) and self.query_validator(res):
144 |                 return res
145 |         return self.default or selector.css("__empty_selector__")
146 | 
147 | 
148 | class XPathField(BaseField):
149 |     def parse(self, selector):
150 |         for query in self.query:
151 |             res = selector.xpath(query)
152 |             if len(res) and self.query_validator(res):
153 |                 return res
154 |         return self.default or selector.css("__empty_selector__")
155 | 
156 | 
157 | class BaseFetcherModel(object):
158 |     """
159 |     fields example:
160 |             name = CSSField("div.perfil > div > div.perf.col-md-12 >"
161 |                             " div.col-md-10.desc > h1::text")
162 |     mappings example:
163 |         mappings = {
164 |             'name': {'css': 'div#test'},
165 |             'phone': {'xpath': '//phone'},
166 |             'location': '.location'  # assumes css
167 |         }
168 | 
169 | 
170 |     Any method named parsed_<field_name> will run after the data is collected
171 |     """
172 | 
173 |     mappings = {}
174 | 
175 |     def __init__(self, url=None, mappings=None,
176 |                  cache_fetch=False,
177 |                  cache=NoCache,
178 |                  cache_args=None,
179 |                  cache_expire=None):
180 |         self.load_fields()
181 |         self.url = url
182 |         self.refresh = False
183 |         self._data = Storage()
184 |         self._selector = None
185 |         self.mappings = mappings or self.mappings.copy()
186 |         self.cache_fetch = cache_fetch
187 |         self.cache_expire = cache_expire
188 | 
189 |         if isinstance(cache, type):
190 |             self.cache = cache(**(cache_args or {}))
191 |         else:
192 |             self.cache = cache
193 | 
194 |     def load_fields(self):
195 |         self._fields = []
196 |         for name, field in self.__class__.__dict__.items():
197 |             if isinstance(field, BaseField):
198 |                 field.identifier = name
199 |                 self._fields.append(field)
200 | 
201 |     def fetch(self, url=None):
202 |         url = self.url or url
203 |         cached = self.cache.get(url)
204 |         if cached and self.cache_fetch:
205 |             return cached
206 |         response = requests.get(url)
207 |         if self.cache_fetch:
208 |             self.cache.set(url, response.content, expire=self.cache_expire)
209 |         return response.content
210 | 
211 |     @property
212 |     def selector(self):
213 |         if not self._selector or self.refresh:
214 |             self._selector = Selector(text=self.fetch())
215 |             self.refresh = False
216 |         return self._selector
217 | 
218 |     def pre_parse(self, selector=None):
219 |         """
220 |         To be implemented optionally in child classes
221 |         Example: in this method is possible to validade
222 |         if there is a parse_ writen for each field in a model
223 | 
224 |         class MyFetcherModel(BaseFetcherModel):
225 |             model_class = AModelFromAnyORM
226 | 
227 |             def pre_parse(self, selector=None):
228 |                 # considering model_class as Django or MongoEngine model
229 |                 model_fields = self.model_class._meta.field_names
230 |                 parse_methods = [
231 |                     k for k, v in self.__dict__.items()
232 |                     if k.startswith('parse_') and callable(v)
233 |                 ]
234 |                 for field_name in model_fields:
235 |                     if not field_name in parse_methods:
236 |                         raise Exception(
237 |                             "parse method for %s is mandatory!" % field_name
238 |                         )
239 | 
240 |         """
241 | 
242 |     def parse(self, selector=None):
243 |         """
244 |         The entry point
245 |         fetcher = Fetcher(url="http://...")
246 |         fetcher.parse()
247 |         """
248 | 
249 |         self.pre_parse(selector)
250 | 
251 |         selector = selector or self.selector
252 | 
253 |         for field in self._fields:
254 |             data = field._parse(selector)
255 |             self._data[field.identifier] = field._raw_data = data
256 | 
257 |         # mappings has always the priority
258 |         for field_name, query in self.mappings.items():
259 |             if isinstance(query, dict):
260 |                 method = query.keys()[0]
261 |                 path = query.values()[0]
262 |             else:
263 |                 method = 'css'
264 |                 path = query
265 |             self._data[field_name] = getattr(selector, method)(path)
266 | 
267 |         self.run_field_parsers()
268 | 
269 |         for field in self._fields:
270 |             field._data = field.selector = self._data.get(field.identifier)
271 | 
272 |         self.post_parse()
273 | 
274 |         self.load_generic_fields()
275 | 
276 |     def load_generic_fields(self):
277 |         for k, v in self._data.items():
278 |             if k not in self._fields:
279 |                 field = GenericField(k, v)
280 |                 self._fields.append(field)
281 |                 setattr(self, k, field)
282 | 
283 |     def post_parse(self):
284 |         """
285 |         To be implemented optionally in child classes
286 |         """
287 | 
288 |     def run_field_parsers(self):
289 |         self._raw_data = self._data.copy()
290 |         for field_name, raw_selector in self._data.items():
291 |             field_parser = getattr(self, 'parse_%s' % field_name, None)
292 |             if field_parser:
293 |                 try:
294 |                     parsed_data = field_parser(raw_selector)
295 |                 except Exception as e:
296 |                     logger.error(
297 |                         "Exception ocurred in parse_%s: %s", field_name, e
298 |                     )
299 |                     self._data[field_name] = raw_selector
300 |                 else:
301 |                     self._data[field_name] = parsed_data
302 | 
303 |     def populate(self, obj, fields=None):
304 |         fields = fields or self._data.keys()
305 |         for field in fields:
306 |             setattr(obj, field, self._data.get(field))
307 | 
308 |     def load_mappings_from_file(self, path_or_file):
309 |         """
310 |         Will take a JSON file object, string or path
311 |         and loads on to self.mappings
312 |          {
313 |             'name': {'css': 'div#test'},
314 |             'phone': {'xpath': '//phone'},
315 |             'location': '.location'  # assumes css
316 |          }
317 |         """
318 |         if isinstance(path_or_file, basestring):
319 |             try:
320 |                 data = open(path_or_file).read()
321 |             except IOError:
322 |                 data = path_or_file
323 |         elif isinstance(path_or_file, file) or hasattr(path_or_file, 'read'):
324 |             data = path_or_file.read()
325 | 
326 |         self.mappings.update(json.loads(data))
327 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Create scraper using Scrapy Selectors
  2 | ============================================
  3 | 
  4 | > NOTE: Please consider using this another project https://github.com/ssteuteville/scrapyz better maintained and documented. But if you still find scrapy_model useful welcome!
  5 | 
  6 | [![Build
  7 | Status](https://travis-ci.org/rochacbruno/scrapy_model.png)](https://travis-ci.org/rochacbruno/scrapy_model)
  8 | 
  9 | [![PyPi version](https://img.shields.io/pypi/v/scrapy_model.svg)](https://pypi.python.org/pypi/scrapy_model/)
 10 | [![PyPi downloads](https://img.shields.io/pypi/dm/scrapy_model.svg)](https://pypi.python.org/pypi/scrapy_model/)
 11 | 
 12 | 
 13 | ## What is Scrapy?
 14 | 
 15 | Scrapy is a fast high-level screen scraping and web crawling framework, used to crawl websites and extract structured data from their pages. It can be used for a wide range of purposes, from data mining to monitoring and automated testing.
 16 | 
 17 | http://scrapy.org/
 18 | 
 19 | 
 20 | ## What is scrapy_model ?
 21 | 
 22 | It is just a helper to create scrapers using the Scrapy Selectors allowing you to select elements by CSS or by XPATH and structuring your scraper via Models (just like an ORM model) and plugable to an ORM model via ``populate`` method.
 23 | 
 24 | Import the BaseFetcherModel, CSSField or XPathField (you can use both)
 25 | 
 26 | ```python
 27 | from scrapy_model import BaseFetcherModel, CSSField
 28 | ```
 29 | 
 30 | Go to a webpage you want to scrap and use chrome dev tools or firebug to figure out the css paths then considering you want to get the following fragment from some page.
 31 | 
 32 | ```html
 33 |     <span id="person">Bruno Rocha <a href="http://brunorocha.org">website</a></span>
 34 | ```
 35 | 
 36 | ```python
 37 | class MyFetcher(BaseFetcherModel):
 38 |     name = CSSField('span#person')
 39 |     website = CSSField('span#person a')
 40 |     # XPathField('//xpath_selector_here')
 41 | ```
 42 | 
 43 | Fields can receive ``auto_extract=True`` parameter which auto extracts values from selector before calling the parse or processors. Also you can pass the ``takes_first=True`` which will for auto_extract and also tries to get the first element of the result, because scrapy selectors returns a list of matched elements.
 44 | 
 45 | 
 46 | ### Multiple queries in a single field
 47 | 
 48 | You can use multiple queries for a single field
 49 | 
 50 | ```python
 51 | name = XPathField(
 52 |     ['//*[@id="8"]/div[2]/div/div[2]/div[2]/ul',
 53 |      '//*[@id="8"]/div[2]/div/div[3]/div[2]/ul']
 54 | )
 55 | ```
 56 | 
 57 | In that case, the parsing will try to fetch by the first query and returns if finds a match, else it will try the subsequent queries until it finds something, or it will return an empty selector.
 58 | 
 59 | #### Finding the best match by a query validator
 60 | 
 61 | If you want to run multiple queries and also validates the best match you can pass a validator function which will take the scrapy selector an should return a boolean.
 62 | 
 63 | Example, imagine you get the "name" field defined above and you want to validates each query to ensure it has a 'li' with a text "Schblaums" in there.
 64 | 
 65 | ```python
 66 | 
 67 | def has_schblaums(selector):
 68 |     for li in selector.css('li'): # takes each <li> inside the ul selector
 69 |         li_text = li.css('::text').extract() # Extract only the text
 70 |         if "Schblaums" in li_text:  # check if "Schblaums" is there
 71 |             return True  # this selector is valid!
 72 |     return False  # invalid query, take the next or default value
 73 | 
 74 | class Fetcher(....):
 75 |     name = XPathField(
 76 |         ['//*[@id="8"]/div[2]/div/div[2]/div[2]/ul',
 77 |          '//*[@id="8"]/div[2]/div/div[3]/div[2]/ul'],
 78 |         query_validator=has_schblaums,
 79 |         default="undefined_name"  # optional
 80 |     )
 81 | ```
 82 | 
 83 | In the above example if both queries are invalid, the "name" field will be filled with an empty_selector, or the value defined in "default" parameter.
 84 | 
 85 | > **NOTE:** if the field has a "default" and fails in all the matcher, the default value will be passed to "processor" and also to "parse_" methods.
 86 | 
 87 | Every method named ``parse_<field>`` will run after all the fields are fetched for each field.
 88 | 
 89 | ```python
 90 |     def parse_name(self, selector):
 91 |         # here selector is the scrapy selector for 'span#person'
 92 |         name = selector.css('::text').extract()
 93 |         return name
 94 | 
 95 |     def parse_website(self, selector):
 96 |         # here selector is the scrapy selector for 'span#person a'
 97 |         website_url = selector.css('::attr(href)').extract()
 98 |         return website_url
 99 | 
100 | ```
101 | 
102 | 
103 | after defined need to run the scraper
104 | 
105 | 
106 | ```python
107 | 
108 | fetcher = Myfetcher(url='http://.....')  # optionally you can use cached_fetch=True to cache requests on redis
109 | fetcher.parse()
110 | ```
111 | 
112 | Now you can iterate ``_data``, ``_raw_data`` and atributes in fetcher
113 | 
114 | ```python
115 | >>> fetcher.name
116 | <CSSField - name - Bruno Rocha>
117 | >>> fetcher.name.value
118 | Bruno Rocha
119 | >>> fetcher._data
120 | {"name": "Bruno Rocha", "website": "http://brunorocha.org"}
121 | ```
122 | 
123 | You can populate some object
124 | 
125 | ```python
126 | >>> obj = MyObject()
127 | >>> fetcher.populate(obj)  # fields optional
128 | 
129 | >>> obj.name
130 | Bruno Rocha
131 | ```
132 | 
133 | If you do not want to define each field explicitly in the class, you can use a json file to automate the process
134 | 
135 | ```python
136 | class MyFetcher(BaseFetcherModel):
137 |    """ will load from json """
138 | 
139 | fetcher = MyFetcher(url='http://.....')
140 | fetcher.load_mappings_from_file('path/to/file.json')
141 | fetcher.parse()
142 | ```
143 | 
144 | In that case file.json should be
145 | 
146 | ```json
147 | {
148 |    "name": {"css", "span#person"},
149 |    "website": {"css": "span#person a"}
150 | }
151 | ```
152 | 
153 | You can use ``{"xpath": "..."}`` in case you prefer select by xpath
154 | 
155 | 
156 | ### parse and processor
157 | 
158 | There are 2 ways of transforming or normalizing the data for each field
159 | 
160 | #### Processors
161 | 
162 | A processor is a function, or a list of functions which will be called in the given sequence against the field value, it receives the raw_selector or the value depending on auto_extract and takes_first arguments.
163 | 
164 | It can be used for Normalization, Clean, Transformation etc..
165 | 
166 | Example:
167 | 
168 | ```python
169 | 
170 | def normalize_state(state_name):
171 |     # query my database and return the first instance of state object
172 |     return MyDatabase.State.Search(name=state_name).first()
173 | 
174 | def text_cleanup(state_name):
175 |     return state_name.strip().replace('-', '').lower()
176 | 
177 | class MyFetcher(BaseFetcherModel):
178 |     state = CSSField(
179 |         "#state::text",
180 |         takes_first=True,
181 |         processor=[text_cleanup, normalize_state]
182 |     )
183 | 
184 | fetcher = MyFetcher(url="http://....")
185 | fetcher.parse()
186 | 
187 | fetcher._raw_data.state
188 | 'Sao-Paulo'
189 | fetcher._data.state
190 | <ORM Instance - State - São Paulo>
191 | ```
192 | 
193 | #### Parse methods
194 | 
195 | any method called ``parse_<field_name>`` will run after all the process of selecting and parsing, it receives the selector or the value depending on auto_extract and takes_first argument in that field.
196 | 
197 | example:
198 | 
199 | ```python
200 | def parse_name(self, selector):
201 |    return selector.css('::text').extract()[0].upper()
202 | ```
203 | 
204 | In the above case, the name field returns the raw_selector and in the parse method we can build extra queries using ``css`` or ``xpath`` and also we need to extract() the values from the selector and optionally select the first element and apply any transformation we need.
205 | 
206 | ### Caching the html fetch
207 | 
208 | In order to cache the html returned by the url fetching for future parsing and tests you specify a cache model, by default there is no cache but you can use the built in RedisCache passing
209 | 
210 | ```python
211 |     from scrapy_model import RedisCache
212 |     fetcher = TestFetcher(cache_fetch=True,
213 |                           cache=RedisCache,
214 |                           cache_expire=1800)
215 | ```
216 | 
217 | or specifying arguments to the Redis client.
218 | 
219 | > it is a general Redis connection from python ``redis`` module
220 | 
221 | ```python
222 |     fetcher = TestFetcher(cache_fetch=True,
223 |                           cache=RedisCache("192.168.0.12:9200"),
224 |                           cache_expire=1800)
225 | ```
226 | 
227 | You can create your own caching structure, e.g: to cache htmls in memcached or s3
228 | 
229 | the cache class just need to implement ``get`` and ``set`` methods.
230 | 
231 | ```python
232 | from boto import connect_s3
233 | 
234 | class S3Cache(object):
235 |     def __init__(self, *args, **kwargs):
236 |         connection = connect_s3(ACCESS_KEY, SECRET_KEY)
237 |         self.bucket = connection.get_bucket(BUCKET_ID)
238 | 
239 |     def get(self, key):
240 |         value = self.bucket.get_key(key)
241 |         return value.get_contents_as_string() if key else None
242 | 
243 |     def set(self, key, value, expire=None):
244 |         self.bucket.set_contents(key, value, expire=expire)
245 | 
246 | 
247 | fetcher = MyFetcher(url="http://...",
248 |                     cache_fetch=True,
249 |                     cache=S3cache,
250 |                     cache_expire=1800)
251 | 
252 | ```
253 | 
254 | ### Instalation
255 | 
256 | easy to install
257 | 
258 | If running ubuntu maybe you need to run:
259 | 
260 | ```bash
261 | sudo apt-get install python-scrapy
262 | sudo apt-get install libffi-dev
263 | sudo apt-get install python-dev
264 | ```
265 | 
266 | then
267 | 
268 | ```bash
269 | pip install scrapy_model
270 | ```
271 | 
272 | or
273 | 
274 | 
275 | ```bash
276 | git clone https://github.com/rochacbruno/scrapy_model
277 | cd scrapy_model
278 | pip install -r requirements.txt
279 | python setup.py install
280 | python example.py
281 | ```
282 | 
283 | Example code to fetch the url http://en.m.wikipedia.org/wiki/Guido_van_Rossum
284 | 
285 | ```python
286 | #coding: utf-8
287 | 
288 | from scrapy_model import BaseFetcherModel, CSSField, XPathField
289 | 
290 | 
291 | class TestFetcher(BaseFetcherModel):
292 |     photo_url = XPathField('//*[@id="content"]/div[1]/table/tr[2]/td/a')
293 | 
294 |     nationality = CSSField(
295 |         '#content > div:nth-child(1) > table > tr:nth-child(4) > td > a',
296 |     )
297 | 
298 |     links = CSSField(
299 |         '#content > div:nth-child(11) > ul > li > a.external::attr(href)',
300 |         auto_extract=True
301 |     )
302 | 
303 |     def parse_photo_url(self, selector):
304 |         return "http://en.m.wikipedia.org/{}".format(
305 |             selector.xpath("@href").extract()[0]
306 |         )
307 | 
308 |     def parse_nationality(self, selector):
309 |         return selector.css("::text").extract()[0]
310 | 
311 |     def parse_name(self, selector):
312 |         return selector.extract()[0]
313 | 
314 |     def pre_parse(self, selector=None):
315 |         # this method is executed before the parsing
316 |         # you can override it, take a look at the doc string
317 | 
318 |     def post_parse(self):
319 |         # executed after all parsers
320 |         # you can load any data on to self._data
321 |         # access self._data and self._fields for current data
322 |         # self.selector contains original page
323 |         # self.fetch() returns original html
324 |         self._data.url = self.url
325 | 
326 | 
327 | class DummyModel(object):
328 |     """
329 |     For tests only, it can be a model in your database ORM
330 |     """
331 | 
332 | 
333 | if __name__ == "__main__":
334 |     from pprint import pprint
335 | 
336 |     fetcher = TestFetcher(cache_fetch=True)
337 |     fetcher.url = "http://en.m.wikipedia.org/wiki/Guido_van_Rossum"
338 | 
339 |     # Mappings can be loaded from a json file
340 |     # fetcher.load_mappings_from_file('path/to/file')
341 |     fetcher.mappings['name'] = {
342 |         "css": ("#section_0::text")
343 |     }
344 | 
345 |     fetcher.parse()
346 | 
347 |     print "Fetcher holds the data"
348 |     print fetcher._data.name
349 |     print fetcher._data
350 | 
351 |     # How to populate an object
352 |     print "Populating an object"
353 |     dummy = DummyModel()
354 | 
355 |     fetcher.populate(dummy, fields=["name", "nationality"])
356 |     # fields attr is optional
357 |     print dummy.nationality
358 |     pprint(dummy.__dict__)
359 | 
360 | ```
361 | 
362 | # outputs
363 | 
364 | 
365 | ```
366 | Fetcher holds the data
367 | Guido van Rossum
368 | {'links': [u'http://www.python.org/~guido/',
369 |            u'http://neopythonic.blogspot.com/',
370 |            u'http://www.artima.com/weblogs/index.jsp?blogger=guido',
371 |            u'http://python-history.blogspot.com/',
372 |            u'http://www.python.org/doc/essays/cp4e.html',
373 |            u'http://www.twit.tv/floss11',
374 |            u'http://www.computerworld.com.au/index.php/id;66665771',
375 |            u'http://www.stanford.edu/class/ee380/Abstracts/081105.html',
376 |            u'http://stanford-online.stanford.edu/courses/ee380/081105-ee380-300.asx'],
377 |  'name': u'Guido van Rossum',
378 |  'nationality': u'Dutch',
379 |  'photo_url': 'http://en.m.wikipedia.org//wiki/File:Guido_van_Rossum_OSCON_2006.jpg',
380 |  'url': 'http://en.m.wikipedia.org/wiki/Guido_van_Rossum'}
381 | Populating an object
382 | Dutch
383 | {'name': u'Guido van Rossum', 'nationality': u'Dutch'}
384 | ```
385 | 


--------------------------------------------------------------------------------