├── .bumpversion.cfg
├── .coveragerc
├── .gitignore
├── .travis.yml
├── CHANGES.rst
├── LICENSE
├── README.rst
├── requirements.txt
├── scrapy_magicfields
│   ├── __init__.py
│   └── middleware.py
├── setup.cfg
├── setup.py
├── tests
│   ├── __init__.py
│   └── test_magicfields.py
└── tox.ini

/.bumpversion.cfg:
--------------------------------------------------------------------------------
1 | [bumpversion]
2 | current_version = 1.1.0
3 | commit = True
4 | tag = True
5 | 
6 | [bumpversion:file:setup.py]
7 | 
8 | [bumpversion:file:scrapy_magicfields/__init__.py]
9 | 
10 | 
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = true
3 | source = scrapy_magicfields
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python: 3.5
3 | 
4 | sudo: false
5 | 
6 | env:
7 |   matrix:
8 |     - TOXENV=py27
9 |     - TOXENV=py35
10 | 
11 | install: pip install -U tox codecov
12 | 
13 | script: tox
14 | 
15 | after_success:
16 |   - codecov
17 | 
18 | deploy:
19 |   provider: pypi
20 |   distributions: "sdist bdist_wheel"
21 |   user: redapple
22 |   password:
23 |     secure: Nmm5WFz/InbM5jjRzV9QQeuDJi5ezrePNBwu1YaEg6jVzCk2eZHLTatDFoLzSzfxMrRMGfRDZ7rtrLQl8wcHQUCqc5+p46X7tm2T3tG3YDDrQRLlwBVS7PFino2dqkcgBnG0HJU7sZYF4RVrBvIa/F51/1UxSRfNT8tEYKO3zhFXAH6TN9CxW7u7+2+Qgfr2Q8T40P1Rqu1xMy+AkLdaPTl1OpG5VwAASGiLlW10eJ+6WMhMqhnGuvrNADeqgit0kQ4/euNHBqP8/LNW9EKPexurcW7uRtxLmesHdo5jO5h0Tix5ULhnarq7np6Tp46du9HOediDqi2Un+99wJzfE685yTnc/gCKFHa+km72ZKFtusaUdEu6qpJwgLCMpKBwL5UKI7DQNyysbHQ6WKMGCUbt5OI8zvW3posoStyA5IjD1sd0kAIEqhHjiszPCpolp7izy3DLRaF9TGRZhRhY6qOQ8tM2J/v+0JYPuBm74qrMQ6cAftqMpiP/ruGqLI7ho33yw2lcma2pGCJiPC5Y6IvTA1tGGUe18rbJYddPCtMw/hj4BHY57Gbf3bBskkLwQ9TNprAeEZVNG2eTbBg64pEQCaBEVrVtrHvhExHnWB+d8BGN3IHspyfhh7t9ZqzQGZPGLOv42M2PqDvKAoV505wdnSOk8WEv15Q+Y0AW2tw=
24 |   on:
25 |     tags: true
26 |     repo: scrapy-plugins/scrapy-magicfields
27 |     condition: $TOXENV = py35
--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
1 | Changes
2 | =======
3 | 
4 | 
5 | 1.1.0 (2016-06-30)
6 | ------------------
7 | 
8 | Add support for spiders returning dict items.
9 | 
10 | 
11 | 1.0.0 (2016-06-29)
12 | ------------------
13 | 
14 | Initial release.
15 | 
16 | This version is functionally equivalent to scrapylib's v1.7.0
17 | ``scrapylib.magicfields.MagicFieldsMiddleware``.
18 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2016, Scrapy Plugins
2 | All rights reserved.
3 | 
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 | 
7 | * Redistributions of source code must retain the above copyright notice, this
8 |   list of conditions and the following disclaimer.
9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | * Neither the name of scrapy-magicfields nor the names of its
15 |   contributors may be used to endorse or promote products derived from
16 |   this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | ==================
2 | scrapy-magicfields
3 | ==================
4 | 
5 | .. image:: https://travis-ci.org/scrapy-plugins/scrapy-magicfields.svg?branch=master
6 |     :target: https://travis-ci.org/scrapy-plugins/scrapy-magicfields
7 | 
8 | .. image:: https://codecov.io/gh/scrapy-plugins/scrapy-magicfields/branch/master/graph/badge.svg
9 |     :target: https://codecov.io/gh/scrapy-plugins/scrapy-magicfields
10 | 
11 | This is a Scrapy spider middleware to add extra fields to items,
12 | based on the configuration settings ``MAGIC_FIELDS`` and ``MAGIC_FIELDS_OVERRIDE``.
13 | 
14 | 
15 | Installation
16 | ============
17 | 
18 | Install scrapy-magicfields using ``pip``::
19 | 
20 |     $ pip install scrapy-magicfields
21 | 
22 | 
23 | Configuration
24 | =============
25 | 
26 | 1. Add ``MagicFieldsMiddleware`` by including it in ``SPIDER_MIDDLEWARES``
27 |    in your ``settings.py`` file::
28 | 
29 |       SPIDER_MIDDLEWARES = {
30 |           'scrapy_magicfields.MagicFieldsMiddleware': 100,
31 |       }
32 | 
33 |    Here, priority ``100`` is just an example.
34 |    Set its value depending on other middlewares you may have enabled already.
35 | 
36 | 2. Enable the middleware by setting ``MAGIC_FIELDS`` (and optionally ``MAGIC_FIELDS_OVERRIDE``)
37 |    in your ``settings.py``.
38 | 
39 | 
40 | Usage
41 | =====
42 | 
43 | Both settings, ``MAGIC_FIELDS`` and ``MAGIC_FIELDS_OVERRIDE``, are dicts:
44 | 
45 | * the keys are the destination field names,
46 | * their value is a string which accepts **magic variables**,
47 |   identified by a leading ``$`` (dollar sign),
48 |   which will be substituted with the corresponding value at runtime.
49 | 
50 | Some magic variables also accept arguments, which are specified after the magic name,
51 | using a ``:`` (colon) as separator.
52 | 
53 | 
54 | You can set project-global magics with ``MAGIC_FIELDS``,
55 | and tune them for a specific spider using ``MAGIC_FIELDS_OVERRIDE``.
56 | 
57 | When there is more than one argument, they must be separated by a ``,`` (comma).
58 | So the generic magic format is::
59 | 
60 |     $<magic name>[:arg1,arg2,...]
61 | 
62 | 
63 | Supported magic variables
64 | -------------------------
65 | 
66 | ``$time``
67 |     the UTC timestamp at which the item was scraped, in format ``'%Y-%m-%d %H:%M:%S'``.
68 | 
69 | ``$unixtime``
70 |     the unixtime (number of seconds since the Epoch, i.e. ``time.time()``)
71 |     at which the item was scraped.
72 | 
73 | ``$isotime``
74 |     the UTC timestamp at which the item was scraped, in ISO 8601 format (``datetime.isoformat()``).
75 | 
76 | ``$spider``
77 |     must be followed by an argument,
78 |     which is the name of an attribute of the spider (such as an argument passed to it).
79 | 
80 | ``$env``
81 |     the value of an environment variable.
82 |     It accepts as argument the name of the variable.
83 | 
84 | ``$jobid``
85 |     the job id (a shortcut for ``$env:SCRAPY_JOB``).
86 | 
87 | ``$jobtime``
88 |     the UTC timestamp at which the job started, in format ``'%Y-%m-%d %H:%M:%S'``.
89 | 
90 | ``$response``
91 |     Access to some response properties:
92 | 
93 |     ``$response:url``
94 |         The URL the item was extracted from.
95 | 
96 |     ``$response:status``
97 |         The HTTP status of the response.
98 | 
99 |     ``$response:headers``
100 |         The HTTP headers of the response.
101 | 
102 | ``$setting``
103 |     Access the given Scrapy setting. It accepts one argument: the name of the setting.
104 | 
105 | ``$field``
106 |     Copies the value of one field to another.
107 |     Its argument is the source field.
108 |     Effects are unpredictable if the source field is itself filled
109 |     using magic fields.
110 | 
111 | 
112 | Examples
113 | --------
114 | 
115 | The following configuration will add two fields to each scraped item:
116 | 
117 | - ``'timestamp'``, which will be filled with the string ``'item scraped at '`` followed by the scrape timestamp,
118 | - and ``'spider'``, which will contain the spider name.
119 | 
120 | ::
121 | 
122 |     MAGIC_FIELDS = {
123 |         "timestamp": "item scraped at $time",
124 |         "spider": "$spider:name"
125 |     }
126 | 
127 | The following configuration will copy the URL to the field ``sku``::
128 | 
129 |     MAGIC_FIELDS = {
130 |         "sku": "$field:url"
131 |     }
132 | 
133 | Magics also accept a regular expression argument, which lets you extract
134 | and assign only part of the value generated by the magic.
135 | You have to specify it using the ``r''`` notation.
136 | 
137 | Suppose the URLs of your items look like ``'http://www.example.com/product.html?item_no=345'``
138 | and you want to assign only the item number to the ``sku`` field.
139 | 
140 | The following example, similar to the previous one but with a second regular expression argument,
141 | will do the job::
142 | 
143 |     MAGIC_FIELDS = {
144 |         "sku": "$field:url,r'item_no=(\d+)'"
145 |     }
146 | 
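147 | ``MAGIC_FIELDS_OVERRIDE`` entries are merged over the project-wide ``MAGIC_FIELDS``,
148 | with the override winning on conflicts (this is the behavior exercised by the
149 | middleware tests). A sketch, with illustrative field names::
150 | 
151 |     MAGIC_FIELDS = {
152 |         "spider": "$spider:name"
153 |     }
154 |     MAGIC_FIELDS_OVERRIDE = {
155 |         "sku": "$field:nom"
156 |     }
157 | 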
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scrapy
--------------------------------------------------------------------------------
/scrapy_magicfields/__init__.py:
--------------------------------------------------------------------------------
1 | from .middleware import MagicFieldsMiddleware
2 | 
3 | 
4 | __version__ = "1.1.0"
5 | 
--------------------------------------------------------------------------------
/scrapy_magicfields/middleware.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | import os
4 | import re
5 | import time
6 | 
7 | from scrapy.exceptions import NotConfigured
8 | from scrapy.item import BaseItem
9 | 
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
14 | def _time():
15 |     return datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')
16 | 
17 | def _isotime():
18 |     return datetime.datetime.utcnow().isoformat()
19 | 
20 | _REGEXES = {}  # cache of compiled patterns, keyed by pattern string
21 | _REGEX_ERRORS = {}  # cache of compile error messages for bad patterns
22 | def _extract_regex_group(regex, txt):
23 |     compiled = _REGEXES.get(regex)
24 |     errmessage = _REGEX_ERRORS.get(regex)
25 |     if compiled is None and errmessage is None:
26 |         try:
27 |             compiled = re.compile(regex)
28 |             _REGEXES[regex] = compiled
29 |         except Exception as e:
30 |             errmessage = str(e)
31 |             _REGEX_ERRORS[regex] = errmessage
32 |     if errmessage:
33 |         raise ValueError(errmessage)
34 |     m = compiled.search(txt)
35 |     if m:
36 |         return "".join(m.groups()) or None
37 | 
38 | _ENTITY_FUNCTION_MAP = {
39 |     '$time': _time,
40 |     '$unixtime': time.time,
41 |     '$isotime': _isotime,
42 | }
43 | 
44 | _ENTITIES_RE = re.compile(r"(\$[a-z]+)(:\w+)?(?:,r'(.+)')?")  # $name, optional :arg, optional trailing r'regex'
45 | def _first_arg(args):
46 |     if args:
47 |         return args.pop(0)
48 | 
49 | def _format(fmt, spider, response, item, fixed_values):
50 |     out = fmt
51 |     for m in _ENTITIES_RE.finditer(fmt):
52 |         val = None
53 |         entity, args, regex = m.groups()
54 |         args = list(filter(None, (args or ':')[1:].split(',')))
55 |         if entity == "$jobid":
56 |             val = os.environ.get('SCRAPY_JOB', '')
57 |         elif entity == "$spider":
58 |             attr = _first_arg(args)
59 |             if not attr or not hasattr(spider, attr):
60 |                 logger.warning("Error at '%s': spider does not have attribute" % m.group())
61 |             else:
62 |                 val = str(getattr(spider, attr))
63 |         elif entity == "$response":
64 |             attr = _first_arg(args)
65 |             if not attr or not hasattr(response, attr):
66 |                 logger.warning("Error at '%s': response does not have attribute" % m.group())
67 |             else:
68 |                 val = str(getattr(response, attr))
69 |         elif entity == "$field":
70 |             attr = _first_arg(args)
71 |             if attr in item:
72 |                 val = str(item[attr])
73 |         elif entity in fixed_values:
74 |             attr = _first_arg(args)
75 |             val = fixed_values[entity]
76 |             if entity == "$setting" and attr:
77 |                 val = str(val[attr])
78 |         elif entity == "$env" and args:
79 |             attr = _first_arg(args)
80 |             if attr:
81 |                 val = os.environ.get(attr, '')
82 |         else:
83 |             function = _ENTITY_FUNCTION_MAP.get(entity)
84 |             if function is not None:
85 |                 try:
86 |                     val = str(function(*args))
87 |                 except Exception:
88 |                     logger.warning("Error at '%s': invalid argument for function" % m.group())
89 |         if val is not None:
90 |             out = out.replace(m.group(), val, 1)
91 |             if regex:
92 |                 try:
93 |                     out = _extract_regex_group(regex, out)
94 |                 except ValueError as e:
95 |                     logger.warning("Error at '%s': %s" % (m.group(), e))
96 | 
97 |     return out
98 | 
99 | class MagicFieldsMiddleware(object):
100 | 
101 |     @classmethod
102 |     def from_crawler(cls, crawler):
103 |         mfields = crawler.settings.getdict("MAGIC_FIELDS").copy()
104 |         mfields.update(crawler.settings.getdict("MAGIC_FIELDS_OVERRIDE"))  # overrides win on conflicts
105 |         if not mfields:
106 |             raise NotConfigured
107 |         return cls(mfields, crawler.settings)
108 | 
109 |     def __init__(self, mfields, settings):
110 |         self.mfields = mfields
111 |         self.fixed_values = {
112 |             "$jobtime": _time(),  # job start time, captured once at middleware creation
113 |             "$setting": settings,
114 |         }
115 | 
116 |     def process_spider_output(self, response, result, spider):
117 |         for _res in result:
118 |             if isinstance(_res, (BaseItem, dict)):
119 |                 for field, fmt in self.mfields.items():
120 |                     _res.setdefault(field, _format(fmt, spider, response, _res, self.fixed_values))  # never overwrites an existing field
121 |             yield _res
122 | 
--------------------------------------------------------------------------------
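
A quick sketch of how the ``_format`` helper above resolves magic variables,
mirroring assertions from tests/test_magicfields.py (the spider name, URL and
item contents are illustrative):

    from scrapy.spiders import Spider
    from scrapy.http import HtmlResponse
    from scrapy_magicfields.middleware import _format

    spider = Spider('myspider')
    response = HtmlResponse(url="http://www.example.com/product/8798732", body=b"")
    item = {"url": "http://www.example.com/product.html?item_no=345"}

    # "$spider:name" resolves to the spider's "name" attribute:
    _format("Spider: $spider:name", spider, response, item, {})
    # -> 'Spider: myspider'

    # A trailing r'...' argument keeps only the regex group matched
    # against the substituted value:
    _format(r"$field:url,r'item_no=(\d+)'", spider, response, item, {})
    # -> '345'
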
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup(
4 |     name='scrapy-magicfields',
5 |     version='1.1.0',
6 |     license='BSD',
7 |     description='Scrapy middleware to add extra "magic" fields to items',
8 |     author='Scrapinghub',
9 |     author_email='info@scrapinghub.com',
10 |     url='http://github.com/scrapy-plugins/scrapy-magicfields',
11 |     packages=['scrapy_magicfields'],
12 |     platforms=['Any'],
13 |     classifiers=[
14 |         'Development Status :: 4 - Beta',
15 |         'License :: OSI Approved :: BSD License',
16 |         'Operating System :: OS Independent',
17 |         'Programming Language :: Python',
18 |         'Programming Language :: Python :: 2',
19 |         'Programming Language :: Python :: 2.7',
20 |         'Programming Language :: Python :: 3',
21 |         'Programming Language :: Python :: 3.5',
22 |     ],
23 |     install_requires=['scrapy']
24 | )
25 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/tests/test_magicfields.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import re, os
3 | from unittest import TestCase
4 | 
5 | from scrapy.spiders import Spider
6 | from scrapy.utils.test import get_crawler
7 | from scrapy.item import DictItem, Field
8 | from scrapy.http import HtmlResponse
9 | 
10 | from scrapy_magicfields import MagicFieldsMiddleware
11 | from scrapy_magicfields.middleware import _format
12 | 
13 | 
14 | class TestItem(DictItem):
15 |     fields = {
16 |         'url': Field(),
17 |         'nom': Field(),
18 |         'prix': Field(),
19 |         'spider': Field(),
20 |         'sku': Field(),
21 |     }
22 | 
23 | 
24 | class MagicFieldsTest(TestCase):
25 | 
26 |     def setUp(self):
27 |         self.environ = os.environ.copy()
28 |         self.spider = Spider('myspider', arg1='val1', start_urls=["http://example.com"])
29 | 
30 |         def _log(x):
31 |             print(x)
32 | 
33 |         self.spider.log = _log
34 |         self.response = HtmlResponse(body=b"", url="http://www.example.com/product/8798732")
35 |         self.item = TestItem({'nom': 'myitem', 'prix': "56.70 euros", "url": "http://www.example.com/product.html?item_no=345"})
36 | 
37 |     def tearDown(self):
38 |         os.environ = self.environ
39 | 
40 |     def assertRegexpMatches(self, text, regexp):
41 |         """assertRegexpMatches is not present in Python below 2.7"""
42 |         return self.assertNotEqual(re.match(regexp, text), None)
43 | 
44 |     def test_hello(self):
45 |         self.assertEqual(_format("hello world!", self.spider, self.response, self.item, {}), 'hello world!')
46 | 
47 |     def test_spidername_time(self):
48 |         formatted = _format("Spider: $spider:name. Item scraped at $time", self.spider, self.response, self.item, {})
49 |         self.assertRegexpMatches(formatted, r'Spider: myspider. Item scraped at \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$')
50 | 
51 |     def test_unixtime(self):
52 |         formatted = _format("Item scraped at $unixtime", self.spider, self.response, self.item, {})
53 |         self.assertRegexpMatches(formatted, r'Item scraped at \d+\.\d+$')
54 | 
55 |     def test_isotime(self):
56 |         formatted = _format("$isotime", self.spider, self.response, self.item, {})
57 |         self.assertRegexpMatches(formatted, r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}$')
58 | 
59 |     def test_jobid(self):
60 |         os.environ["SCRAPY_JOB"] = 'aa788'
61 |         formatted = _format("job id '$jobid' for spider $spider:name", self.spider, self.response, self.item, {})
62 |         self.assertEqual(formatted, "job id 'aa788' for spider myspider")
63 | 
64 |     def test_spiderarg(self):
65 |         formatted = _format("Argument arg1: $spider:arg1", self.spider, self.response, self.item, {})
66 |         self.assertEqual(formatted, 'Argument arg1: val1')
67 | 
68 |     def test_spiderattr(self):
69 |         formatted = _format("$spider:start_urls", self.spider, self.response, self.item, {})
70 |         self.assertEqual(formatted, "['http://example.com']")
71 | 
72 |     def test_settings(self):
73 |         formatted = _format("$setting:MY_SETTING", self.spider, self.response, self.item, {"$setting": {"MY_SETTING": True}})
74 |         self.assertEqual(formatted, 'True')
75 | 
76 |     def test_notexisting(self):
77 |         """Non-existing entities are not substituted"""
78 |         formatted = _format("Item scraped at $myentity", self.spider, self.response, self.item, {})
79 |         self.assertEqual(formatted, 'Item scraped at $myentity')
80 | 
81 |     def test_noargs(self):
82 |         """If entity does not accept arguments, don't substitute"""
83 |         formatted = _format("Scraped on day $unixtime:arg", self.spider, self.response, self.item, {})
84 |         self.assertEqual(formatted, "Scraped on day $unixtime:arg")
85 | 
86 |     def test_noargs2(self):
87 |         """If entity does not have enough arguments, don't substitute"""
88 |         formatted = _format("$spider", self.spider, self.response, self.item, {})
89 |         self.assertEqual(formatted, "$spider")
90 | 
91 |     def test_invalidattr(self):
92 |         formatted = _format("Argument arg2: $spider:arg2", self.spider, self.response, self.item, {})
93 |         self.assertEqual(formatted, "Argument arg2: $spider:arg2")
94 | 
95 |     def test_environment(self):
96 |         os.environ["TEST_ENV"] = "testval"
97 |         formatted = _format("$env:TEST_ENV", self.spider, self.response, self.item, {})
98 |         self.assertEqual(formatted, "testval")
99 | 
100 |     def test_response(self):
101 |         formatted = _format("$response:url", self.spider, self.response, self.item, {})
102 |         self.assertEqual(formatted, self.response.url)
103 | 
104 |     def test_fields_copy(self):
105 |         formatted = _format("$field:nom", self.spider, self.response, self.item, {})
106 |         self.assertEqual(formatted, 'myitem')
107 | 
108 |     def test_regex(self):
109 |         formatted = _format(r"$field:url,r'item_no=(\d+)'", self.spider, self.response, self.item, {})
110 |         self.assertEqual(formatted, '345')
111 | 
112 |     def test_mware(self):
113 |         settings = {"MAGIC_FIELDS": {"spider": "$spider:name"}}
114 |         crawler = get_crawler(settings_dict=settings)
115 |         mware = MagicFieldsMiddleware.from_crawler(crawler)
116 |         result = list(mware.process_spider_output(self.response, [self.item], self.spider))[0]
117 |         expected = {
118 |             'nom': 'myitem',
119 |             'prix': '56.70 euros',
120 |             'spider': 'myspider',
121 |             'url': 'http://www.example.com/product.html?item_no=345'
122 |         }
123 |         self.assertEqual(result, expected)
124 | 
125 |     def test_mware_override(self):
126 |         settings = {
127 |             "MAGIC_FIELDS": {"spider": "$spider:name"},
128 |             "MAGIC_FIELDS_OVERRIDE": {"sku": "$field:nom"}
129 |         }
130 |         crawler = get_crawler(settings_dict=settings)
131 |         mware = MagicFieldsMiddleware.from_crawler(crawler)
132 |         result = list(mware.process_spider_output(self.response, [self.item], self.spider))[0]
133 |         expected = {
134 |             'nom': 'myitem',
135 |             'prix': '56.70 euros',
136 |             'spider': 'myspider',
137 |             'url': 'http://www.example.com/product.html?item_no=345',
138 |             'sku': 'myitem',
139 |         }
140 |         self.assertEqual(result, expected)
141 | 
142 | 
143 | class MagicFieldsDictItemTest(MagicFieldsTest):
144 | 
145 |     def setUp(self):
146 |         super(MagicFieldsDictItemTest, self).setUp()
147 |         self.item = {'nom': 'myitem', 'prix': "56.70 euros", "url": "http://www.example.com/product.html?item_no=345"}
148 | 
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox (http://tox.testrun.org/) is a tool for running tests
2 | # in multiple virtualenvs. This configuration file will run the
3 | # test suite on all supported python versions. To use it, "pip install tox"
4 | # and then run "tox" from this directory.
5 | 
6 | [tox]
7 | envlist = py27, py35
8 | 
9 | [testenv]
10 | deps =
11 |     -rrequirements.txt
12 |     coverage
13 |     nose
14 | 
15 | commands =
16 |     nosetests \
17 |         --with-doctest \
18 |         --with-coverage --cover-package=scrapy_magicfields \
19 |         tests
--------------------------------------------------------------------------------