├── .bumpversion.cfg ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.rst ├── requirements-py3.txt ├── requirements.txt ├── scrapy_djangoitem └── __init__.py ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── models.py ├── settings.py └── test_djangoitem.py └── tox.ini /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 1.1.1 3 | commit = True 4 | tag = True 5 | tag_name = v{new_version} 6 | 7 | [bumpversion:file:setup.py] 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | matrix: 3 | include: 4 | - python: 2.7 5 | env: TOXENV=py27 6 | - python: 3.4 7 | env: TOXENV=py34 8 | - python: 3.5 9 | env: TOXENV=py35 10 | - python: 3.6 11 | env: TOXENV=py36 12 | install: 13 | - pip install -U tox 14 | script: tox 15 | notifications: 16 | irc: 17 | use_notice: true 18 | skip_join: true 19 | channels: 20 | - irc.freenode.org#scrapy 21 | deploy: 22 | provider: pypi 23 | distributions: sdist bdist_wheel 24 | user: scrapy 25 | password: 26 | secure: bUpnSgikr11B4ddmDUlAEg6ujKVM1Lwd7M7mecdXyMDVDobIOIpKA9GfgajfM9Uh9NDGYERvkIzXAikM4uY3Ltz+QtL4qJ14y7hp0Uw2IfoLcRiea315ieNdEQL2cF6EC6GEo49/Ht9iLLZsrlSZdOnYn+HjopYe58cYYuAHyp8= 27 | on: 28 | tags: true 29 | all_branches: true 30 | repo: scrapy-plugins/scrapy-djangoitem 31 | condition: $TOXENV == py27 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Scrapy project 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of scrapy-djangoitem nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | scrapy-djangoitem 3 | ================= 4 | 5 | .. image:: https://img.shields.io/pypi/v/scrapy-djangoitem.svg 6 | :target: https://pypi.python.org/pypi/scrapy-djangoitem 7 | :alt: PyPI Version 8 | 9 | .. image:: https://img.shields.io/travis/scrapy-plugins/scrapy-djangoitem/master.svg 10 | :target: http://travis-ci.org/scrapy-plugins/scrapy-djangoitem 11 | :alt: Build Status 12 | 13 | .. image:: https://img.shields.io/github/license/scrapy-plugins/scrapy-djangoitem.svg 14 | :target: https://github.com/scrapy-plugins/scrapy-djangoitem/blob/master/LICENSE 15 | :alt: License 16 | 17 | 18 | ``scrapy-djangoitem`` is an extension that allows you to define `Scrapy items 19 | `_ using existing `Django 20 | models `_. 21 | 22 | This utility provides a new class, named ``DjangoItem``, that you can use as a 23 | regular Scrapy item and link it to a Django model with its ``django_model`` 24 | attribute. Start using it right away by importing it from this package:: 25 | 26 | from scrapy_djangoitem import DjangoItem 27 | 28 | Installation 29 | ============ 30 | 31 | Starting with ``v1.1`` both ``Python 2.7`` and ``Python 3.4/3.5`` are 32 | supported. For ``Python 3`` you need ``Scrapy v1.1`` or above. 33 | 34 | Latest tested Django version is ``Django 1.9``. 35 | 36 | Install from ``PyPI`` using:: 37 | 38 | pip install scrapy-djangoitem 39 | 40 | 41 | Introduction 42 | ============ 43 | 44 | ``DjangoItem`` is a class of item that gets its fields definition from a 45 | Django model, you simply create a ``DjangoItem`` and specify what Django 46 | model it relates to. 47 | 48 | Besides of getting the model fields defined on your item, ``DjangoItem`` 49 | provides a method to create and populate a Django model instance with the item 50 | data. 51 | 52 | Usage 53 | ===== 54 | 55 | ``DjangoItem`` works much like ModelForms in Django, you create a subclass 56 | and define its ``django_model`` attribute to be a valid Django model. With this 57 | you will get an item with a field for each Django model field. 58 | 59 | In addition, you can define fields that aren't present in the model and even 60 | override fields that are present in the model defining them in the item. 61 | 62 | Let's see some examples: 63 | 64 | Creating a Django model for the examples:: 65 | 66 | from django.db import models 67 | 68 | class Person(models.Model): 69 | name = models.CharField(max_length=255) 70 | age = models.IntegerField() 71 | 72 | Defining a basic ``DjangoItem``:: 73 | 74 | from scrapy_djangoitem import DjangoItem 75 | 76 | class PersonItem(DjangoItem): 77 | django_model = Person 78 | 79 | ``DjangoItem`` works just like Scrapy items:: 80 | 81 | >>> p = PersonItem() 82 | >>> p['name'] = 'John' 83 | >>> p['age'] = '22' 84 | 85 | To obtain the Django model from the item, we call the extra method 86 | ``DjangoItem.save()`` of the ``DjangoItem``:: 87 | 88 | >>> person = p.save() 89 | >>> person.name 90 | 'John' 91 | >>> person.age 92 | '22' 93 | >>> person.id 94 | 1 95 | 96 | The model is already saved when we call ``DjangoItem.save()``, we 97 | can prevent this by calling it with ``commit=False``. We can use 98 | ``commit=False`` in ``DjangoItem.save()`` method to obtain an unsaved model:: 99 | 100 | >>> person = p.save(commit=False) 101 | >>> person.name 102 | 'John' 103 | >>> person.age 104 | '22' 105 | >>> person.id 106 | None 107 | 108 | As said before, we can add other fields to the item:: 109 | 110 | import scrapy 111 | from scrapy_djangoitem import DjangoItem 112 | 113 | class PersonItem(DjangoItem): 114 | django_model = Person 115 | sex = scrapy.Field() 116 | 117 | :: 118 | 119 | >>> p = PersonItem() 120 | >>> p['name'] = 'John' 121 | >>> p['age'] = '22' 122 | >>> p['sex'] = 'M' 123 | 124 | And we can override the fields of the model with your own:: 125 | 126 | class PersonItem(DjangoItem): 127 | django_model = Person 128 | name = scrapy.Field(default='No Name') 129 | 130 | This is useful to provide properties to the field, like a default or any other 131 | property that your project uses. Those additional fields won't be taken into 132 | account when doing a ``DjangoItem.save()``. 133 | 134 | Caveats 135 | ======= 136 | 137 | ``DjangoItem`` is a rather convenient way to integrate Scrapy projects with Django 138 | models, but bear in mind that Django ORM **may not scale well** if you scrape a lot 139 | of items (ie. millions) with Scrapy. This is because a relational backend is 140 | **often not a good choice for a write intensive applications** (such as a web 141 | crawler), specially if the database is highly normalized and with many indices. 142 | 143 | Setup 144 | ===== 145 | 146 | To use the Django models outside the Django application you need to set up the 147 | ``DJANGO_SETTINGS_MODULE`` environment variable and --in most cases-- modify 148 | the ``PYTHONPATH`` environment variable to be able to import the settings 149 | module. 150 | 151 | There are many ways to do this depending on your use case and preferences. 152 | Below is detailed one of the simplest ways to do it. 153 | 154 | Suppose your Django project is named ``mysite``, is located in the path 155 | ``/home/projects/mysite`` and you have created an app ``myapp`` with the model 156 | ``Person``. That means your directory structure is something like this:: 157 | 158 | /home/projects/mysite 159 | ├── manage.py 160 | ├── myapp 161 | │   ├── __init__.py 162 | │   ├── models.py 163 | │   ├── tests.py 164 | │   └── views.py 165 | └── mysite 166 | ├── __init__.py 167 | ├── settings.py 168 | ├── urls.py 169 | └── wsgi.py 170 | 171 | Then you need to add ``/home/projects/mysite`` to the ``PYTHONPATH`` 172 | environment variable and set up the environment variable 173 | ``DJANGO_SETTINGS_MODULE`` to ``mysite.settings``. That can be done in your 174 | Scrapy's settings file by adding the lines below:: 175 | 176 | import sys 177 | sys.path.append('/home/projects/mysite') 178 | 179 | import os 180 | os.environ['DJANGO_SETTINGS_MODULE'] = 'mysite.settings' 181 | 182 | Notice that we modify the ``sys.path`` variable instead the ``PYTHONPATH`` 183 | environment variable as we are already within the python runtime. If everything 184 | is right, you should be able to start the ``scrapy shell`` command and import 185 | the model ``Person`` (i.e. ``from myapp.models import Person``). 186 | 187 | Starting with ``Django 1.8`` you also have to explicitly set up ``Django`` if using 188 | it outside a ``manage.py`` context 189 | (see `Django Docs `_):: 190 | 191 | import django 192 | django.setup() 193 | 194 | 195 | Development 196 | =========== 197 | 198 | Test suite from the ``tests`` directory can be run using ``tox`` by running:: 199 | 200 | tox 201 | 202 | ...using the configuration in ``tox.ini``. The ``Python`` interpreters 203 | used have to be installed locally on the system. 204 | 205 | 206 | Changelog 207 | ========= 208 | 209 | v1.1.1 (2016-05-04) 210 | ------------------- 211 | 212 | * Distribute as universal wheel 213 | * Fix README's markup 214 | 215 | v1.1 (2016-05-04) 216 | ----------------- 217 | 218 | * ``Python 3.4/3.5`` support 219 | * Making tests work with ``Django 1.9`` again 220 | 221 | v1.0 (2015-04-29) 222 | ----------------- 223 | 224 | * Initial version 225 | -------------------------------------------------------------------------------- /requirements-py3.txt: -------------------------------------------------------------------------------- 1 | Scrapy>=1.1.0rc1 2 | Django 3 | six -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy>=0.24.5 2 | Django 3 | six 4 | -------------------------------------------------------------------------------- /scrapy_djangoitem/__init__.py: -------------------------------------------------------------------------------- 1 | from six import with_metaclass 2 | from django.core.exceptions import ValidationError 3 | from scrapy.item import Field, Item, ItemMeta 4 | 5 | 6 | 7 | class DjangoItemMeta(ItemMeta): 8 | 9 | def __new__(mcs, class_name, bases, attrs): 10 | cls = super(DjangoItemMeta, mcs).__new__(mcs, class_name, bases, attrs) 11 | cls.fields = cls.fields.copy() 12 | 13 | if cls.django_model: 14 | cls._model_fields = [] 15 | cls._model_meta = cls.django_model._meta 16 | for model_field in cls._model_meta.fields: 17 | if not model_field.auto_created: 18 | if model_field.name not in cls.fields: 19 | cls.fields[model_field.name] = Field() 20 | cls._model_fields.append(model_field.name) 21 | return cls 22 | 23 | 24 | class DjangoItem(with_metaclass(DjangoItemMeta, Item)): 25 | 26 | django_model = None 27 | 28 | def __init__(self, *args, **kwargs): 29 | super(DjangoItem, self).__init__(*args, **kwargs) 30 | self._instance = None 31 | self._errors = None 32 | 33 | def save(self, commit=True): 34 | if commit: 35 | self.instance.save() 36 | return self.instance 37 | 38 | def is_valid(self, exclude=None): 39 | self._get_errors(exclude) 40 | return not bool(self._errors) 41 | 42 | def _get_errors(self, exclude=None): 43 | if self._errors is not None: 44 | return self._errors 45 | 46 | self._errors = {} 47 | if exclude is None: 48 | exclude = [] 49 | 50 | try: 51 | self.instance.clean_fields(exclude=exclude) 52 | except ValidationError as e: 53 | self._errors = e.update_error_dict(self._errors) 54 | 55 | try: 56 | self.instance.clean() 57 | except ValidationError as e: 58 | self._errors = e.update_error_dict(self._errors) 59 | 60 | # uniqueness is not checked, because it is faster to check it when 61 | # saving object to database. Just beware, that failed save() 62 | # raises IntegrityError instead of ValidationError. 63 | 64 | return self._errors 65 | errors = property(_get_errors) 66 | 67 | @property 68 | def instance(self): 69 | if self._instance is None: 70 | modelargs = dict((k, self.get(k)) for k in self._values 71 | if k in self._model_fields) 72 | self._instance = self.django_model(**modelargs) 73 | return self._instance 74 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name='scrapy-djangoitem', 6 | version='1.1.1', 7 | url='https://github.com/scrapy-plugins/scrapy-djangoitem', 8 | description='Scrapy extension to write scraped items using Django models', 9 | long_description=open('README.rst').read(), 10 | author='Scrapy developers', 11 | license='BSD', 12 | packages=find_packages(exclude=('tests', 'tests.*')), 13 | include_package_data=True, 14 | zip_safe=False, 15 | classifiers=[ 16 | 'Framework :: Scrapy', 17 | 'Development Status :: 5 - Production/Stable', 18 | 'Environment :: Console', 19 | 'Intended Audience :: Developers', 20 | 'License :: OSI Approved :: BSD License', 21 | 'Operating System :: OS Independent', 22 | 'Programming Language :: Python', 23 | 'Programming Language :: Python :: 2', 24 | 'Programming Language :: Python :: 2.7', 25 | 'Programming Language :: Python :: 3', 26 | 'Programming Language :: Python :: 3.4', 27 | 'Programming Language :: Python :: 3.5', 28 | 'Programming Language :: Python :: 3.6', 29 | 'Topic :: Utilities', 30 | 'Framework :: Django', 31 | 'Framework :: Scrapy', 32 | ], 33 | install_requires=['six'], 34 | requires=['scrapy (>=0.24.5)', 'django'], 35 | ) 36 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapy-plugins/scrapy-djangoitem/b646edc534239426de1258820fbbb7faa5affcca/tests/__init__.py -------------------------------------------------------------------------------- /tests/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | 4 | class Person(models.Model): 5 | name = models.CharField(max_length=255, default='Robot') 6 | age = models.IntegerField() 7 | 8 | class Meta: 9 | app_label = 'test_djangoitem' 10 | 11 | 12 | class IdentifiedPerson(models.Model): 13 | identifier = models.PositiveIntegerField(primary_key=True) 14 | name = models.CharField(max_length=255) 15 | age = models.IntegerField() 16 | 17 | class Meta: 18 | app_label = 'test_djangoitem' 19 | -------------------------------------------------------------------------------- /tests/settings.py: -------------------------------------------------------------------------------- 1 | DATABASES = { 2 | 'default': { 3 | 'ENGINE': 'django.db.backends.sqlite3', 4 | 'NAME': ':memory:', 5 | } 6 | } 7 | 8 | SECRET_KEY = 'top-secret' 9 | -------------------------------------------------------------------------------- /tests/test_djangoitem.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | os.environ['DJANGO_SETTINGS_MODULE'] = 'tests.settings' 5 | import django 6 | django.setup() 7 | 8 | from scrapy_djangoitem import DjangoItem, Field 9 | from tests.models import Person, IdentifiedPerson 10 | 11 | 12 | class BasePersonItem(DjangoItem): 13 | django_model = Person 14 | 15 | 16 | class NewFieldPersonItem(BasePersonItem): 17 | other = Field() 18 | 19 | 20 | class OverrideFieldPersonItem(BasePersonItem): 21 | age = Field() 22 | 23 | 24 | class IdentifiedPersonItem(DjangoItem): 25 | django_model = IdentifiedPerson 26 | 27 | 28 | class DjangoItemTest(unittest.TestCase): 29 | 30 | def assertSortedEqual(self, first, second, msg=None): 31 | return self.assertEqual(sorted(first), sorted(second), msg) 32 | 33 | def test_base(self): 34 | i = BasePersonItem() 35 | self.assertSortedEqual(i.fields.keys(), ['age', 'name']) 36 | 37 | def test_new_fields(self): 38 | i = NewFieldPersonItem() 39 | self.assertSortedEqual(i.fields.keys(), ['age', 'other', 'name']) 40 | 41 | def test_override_field(self): 42 | i = OverrideFieldPersonItem() 43 | self.assertSortedEqual(i.fields.keys(), ['age', 'name']) 44 | 45 | def test_custom_primary_key_field(self): 46 | """ 47 | Test that if a custom primary key exists, it is 48 | in the field list. 49 | """ 50 | i = IdentifiedPersonItem() 51 | self.assertSortedEqual(i.fields.keys(), ['age', 'identifier', 'name']) 52 | 53 | def test_save(self): 54 | i = BasePersonItem() 55 | self.assertSortedEqual(i.fields.keys(), ['age', 'name']) 56 | 57 | i['name'] = 'John' 58 | i['age'] = '22' 59 | person = i.save(commit=False) 60 | 61 | self.assertEqual(person.name, 'John') 62 | self.assertEqual(person.age, '22') 63 | 64 | def test_override_save(self): 65 | i = OverrideFieldPersonItem() 66 | 67 | i['name'] = 'John' 68 | # it is not obvious that "age" should be saved also, since it was 69 | # redefined in child class 70 | i['age'] = '22' 71 | person = i.save(commit=False) 72 | 73 | self.assertEqual(person.name, 'John') 74 | self.assertEqual(person.age, '22') 75 | 76 | def test_validation(self): 77 | long_name = 'z' * 300 78 | i = BasePersonItem(name=long_name) 79 | self.assertFalse(i.is_valid()) 80 | self.assertEqual(set(i.errors), set(['age', 'name'])) 81 | i = BasePersonItem(name='John') 82 | self.assertTrue(i.is_valid(exclude=['age'])) 83 | self.assertEqual({}, i.errors) 84 | 85 | # once the item is validated, it does not validate again 86 | i['name'] = long_name 87 | self.assertTrue(i.is_valid()) 88 | 89 | def test_override_validation(self): 90 | i = OverrideFieldPersonItem() 91 | i['name'] = 'John' 92 | self.assertFalse(i.is_valid()) 93 | 94 | i = i = OverrideFieldPersonItem() 95 | i['name'] = 'John' 96 | i['age'] = '22' 97 | self.assertTrue(i.is_valid()) 98 | 99 | def test_default_field_values(self): 100 | i = BasePersonItem() 101 | person = i.save(commit=False) 102 | self.assertEqual(person.name, 'Robot') 103 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # Tox (http://tox.testrun.org/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py27,py34,py35 8 | 9 | [testenv] 10 | deps = 11 | -rrequirements.txt 12 | pytest 13 | commands = 14 | py.test {posargs:tests} 15 | 16 | [testenv:py34] 17 | basepython = python3.4 18 | deps = 19 | -rrequirements-py3.txt 20 | pytest 21 | 22 | [testenv:py35] 23 | basepython = python3.5 24 | deps = {[testenv:py34]deps} 25 | --------------------------------------------------------------------------------