├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── pgpipeline └── __init__.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | .venv/ 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | .vscode 92 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Dotan Nahum 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | init: 2 | pip install -r requirements.txt 3 | install: 4 | pip install -e . 5 | test: 6 | nosetests tests 7 | dist: 8 | rm -rf dist 9 | python setup.py sdist bdist_wheel 10 | release: 11 | twine upload dist/* 12 | .PHONY: init install test dist 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pgpipeline: automatic postgres pipeline for Scrapy 2 | 3 | A Scrapy pipeline module to persist items to a postgres table automatically. 4 | 5 | 6 | ## Quick Start 7 | 8 | Here's an example showing automatic item pipeline, with a custom `JSONB` field. 9 | 10 | ```python 11 | # settings.py 12 | from sqlalchemy.dialects.postgresql import JSONB 13 | 14 | ITEM_PIPELINES = { 15 | 'pgpipeline.PgPipeline': 300, 16 | } 17 | 18 | PG_PIPELINE = { 19 | 'connection': 'postgresql://localhost:5432/scrapy_db', 20 | 'table_name': 'demo_items', 21 | 'pkey': 'item_id', 22 | 'ignore_identical': ['item_id', 'job_id'], 23 | 'types': { 24 | 'some_data': JSONB 25 | }, 26 | 'onconflict': 'upsert' 27 | } 28 | ``` 29 | 30 | All columns, tables, and indices are automatically created. 31 | 32 | * `pkey`: a primary key for this item (other than database-generated `id`) 33 | * `ignore_identical`: these are a set of fields by which we identify duplicates and skip insert. 34 | * `types`: keys specified here will be using the type given, otherwise types are guessed. 35 | * `onconflict`: upsert|ignore|non-null - `ignore` will skip inserting on conflict and `upsert` will update. `non-null` will upsert only values that are not `None` and thus avoid removing existing values. 36 | ## Developers 37 | 38 | Set up a development environment 39 | ``` 40 | $ pip install -r requirements.txt 41 | ``` 42 | 43 | ### Development 44 | 45 | * Dependencies: list them in `requirements.txt` 46 | 47 | ### Release 48 | 49 | * Dependencies: list them in `setup.py` under `install_requires`: 50 | 51 | ```python 52 | install_requires=['peppercorn'], 53 | ``` 54 | 55 | Then: 56 | 57 | ``` 58 | $ make dist && make release 59 | ``` 60 | 61 | # Contributing 62 | 63 | Fork, implement, add tests, pull request, get my everlasting thanks and a respectable place here :). 64 | 65 | 66 | ### Thanks: 67 | 68 | To all [Contributors](https://github.com/jondot/pgpipeline/graphs/contributors) - you make this happen, thanks! 69 | 70 | 71 | # Copyright 72 | 73 | Copyright (c) 2017 [Dotan Nahum](http://gplus.to/dotan) [@jondot](http://twitter.com/jondot). See [LICENSE](LICENSE) for further details. 74 | -------------------------------------------------------------------------------- /pgpipeline/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import dataset 3 | from sqlalchemy.dialects.postgresql import JSONB 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class PgPipeline(object): 8 | def __init__(self, **kwargs): 9 | self.args = kwargs 10 | 11 | @classmethod 12 | def from_crawler(cls, crawler): 13 | args = crawler.settings.get('PG_PIPELINE', {}) 14 | return cls(**args) 15 | 16 | def open_spider(self, spider): 17 | if self.args.get('connection'): 18 | self.db = dataset.connect(self.args.get('connection')) 19 | self.table = self.db[self.args.get('table_name')] 20 | self.pkey = self.args.get('pkey') 21 | self.types = self.args.get('types', {}) 22 | self.ignore_identical = self.args.get('ignore_identical') 23 | self.table.create_index([self.pkey]) 24 | self.table.create_index(self.ignore_identical) 25 | self.onconflict = self.args.get('onconflict', 'ignore') 26 | 27 | self.enabled = True 28 | 29 | def process_item(self, item, spider): 30 | if self.enabled: 31 | if self.onconflict == 'ignore': 32 | logger.debug("SAVE(ignore) %s", item) 33 | self.table.insert_ignore( 34 | item, self.ignore_identical, types=self.types) 35 | elif self.onconflict == 'upsert': 36 | logger.debug("SAVE(upsert) %s", item) 37 | self.table.upsert( 38 | item, self.ignore_identical, types=self.types) 39 | elif self.onconflict == 'non-null': 40 | logger.debug("SAVE(non-null) %s", item) 41 | row, res = self.table._upsert_pre_check( 42 | item, self.ignore_identical, None) 43 | selected = item 44 | if res is not None: 45 | # remove keys with none value 46 | selected = dict((k, v) for k, v in item.iteritems() if v) 47 | self.table.upsert( 48 | selected, self.ignore_identical, types=self.types) 49 | else: 50 | self.table.insert( 51 | selected, self.ignore_identical, types=self.types) 52 | else: 53 | raise Exception("no such strategy: %s" % (self.onconflict)) 54 | 55 | else: 56 | logger.debug("DISABLED") 57 | return item 58 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | nose 2 | sphinx 3 | inquirer 4 | wheel 5 | twine 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from setuptools import setup, find_packages 4 | 5 | with open('README.md') as f: 6 | readme = f.read() 7 | 8 | with open('LICENSE') as f: 9 | license = f.read() 10 | 11 | setup( 12 | name='pgpipeline', 13 | version='0.4.0', 14 | description='Pgpipeline: An automatic postgres item pipeline for Scrapy', 15 | long_description=readme, 16 | author='Dotan Nahum', 17 | author_email='jondotan@gmail.com', 18 | url='https://github.com/jondot/pgpipeline', 19 | license=license, 20 | packages=find_packages(exclude=('tests', 'docs')), 21 | install_requires=['dataset']) 22 | --------------------------------------------------------------------------------