├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── pgpipeline
    └── __init__.py
├── requirements.txt
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | .venv/
83 | venv/
84 | ENV/
85 | 
86 | # Spyder project settings
87 | .spyderproject
88 | 
89 | # Rope project settings
90 | .ropeproject
91 | .vscode
92 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017 Dotan Nahum
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
20 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | init:  # install the development dependencies
 2 | 	pip install -r requirements.txt
 3 | install:  # install the package in editable mode
 4 | 	pip install -e .
 5 | test:  # run the test suite
 6 | 	nosetests tests
 7 | dist:  # rebuild the source and wheel distributions from scratch
 8 | 	rm -rf dist
 9 | 	python setup.py sdist bdist_wheel
10 | release:  # upload the built distributions
11 | 	twine upload dist/*
12 | .PHONY: init install test dist release
13 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Pgpipeline: automatic postgres pipeline for Scrapy
 2 | 
 3 | A Scrapy pipeline module to persist items to a postgres table automatically.
 4 | 
 5 | 
 6 | ## Quick Start
 7 | 
 8 | Here's an example showing the automatic item pipeline with a custom `JSONB` field.
 9 | 
10 | ```python
11 | # settings.py
12 | from sqlalchemy.dialects.postgresql import JSONB
13 | 
14 | ITEM_PIPELINES = {
15 |     'pgpipeline.PgPipeline': 300,
16 | }
17 | 
18 | PG_PIPELINE = {
19 |     'connection': 'postgresql://localhost:5432/scrapy_db',
20 |     'table_name': 'demo_items',
21 |     'pkey': 'item_id',
22 |     'ignore_identical': ['item_id', 'job_id'],
23 |     'types': {
24 |         'some_data': JSONB
25 |     },
26 |     'onconflict': 'upsert'
27 | }
28 | ```
29 | 
30 | All columns, tables, and indices are automatically created.
31 | 
32 | * `pkey`: a primary key for this item (other than database-generated `id`)
33 | * `ignore_identical`: the set of fields used to identify an already-stored item; a matching item is skipped or updated according to `onconflict`.
34 | * `types`: keys specified here will be using the type given, otherwise types are guessed.
35 | * `onconflict`: one of `upsert`, `ignore` (the default) or `non-null`. `ignore` skips the insert on conflict, `upsert` updates the existing row, and `non-null` upserts only the values that are not `None`, so existing values are not replaced with nulls.
36 | ## Developers
37 | 
38 | Set up a development environment:
39 | ```
40 | $ pip install -r requirements.txt
41 | ```
42 | 
43 | ### Development
44 | 
45 | * Dependencies: list them in `requirements.txt`
46 | 
47 | ### Release
48 | 
49 | * Dependencies: list them in `setup.py` under `install_requires`:
50 | 
51 | ```python
52 | install_requires=['dataset'],
53 | ```
54 | 
55 | Then:
56 | 
57 | ```
58 | $ make dist && make release
59 | ```
60 | 
61 | # Contributing
62 | 
63 | Fork, implement, add tests, pull request, get my everlasting thanks and a respectable place here :).
64 | 
65 | 
66 | ### Thanks:
67 | 
68 | To all [Contributors](https://github.com/jondot/pgpipeline/graphs/contributors) - you make this happen, thanks!
69 | 
70 | 
71 | # Copyright
72 | 
73 | Copyright (c) 2017 [Dotan Nahum](http://gplus.to/dotan) [@jondot](http://twitter.com/jondot). See [LICENSE](LICENSE) for further details.
74 | 


--------------------------------------------------------------------------------
/pgpipeline/__init__.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import dataset
 3 | from sqlalchemy.dialects.postgresql import JSONB
 4 | logger = logging.getLogger(__name__)
 5 | 
 6 | 
 7 | class PgPipeline(object):
 8 |     def __init__(self, **kwargs):
 9 |         self.args = kwargs
10 | 
11 |     @classmethod
12 |     def from_crawler(cls, crawler):
13 |         args = crawler.settings.get('PG_PIPELINE', {})
14 |         return cls(**args)
15 | 
16 |     def open_spider(self, spider):
17 |         if self.args.get('connection'):
18 |             self.db = dataset.connect(self.args.get('connection'))
19 |             self.table = self.db[self.args.get('table_name')]
20 |             self.pkey = self.args.get('pkey')
21 |             self.types = self.args.get('types', {})
22 |             self.ignore_identical = self.args.get('ignore_identical')
23 |             self.table.create_index([self.pkey])
24 |             self.table.create_index(self.ignore_identical)
25 |             self.onconflict = self.args.get('onconflict', 'ignore')
26 | 
27 |             self.enabled = True
28 | 
29 |     def process_item(self, item, spider):
30 |         if self.enabled:
31 |             if self.onconflict == 'ignore':
32 |                 logger.debug("SAVE(ignore) %s", item)
33 |                 self.table.insert_ignore(
34 |                     item, self.ignore_identical, types=self.types)
35 |             elif self.onconflict == 'upsert':
36 |                 logger.debug("SAVE(upsert) %s", item)
37 |                 self.table.upsert(
38 |                     item, self.ignore_identical, types=self.types)
39 |             elif self.onconflict == 'non-null':
40 |                 logger.debug("SAVE(non-null) %s", item)
41 |                 row, res = self.table._upsert_pre_check(
42 |                     item, self.ignore_identical, None)
43 |                 selected = item
44 |                 if res is not None:
45 |                     # remove keys with none value
46 |                     selected = dict((k, v) for k, v in item.iteritems() if v)
47 |                     self.table.upsert(
48 |                         selected, self.ignore_identical, types=self.types)
49 |                 else:
50 |                     self.table.insert(
51 |                         selected, self.ignore_identical, types=self.types)
52 |             else:
53 |                 raise Exception("no such strategy: %s" % (self.onconflict))
54 | 
55 |         else:
56 |             logger.debug("DISABLED")
57 |         return item
58 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | nose
2 | sphinx
3 | inquirer
4 | wheel
5 | twine
6 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from setuptools import setup, find_packages
 4 | 
 5 | with open('README.md') as f:
 6 |     readme = f.read()
 7 | 
 8 | with open('LICENSE') as f:
 9 |     license = f.read()
10 | 
11 | setup(
12 |     name='pgpipeline',
13 |     version='0.4.0',
14 |     description='Pgpipeline: An automatic postgres item pipeline for Scrapy',
15 |     long_description=readme,
16 |     author='Dotan Nahum',
17 |     author_email='jondotan@gmail.com',
18 |     url='https://github.com/jondot/pgpipeline',
19 |     license=license,
20 |     packages=find_packages(exclude=('tests', 'docs')),
21 |     install_requires=['dataset'])
22 | 


--------------------------------------------------------------------------------