├── .gitignore
├── LICENSE
├── README.md
├── requirements.txt
├── requirements_dev.txt
├── scrapy_mysql_pipeline
│   ├── __init__.py
│   └── pipeline.py
├── setup.cfg
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
.idea/
.eggs/
*.egg-info/
dist/
ChangeLog
AUTHORS
README.rst
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
MIT License

scrapy_mysql_pipeline: Asynchronous mysql Scrapy item pipeline

Copyright (c) 2017 Iaroslav Russkykh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
![Python 3.6](https://img.shields.io/badge/Python-3.6-blue.svg)

# Pull requests are always welcome

## scrapy-mysql-pipeline
Asynchronous MySQL [Scrapy](https://doc.scrapy.org/en/latest/) item pipeline

#### Installation
```bash
pip install scrapy-mysql-pipeline
```
#### Configuration
Add the pipeline:
```python
ITEM_PIPELINES = {
    'scrapy_mysql_pipeline.MySQLPipeline': 300,
}
```
Default values:
```python
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = None
MYSQL_PASSWORD = ''
MYSQL_DB = None
MYSQL_TABLE = None
MYSQL_UPSERT = False
MYSQL_RETRIES = 3
MYSQL_CLOSE_ON_ERROR = True
MYSQL_CHARSET = 'utf8'
```
The `MYSQL_USER`, `MYSQL_PASSWORD`, `MYSQL_DB`, and `MYSQL_TABLE` settings must be set in `settings.py`.
--------------------------------------------------------------------------------

/requirements.txt:
--------------------------------------------------------------------------------
scrapy>=1.4.0
pymysql>=0.7.11
--------------------------------------------------------------------------------

/requirements_dev.txt:
--------------------------------------------------------------------------------
scrapy>=1.4.0
pymysql>=0.7.11
pypandoc
twine
--------------------------------------------------------------------------------

/scrapy_mysql_pipeline/__init__.py:
--------------------------------------------------------------------------------
from .pipeline import MySQLPipeline

__version__ = '2019.07.19'
__all__ = ['MySQLPipeline', ]
--------------------------------------------------------------------------------
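For reference, a complete `settings.py` configuration might look like the sketch below. Every value is a hypothetical placeholder, not something taken from this repository; only the setting names come from the README. Note that `MYSQL_UPSERT = True` only makes sense if the target table has a `PRIMARY KEY` or `UNIQUE` index for `ON DUPLICATE KEY UPDATE` to match against.

```python
# settings.py -- a sketch; every value below is a placeholder
ITEM_PIPELINES = {
    'scrapy_mysql_pipeline.MySQLPipeline': 300,
}

MYSQL_HOST = 'db.example.com'   # hypothetical host
MYSQL_PORT = 3306
MYSQL_USER = 'scraper'          # hypothetical credentials
MYSQL_PASSWORD = 'secret'
MYSQL_DB = 'crawl'              # hypothetical database name
MYSQL_TABLE = 'items'           # hypothetical target table
MYSQL_UPSERT = True             # emit INSERT ... ON DUPLICATE KEY UPDATE
```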
/scrapy_mysql_pipeline/pipeline.py:
--------------------------------------------------------------------------------
"""
MIT License

scrapy_mysql_pipeline: Asynchronous mysql Scrapy item pipeline

Copyright (c) 2017 Iaroslav Russkykh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import logging
import pprint

from pymysql import OperationalError
from pymysql.constants.CR import CR_SERVER_GONE_ERROR, CR_SERVER_LOST, CR_CONNECTION_ERROR
from pymysql.cursors import DictCursor
from twisted.enterprise import adbapi
from twisted.internet import defer

logger = logging.getLogger(__name__)
logger.setLevel('DEBUG')


class MySQLPipeline(object):
    """
    Defaults:
        MYSQL_HOST = 'localhost'
        MYSQL_PORT = 3306
        MYSQL_USER = None
        MYSQL_PASSWORD = ''
        MYSQL_DB = None
        MYSQL_TABLE = None
        MYSQL_UPSERT = False
        MYSQL_RETRIES = 3
        MYSQL_CLOSE_ON_ERROR = True
        MYSQL_CHARSET = 'utf8'
    Pipeline:
        ITEM_PIPELINES = {
            'scrapy_mysql_pipeline.MySQLPipeline': 300,
        }
    """
    stats_name = 'mysql_pipeline'

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.stats = crawler.stats
        self.settings = crawler.settings
        db_args = {
            'host': self.settings.get('MYSQL_HOST', 'localhost'),
            'port': self.settings.get('MYSQL_PORT', 3306),
            'user': self.settings.get('MYSQL_USER', None),
            'password': self.settings.get('MYSQL_PASSWORD', ''),
            'db': self.settings.get('MYSQL_DB', None),
            'charset': self.settings.get('MYSQL_CHARSET', 'utf8'),
            'cursorclass': DictCursor,
            'cp_reconnect': True,
        }
        self.retries = self.settings.get('MYSQL_RETRIES', 3)
        self.close_on_error = self.settings.get('MYSQL_CLOSE_ON_ERROR', True)
        self.upsert = self.settings.get('MYSQL_UPSERT', False)
        self.table = self.settings.get('MYSQL_TABLE', None)
        self.db = adbapi.ConnectionPool('pymysql', **db_args)

    def close_spider(self, spider):
        self.db.close()

    @staticmethod
    def preprocess_item(item):
        """Can be useful with an extremely straight-line spider design that
        skips item loaders, or items altogether.
        CAVEAT: in my opinion, if you want to write something here you should
        read http://scrapy.readthedocs.io/en/latest/topics/loaders.html first.
        """
        return item
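
    # Sketch of a possible override (not part of the original module),
    # assuming a hypothetical item with a `tags` list field that should be
    # stored in a single TEXT column:
    #
    #     import json
    #
    #     class JSONFieldMySQLPipeline(MySQLPipeline):
    #         @staticmethod
    #         def preprocess_item(item):
    #             if 'tags' in item:
    #                 item['tags'] = json.dumps(item['tags'])
    #             return item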

    def postprocess_item(self, *args):
        """Can be useful if you need to update other tables depending on the
        result of the MySQL query."""
        pass

    @defer.inlineCallbacks
    def process_item(self, item, spider):
        retries = self.retries
        status = False
        while retries:
            try:
                item = self.preprocess_item(item)
                yield self.db.runInteraction(self._process_item, item)
            except OperationalError as e:
                if e.args[0] in (
                    CR_SERVER_GONE_ERROR,
                    CR_SERVER_LOST,
                    CR_CONNECTION_ERROR,
                ):
                    retries -= 1
                    logger.info('%s %s attempts to reconnect left', e, retries)
                    self.stats.inc_value('{}/reconnects'.format(self.stats_name))
                    continue
                logger.exception('%s', pprint.pformat(item))
                self.stats.inc_value('{}/errors'.format(self.stats_name))
                break  # non-connection error: don't retry, keep the spider running
            except Exception:
                logger.exception('%s', pprint.pformat(item))
                self.stats.inc_value('{}/errors'.format(self.stats_name))
                break  # unexpected error: don't retry, keep the spider running
            else:
                status = True  # executed without errors
                break
        else:  # all reconnect attempts exhausted
            if self.close_on_error:  # Close spider if connection error happened and MYSQL_CLOSE_ON_ERROR = True
                spider.crawler.engine.close_spider(spider, '{}_fatal_error'.format(self.stats_name))
        self.postprocess_item(item, status)
        defer.returnValue(item)  # set the Deferred's result so the item continues down the pipeline

    def _generate_sql(self, data):
        columns = lambda d: ', '.join(['`{}`'.format(k) for k in d])
        values = lambda d: list(d.values())
        placeholders = lambda d: ', '.join(['%s'] * len(d))
        if self.upsert:
            sql_template = 'INSERT INTO `{}` ( {} ) VALUES ( {} ) ON DUPLICATE KEY UPDATE {}'
            on_duplicate_placeholders = lambda d: ', '.join(['`{}` = %s'.format(k) for k in d])
            return (
                sql_template.format(
                    self.table, columns(data),
                    placeholders(data), on_duplicate_placeholders(data)
                ),
                values(data) + values(data)  # bound twice: VALUES clause, then UPDATE clause
            )
        else:
            sql_template = 'INSERT INTO `{}` ( {} ) VALUES ( {} )'
            return (
                sql_template.format(self.table, columns(data), placeholders(data)),
                values(data)
            )

    def _process_item(self, tx, row):
        """Runs in the adbapi thread pool; `tx` is a pymysql cursor."""
        sql, data = self._generate_sql(row)
        try:
            tx.execute(sql, data)
        except Exception:
            logger.error("SQL: %s", sql)
            raise
        self.stats.inc_value('{}/saved'.format(self.stats_name))
--------------------------------------------------------------------------------
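To make the query generation concrete: for a two-field item, `_generate_sql` in upsert mode produces one parameterized statement, with the value list doubled because each value is bound once in the `VALUES` clause and once in the `UPDATE` clause. A sketch with a hypothetical `items` table:

```python
# Assuming self.table = 'items', self.upsert = True, and
# data = {'url': 'http://example.com', 'title': 'Example'},
# _generate_sql(data) returns:
sql = (
    'INSERT INTO `items` ( `url`, `title` ) VALUES ( %s, %s ) '
    'ON DUPLICATE KEY UPDATE `url` = %s, `title` = %s'
)
args = ['http://example.com', 'Example', 'http://example.com', 'Example']
```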
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
name = scrapy-mysql-pipeline
author = Iaroslav Russkykh
author-email = iarruss@ya.ru
summary = Asynchronous mysql Scrapy item pipeline
license = MIT
description-file = README.rst
home-page = https://github.com/IaroslavR/scrapy-mysql-pipeline
requires-python = >=2.7
classifier =
    Development Status :: 4 - Beta
    Framework :: Scrapy
    Intended Audience :: Developers
    License :: OSI Approved :: MIT License
    Operating System :: OS Independent
    Programming Language :: Python
    Topic :: Internet :: WWW/HTTP

[files]
packages =
    scrapy_mysql_pipeline

[wheel]
universal = 1
--------------------------------------------------------------------------------

/setup.py:
--------------------------------------------------------------------------------
import os

import setuptools

if "PY_DEV" in os.environ:
    # Regenerate README.rst from README.md before building a release.
    import pypandoc
    with open('README.rst', 'w') as f:
        f.write(pypandoc.convert('README.md', 'rst'))
else:
    os.environ.update(SKIP_WRITE_GIT_CHANGELOG='1')
    os.environ.update(SKIP_GENERATE_AUTHORS='1')

setuptools.setup(
    setup_requires=['pbr', ],
    pbr=True,
)
--------------------------------------------------------------------------------
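Assuming the release flow implied by the `PY_DEV` switch and the `pypandoc`/`twine` entries in requirements_dev.txt, a maintainer might build and publish roughly like this (a sketch, not a documented procedure):

```bash
pip install -r requirements_dev.txt
PY_DEV=1 python setup.py sdist bdist_wheel  # PY_DEV regenerates README.rst via pypandoc
twine upload dist/*
```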