├── .gitignore
├── LICENSE
├── README.md
├── requirements.txt
├── requirements_dev.txt
├── scrapy_mysql_pipeline
│   ├── __init__.py
│   └── pipeline.py
├── setup.cfg
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
.idea/
.eggs/
*.egg-info/
dist/
ChangeLog
AUTHORS
README.rst
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
MIT License

scrapy_mysql_pipeline: Asynchronous mysql Scrapy item pipeline

Copyright (c) 2017 Iaroslav Russkykh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
![Python 3.6](https://img.shields.io/badge/Python-3.6-blue.svg)

# Pull requests are always welcome

## scrapy-mysql-pipeline
Asynchronous MySQL [Scrapy](https://doc.scrapy.org/en/latest/) item pipeline

#### Installation
```bash
pip install scrapy-mysql-pipeline
```
#### Configuration
Add the pipeline:
```python
ITEM_PIPELINES = {
    'scrapy_mysql_pipeline.MySQLPipeline': 300,
}
```
Default values:
```python
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = None
MYSQL_PASSWORD = ''
MYSQL_DB = None
MYSQL_TABLE = None
MYSQL_UPSERT = False
MYSQL_RETRIES = 3
MYSQL_CLOSE_ON_ERROR = True
MYSQL_CHARSET = 'utf8'
```
The `MYSQL_USER`, `MYSQL_PASSWORD`, `MYSQL_DB`, and `MYSQL_TABLE` settings must be set in `settings.py`.
--------------------------------------------------------------------------------

/requirements.txt:
--------------------------------------------------------------------------------
scrapy>=1.4.0
pymysql>=0.7.11
--------------------------------------------------------------------------------

/requirements_dev.txt:
--------------------------------------------------------------------------------
scrapy>=1.4.0
pymysql>=0.7.11
pypandoc
twine
--------------------------------------------------------------------------------

/scrapy_mysql_pipeline/__init__.py:
--------------------------------------------------------------------------------
from .pipeline import MySQLPipeline

__version__ = '2019.07.19'
__all__ = ['MySQLPipeline', ]
--------------------------------------------------------------------------------
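For reference, a complete `settings.py` configuration might look like the sketch below. Every value is a hypothetical placeholder, not something taken from this repository; only the setting names come from the README. Note that `MYSQL_UPSERT = True` only makes sense if the target table has a `PRIMARY KEY` or `UNIQUE` index for `ON DUPLICATE KEY UPDATE` to match against.

```python
# settings.py -- a sketch; every value below is a placeholder
ITEM_PIPELINES = {
    'scrapy_mysql_pipeline.MySQLPipeline': 300,
}

MYSQL_HOST = 'db.example.com'   # hypothetical host
MYSQL_PORT = 3306
MYSQL_USER = 'scraper'          # hypothetical credentials
MYSQL_PASSWORD = 'secret'
MYSQL_DB = 'crawl'              # hypothetical database name
MYSQL_TABLE = 'items'           # hypothetical target table
MYSQL_UPSERT = True             # emit INSERT ... ON DUPLICATE KEY UPDATE
```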
/scrapy_mysql_pipeline/pipeline.py:
--------------------------------------------------------------------------------
"""
MIT License

scrapy_mysql_pipeline: Asynchronous mysql Scrapy item pipeline

Copyright (c) 2017 Iaroslav Russkykh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import logging
import pprint

from pymysql import OperationalError
from pymysql.constants.CR import CR_SERVER_GONE_ERROR, CR_SERVER_LOST, CR_CONNECTION_ERROR
from pymysql.cursors import DictCursor
from twisted.enterprise import adbapi
from twisted.internet import defer

logger = logging.getLogger(__name__)
logger.setLevel('DEBUG')


class MySQLPipeline(object):
    """
    Defaults:
        MYSQL_HOST = 'localhost'
        MYSQL_PORT = 3306
        MYSQL_USER = None
        MYSQL_PASSWORD = ''
        MYSQL_DB = None
        MYSQL_TABLE = None
        MYSQL_UPSERT = False
        MYSQL_RETRIES = 3
        MYSQL_CLOSE_ON_ERROR = True
        MYSQL_CHARSET = 'utf8'
    Pipeline:
        ITEM_PIPELINES = {
            'scrapy_mysql_pipeline.MySQLPipeline': 300,
        }
    """
    stats_name = 'mysql_pipeline'

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.stats = crawler.stats
        self.settings = crawler.settings
        db_args = {
            'host': self.settings.get('MYSQL_HOST', 'localhost'),
            'port': self.settings.get('MYSQL_PORT', 3306),
            'user': self.settings.get('MYSQL_USER', None),
            'password': self.settings.get('MYSQL_PASSWORD', ''),
            'db': self.settings.get('MYSQL_DB', None),
            'charset': self.settings.get('MYSQL_CHARSET', 'utf8'),
            'cursorclass': DictCursor,
            'cp_reconnect': True,
        }
        self.retries = self.settings.get('MYSQL_RETRIES', 3)
        self.close_on_error = self.settings.get('MYSQL_CLOSE_ON_ERROR', True)
        self.upsert = self.settings.get('MYSQL_UPSERT', False)
        self.table = self.settings.get('MYSQL_TABLE', None)
        self.db = adbapi.ConnectionPool('pymysql', **db_args)

    def close_spider(self, spider):
        self.db.close()

    @staticmethod
    def preprocess_item(item):
        """Can be useful with an extremely straight-line spider design that
        skips item loaders, or items altogether.
        CAVEAT: in my opinion, if you want to write something here you should
        read http://scrapy.readthedocs.io/en/latest/topics/loaders.html first.
        """
        return item
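
    # Sketch of a possible override (not part of the original module),
    # assuming a hypothetical item with a `tags` list field that should be
    # stored in a single TEXT column:
    #
    #     import json
    #
    #     class JSONFieldMySQLPipeline(MySQLPipeline):
    #         @staticmethod
    #         def preprocess_item(item):
    #             if 'tags' in item:
    #                 item['tags'] = json.dumps(item['tags'])
    #             return item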

    def postprocess_item(self, *args):
        """Can be useful if you need to update other tables depending on the
        result of the MySQL query."""
        pass

    @defer.inlineCallbacks
    def process_item(self, item, spider):
        retries = self.retries
        status = False
        while retries:
            try:
                item = self.preprocess_item(item)
                yield self.db.runInteraction(self._process_item, item)
            except OperationalError as e:
                if e.args[0] in (
                    CR_SERVER_GONE_ERROR,
                    CR_SERVER_LOST,
                    CR_CONNECTION_ERROR,
                ):
                    retries -= 1
                    logger.info('%s %s attempts to reconnect left', e, retries)
                    self.stats.inc_value('{}/reconnects'.format(self.stats_name))
                    continue
                logger.exception('%s', pprint.pformat(item))
                self.stats.inc_value('{}/errors'.format(self.stats_name))
                break  # non-connection error: don't retry, keep the spider running
            except Exception:
                logger.exception('%s', pprint.pformat(item))
                self.stats.inc_value('{}/errors'.format(self.stats_name))
                break  # unexpected error: don't retry, keep the spider running
            else:
                status = True  # executed without errors
                break
        else:  # all reconnect attempts exhausted
            if self.close_on_error:  # Close spider if connection error happened and MYSQL_CLOSE_ON_ERROR = True
                spider.crawler.engine.close_spider(spider, '{}_fatal_error'.format(self.stats_name))
        self.postprocess_item(item, status)
        defer.returnValue(item)  # set the Deferred's result so the item continues down the pipeline

    def _generate_sql(self, data):
        columns = lambda d: ', '.join(['`{}`'.format(k) for k in d])
        values = lambda d: list(d.values())
        placeholders = lambda d: ', '.join(['%s'] * len(d))
        if self.upsert:
            sql_template = 'INSERT INTO `{}` ( {} ) VALUES ( {} ) ON DUPLICATE KEY UPDATE {}'
            on_duplicate_placeholders = lambda d: ', '.join(['`{}` = %s'.format(k) for k in d])
            return (
                sql_template.format(
                    self.table, columns(data),
                    placeholders(data), on_duplicate_placeholders(data)
                ),
                values(data) + values(data)  # bound twice: VALUES clause, then UPDATE clause
            )
        else:
            sql_template = 'INSERT INTO `{}` ( {} ) VALUES ( {} )'
            return (
                sql_template.format(self.table, columns(data), placeholders(data)),
                values(data)
            )

    def _process_item(self, tx, row):
        """Runs in the adbapi thread pool; `tx` is a pymysql cursor."""
        sql, data = self._generate_sql(row)
        try:
            tx.execute(sql, data)
        except Exception:
            logger.error("SQL: %s", sql)
            raise
        self.stats.inc_value('{}/saved'.format(self.stats_name))
--------------------------------------------------------------------------------
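To make the query generation concrete: for a two-field item, `_generate_sql` in upsert mode produces one parameterized statement, with the value list doubled because each value is bound once in the `VALUES` clause and once in the `UPDATE` clause. A sketch with a hypothetical `items` table:

```python
# Assuming self.table = 'items', self.upsert = True, and
# data = {'url': 'http://example.com', 'title': 'Example'},
# _generate_sql(data) returns:
sql = (
    'INSERT INTO `items` ( `url`, `title` ) VALUES ( %s, %s ) '
    'ON DUPLICATE KEY UPDATE `url` = %s, `title` = %s'
)
args = ['http://example.com', 'Example', 'http://example.com', 'Example']
```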
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
name = scrapy-mysql-pipeline
author = Iaroslav Russkykh
author-email = iarruss@ya.ru
summary = Asynchronous mysql Scrapy item pipeline
license = MIT
description-file = README.rst
home-page = https://github.com/IaroslavR/scrapy-mysql-pipeline
requires-python = >=2.7
classifier =
    Development Status :: 4 - Beta
    Framework :: Scrapy
    Intended Audience :: Developers
    License :: OSI Approved :: MIT License
    Operating System :: OS Independent
    Programming Language :: Python
    Topic :: Internet :: WWW/HTTP

[files]
packages =
    scrapy_mysql_pipeline

[wheel]
universal = 1
--------------------------------------------------------------------------------

/setup.py:
--------------------------------------------------------------------------------
import os

import setuptools

if "PY_DEV" in os.environ:
    # Regenerate README.rst from README.md before building a release.
    import pypandoc
    with open('README.rst', 'w') as f:
        f.write(pypandoc.convert('README.md', 'rst'))
else:
    os.environ.update(SKIP_WRITE_GIT_CHANGELOG='1')
    os.environ.update(SKIP_GENERATE_AUTHORS='1')

setuptools.setup(
    setup_requires=['pbr', ],
    pbr=True,
)
--------------------------------------------------------------------------------
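Assuming the release flow implied by the `PY_DEV` switch and the `pypandoc`/`twine` entries in requirements_dev.txt, a maintainer might build and publish roughly like this (a sketch, not a documented procedure):

```bash
pip install -r requirements_dev.txt
PY_DEV=1 python setup.py sdist bdist_wheel  # PY_DEV regenerates README.rst via pypandoc
twine upload dist/*
```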