├── tests ├── __init__.py ├── test_entities.py ├── chromeget.py ├── test_torproxy.py ├── test_processors.py └── test_sqlfeedstorage.py ├── scrapy_plus ├── __init__.py ├── utils │ ├── __init__.py │ ├── starturls.py │ ├── spiders.py │ └── parser.py ├── extensions │ ├── __init__.py │ ├── oss.py │ └── sql.py ├── pipelines │ ├── __init__.py │ ├── mongo.py │ ├── images.py │ └── files.py ├── dupefilters │ ├── __init__.py │ ├── redis.py │ └── redisbloom.py ├── spiders │ ├── __init__.py │ ├── taobao.py │ ├── netease.py │ └── douban.py ├── items │ ├── __init__.py │ ├── NewsItem.py │ ├── ProductItem.py │ └── BookItem.py ├── middlewares │ ├── __init__.py │ ├── proxy.py │ ├── autologin.py │ ├── chrome.py │ ├── tor.py │ ├── ua.py │ ├── huaban.py │ └── splash.py └── processors.py ├── requirements.txt ├── setup.py ├── .gitignore └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_plus/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_plus/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scrapy_plus/extensions/__init__.py: -------------------------------------------------------------------------------- 1 | from .oss import OSSFeedStorage 2 | from .sql import SQLFeedStorage, SQLItemExporter 3 | -------------------------------------------------------------------------------- /scrapy_plus/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .mongo import MongoDBPipeline 2 | from .images import ImagesPipeline 3 | 4 | __all__ = ["MongoDBPipeline", "ImagesPipeline"] 5 | -------------------------------------------------------------------------------- /scrapy_plus/dupefilters/__init__.py: -------------------------------------------------------------------------------- 1 | from .redis import RedisDupeFilter 2 | from .redisbloom import RedisBloomDupeFilter 3 | 4 | __all__ = ["RedisBloomDupeFilter", "RedisDupeFilter"] -------------------------------------------------------------------------------- /scrapy_plus/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | from .douban import BookSpider 2 | from .netease import NeteaseSpider 3 | from .taobao import TaobaoSpider 4 | 5 | __all__ = ["BookSpider", "NeteaseSpider", "TaobaoSpider"] 6 | -------------------------------------------------------------------------------- /scrapy_plus/items/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from collections import defaultdict 3 | import scrapy 4 | from scrapy import Item, Field 5 | from scrapy.loader.processors import Join, MapCompose, Identity 6 | 7 | -------------------------------------------------------------------------------- /scrapy_plus/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | from .autologin import LoginMiddleWare 2 | from .chrome import ChromeMiddleware 3 | from .proxy import RandomProxyMiddleware 4 | from .splash import SplashSpiderMiddleware 5 | from .tor import TorProxyMiddleware 6 | from .ua import RandomUserAgentMiddleware 7 | 
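The __init__ modules above define the package's public interface. As a quick orientation, the following settings.py sketch shows how these components might be enabled in a Scrapy project; the priority numbers, the chosen subset of components, and the placeholder values are illustrative assumptions, while the setting names match those read by the classes in the sources below.

# settings.py (sketch) -- wiring scrapy_plus components into a Scrapy project.
# Priorities and values are illustrative placeholders, not package defaults.
DOWNLOADER_MIDDLEWARES = {
    'scrapy_plus.middlewares.RandomUserAgentMiddleware': 400,
    'scrapy_plus.middlewares.RandomProxyMiddleware': 410,
}
ITEM_PIPELINES = {
    'scrapy_plus.pipelines.MongoDBPipeline': 300,
}
DUPEFILTER_CLASS = 'scrapy_plus.dupefilters.RedisDupeFilter'

# Settings read by the components above (see their from_crawler/from_settings methods).
USER_AGENTS = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36']  # an empty list disables the UA middleware
HTTP_PROXIES = ['http://127.0.0.1:8118']  # placeholder proxy address
MONGODB_SERVER = 'localhost'
MONGODB_PORT = 27017
MONGODB_DB = 'mydb'           # hypothetical database name
MONGODB_COLLECTION = 'items'  # hypothetical collection name
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DUP_DB = 0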
-------------------------------------------------------------------------------- /tests/test_entities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from sqlalchemy.ext.declarative import declarative_base 4 | from sqlalchemy import Column, Integer, String, Float, Text, DateTime 5 | 6 | Base = declarative_base() 7 | 8 | 9 | class Book(Base): 10 | """ 11 | 测试用的数据实体 12 | """ 13 | __tablename__ = "books" 14 | id = Column(Integer, primary_key=True) 15 | name = Column(String) 16 | alias = Column(String) 17 | summary = Column(Text) 18 | 19 | -------------------------------------------------------------------------------- /scrapy_plus/middlewares/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import random 4 | 5 | 6 | class RandomProxyMiddleware(object): 7 | """ 8 | 随机代理。在运行时会从settings.py设置的PROXIES中随机抽取一个作为当前代理地址。 9 | """ 10 | @classmethod 11 | def from_crawler(cls, crawler): 12 | return cls(proxies=crawler.settings.getlist('HTTP_PROXIES')) 13 | 14 | def __init__(self, proxies=[]): 15 | self.proxies = proxies 16 | 17 | def process_request(self, request, spider): 18 | request.meta['proxy'] = random.choice(self.proxies) 19 | -------------------------------------------------------------------------------- /tests/chromeget.py: -------------------------------------------------------------------------------- 1 | # coding=utf8 2 | import os 3 | from selenium import webdriver 4 | from selenium.webdriver.common.keys import Keys 5 | from selenium.webdriver.chrome.options import Options 6 | 7 | # 以无头方式使用 Chrome 而无需再采用PhantomJS的方式 8 | chrome_options = Options() 9 | chrome_options.add_argument("--headless") # 指定采用无头方式 10 | 11 | browser = webdriver.Chrome(executable_path="/usr/local/Caskroom/chromedriver/2.46/chromedriver", chrome_options=chrome_options) 12 | 13 | browser.get("http://www.baidu.com") 14 | #browser.get("https://s.taobao.com/search?q=Vue2&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306") 15 | browser.implicitly_wait(1) 16 | 17 | element = browser.find_element_by_id('kw') 18 | #button = browser.find_element_by_css_selector('form button.btn-search') 19 | 20 | print(element.get_attribute('name')) 21 | browser.close() -------------------------------------------------------------------------------- /scrapy_plus/items/NewsItem.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.item import Item, Field 3 | from scrapy.loader.processors import TakeFirst, MapCompose, Compose, Identity, Join 4 | from w3lib.html import remove_tags 5 | 6 | 7 | class NewsItem(Item): 8 | title = Field(output_processor=TakeFirst()) 9 | desc = Field(input_processor=MapCompose(str.strip, 10 | stop_on_none=True), 11 | output_processor=TakeFirst()) 12 | link = Field(output_processor=TakeFirst()) 13 | pub_date = Field(input_processor=MapCompose(lambda v: v.split()[0], 14 | stop_on_none=True), 15 | output_processor=TakeFirst()) 16 | body = Field(input_processor=MapCompose(remove_tags, str.strip, 17 | stop_on_none=True), 18 | output_processor=TakeFirst()) 19 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aliyun-python-sdk-core==2.13.30 2 | aliyun-python-sdk-core-v3==2.13.11 3 | 
aliyun-python-sdk-kms==2.13.0 4 | attrs==20.3.0 5 | Automat==20.2.0 6 | certifi==2020.12.5 7 | cffi==1.14.4 8 | chardet==4.0.0 9 | constantly==15.1.0 10 | crcmod==1.7 11 | cryptography==3.3.2 12 | cssselect==1.1.0 13 | dateparser==1.0.0 14 | hyperlink==20.0.1 15 | idna==2.10 16 | incremental==17.5.0 17 | itemadapter==0.2.0 18 | itemloaders==1.0.4 19 | jmespath==0.10.0 20 | lxml==4.6.5 21 | oss2==2.13.1 22 | parsel==1.6.0 23 | Protego==0.1.16 24 | pyasn1==0.4.8 25 | pyasn1-modules==0.2.8 26 | pycparser==2.20 27 | pycryptodome==3.9.9 28 | PyDispatcher==2.0.5 29 | PyHamcrest==2.0.2 30 | pymongo==3.11.2 31 | pyOpenSSL==20.0.1 32 | python-dateutil==2.8.1 33 | pytz==2020.4 34 | queuelib==1.5.0 35 | redis==3.5.3 36 | regex==2020.11.13 37 | requests==2.25.1 38 | Scrapy==2.5.1 39 | scrapy-splash==0.8.0 40 | selenium==3.141.0 41 | service-identity==18.1.0 42 | six==1.15.0 43 | SQLAlchemy==1.3.20 44 | stem==1.8.0 45 | Twisted==20.3.0 46 | tzlocal==2.1 47 | urllib3==1.26.5 48 | w3lib==1.22.0 49 | zope.interface==5.2.0 50 | -------------------------------------------------------------------------------- /tests/test_torproxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from scrapy_plus.middlewares.tor import TorProxyMiddleware 5 | from scrapy.http import Request 6 | from urllib3 import ProxyManager 7 | 8 | 9 | class TorProxyMiddlewareTestCase(unittest.TestCase): 10 | 11 | def test_tor_should_change_diff_ips(self): 12 | tor = TorProxyMiddleware(tor_proxy='127.0.0.1:8118', 13 | tor_password='mypassword', 14 | after_times=2) 15 | request = Request(url='http://www.baidu.com') 16 | ip = self.get_ip() 17 | for i in range(1, 10): 18 | tor.process_request(request, None) 19 | if i > 1 and (i % 2) != 0: 20 | new_ip = self.get_ip() 21 | self.assertNotEqual(ip, new_ip) 22 | ip = new_ip 23 | 24 | def get_ip(self): 25 | http = ProxyManager('http://127.0.0.1:8118') 26 | body = http.request('GET', 'http://icanhazip.com') 27 | return str(body.data, 'utf-8').replace('\n', '') 28 | 29 | 30 | if __name__ == '__main__': 31 | unittest.main() 32 | -------------------------------------------------------------------------------- /scrapy_plus/dupefilters/redis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from redis import Redis 4 | from scrapy.dupefilters import BaseDupeFilter 5 | 6 | 7 | 8 | class RedisDupeFilter(BaseDupeFilter): 9 | """ 10 | Redis 去重过滤器 11 | """ 12 | def __init__(self, host='localhost', port=6379, db=0): 13 | self.redis = Redis(host=host, port=port, db=db) 14 | self.logger = logging.getLogger(__name__) 15 | 16 | @classmethod 17 | def from_settings(cls, settings): 18 | host = settings.get('REDIS_HOST', 'localhost') 19 | redis_port = settings.getint('REDIS_PORT') 20 | redis_db = settings.get('REDIS_DUP_DB') 21 | return cls(host, redis_port, redis_db) 22 | 23 | def request_seen(self, request): 24 | fp = request.url 25 | key = 'UrlFingerprints' 26 | if not self.redis.sismember(key, fp): 27 | self.redis.sadd(key, fp) 28 | return False 29 | return True 30 | 31 | def log(self, request, spider): 32 | msg = ("已过滤的重复请求: %(request)s") 33 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 34 | spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider) 35 | 36 | 37 | -------------------------------------------------------------------------------- /scrapy_plus/items/ProductItem.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy import Item, Field 4 | from scrapy.loader.processors import TakeFirst 5 | from scrapy_plus.processors import Text, CleanText, Price, Image, Url, Number 6 | 7 | 8 | class ProductItem(Item): 9 | """ 10 | 商品实体 11 | """ 12 | name = Field(input_processor=CleanText(), 13 | output_processor=TakeFirst()), # 品名 14 | link = Field(input_processor=Url(), 15 | output_processor=TakeFirst()) # 链接地址 16 | image_urls = Field(input_processor=Image(), 17 | output_processor=TakeFirst()) # 产品图片地址 18 | image_files = Field() # 图片下载至本地的位置 19 | price = Field(input_processor=Price(), 20 | output_processor=TakeFirst()) # 价格 21 | deal = Field(input_processor=Number(), 22 | output_processor=TakeFirst()) # 成交人数 23 | free_shipping = Field(input_processor=CleanText(), 24 | output_processor=TakeFirst()) # 是否包邮 25 | shop = Field(input_processor=CleanText(), 26 | output_processor=TakeFirst()) # 淘宝店名 27 | location = Field(input_processor=CleanText(), 28 | output_processor=TakeFirst()) # 地区 29 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from setuptools import setup, find_packages 4 | 5 | with open('requirements.txt') as reqs_file: 6 | REQS = reqs_file.read() 7 | 8 | with open('README.md', encoding='utf-8') as readme_file: 9 | README = readme_file.read() 10 | 11 | setup( 12 | name='scrapy_plus', 13 | version='1.0.5', 14 | packages=find_packages(exclude=["tests"]), 15 | install_requires=REQS, 16 | url='http://www.github.com/dotnetage/scrapy_plus', 17 | license='BSD', 18 | author='Ray', 19 | author_email='csharp2002@hotmail.com', 20 | description="scrapy 常用爬网必备工具包", 21 | long_description=README, 22 | long_description_content_type='text/markdown', 23 | zip_safe=False, 24 | platforms='any', 25 | keywords=('scrapy', 'crawl', 'redis', 'tor'), 26 | classifiers=['Development Status :: 4 - Beta', 27 | 'Intended Audience :: Developers', 28 | 'License :: OSI Approved :: BSD License', 29 | 'Natural Language :: English', 30 | 'Operating System :: OS Independent', 31 | 'Programming Language :: Python :: 3.6', 32 | 'Topic :: Software Development :: Libraries', 33 | 'Topic :: Utilities']) 34 | -------------------------------------------------------------------------------- /tests/test_processors.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import unittest 3 | from scrapy_plus.processors import Number, Text, Date, Price 4 | 5 | 6 | class ProcessorTestCase(unittest.TestCase): 7 | 8 | def test_number_processor(self): 9 | tests_text = "共93.2人" 10 | expected = 93.2 11 | processor = Number() 12 | 13 | actual = processor([tests_text]) 14 | self.assertEqual(actual[0], expected) 15 | 16 | def test_text_processor(self): 17 | tests_text = "
This is a text with some html tags
" 18 | expected_text = "This is a text with some html tags" 19 | processor = Text() 20 | 21 | actual = processor([tests_text]) 22 | self.assertEqual(actual[0], expected_text) 23 | 24 | def test_price_processor(self): 25 | tests_text = "¥24.2 元" 26 | expected = 24.2 27 | processor = Price() 28 | 29 | actual = processor([tests_text]) 30 | self.assertEqual(actual[0], expected) 31 | 32 | def test_date_processor(self): 33 | tests_text = "2015年2月3日" 34 | expected_text = '2015-02-03T00:00:00' 35 | processor = Date() 36 | 37 | actual = processor([tests_text]) 38 | self.assertEqual(actual[0].strftime('%Y-%m-%dT%H:%M:%S'), expected_text) 39 | -------------------------------------------------------------------------------- /scrapy_plus/pipelines/mongo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pymongo 4 | import logging 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class MongoDBPipeline(object): 10 | """ 11 | MongoDB数据管道 12 | 13 | 配置方法: 14 | ITEM_PIPELINES = ['scrapyplus.pipelines.MongoDBPipeline', ] 15 | 16 | MONGODB_SERVER = "localhost" 17 | MONGODB_PORT = 27017 18 | MONGODB_DB = "数据库名" 19 | MONGODB_COLLECTION = "表名" 20 | """ 21 | 22 | def __init__(self, server=None, port=None, db_name=None, col=None): 23 | connection = pymongo.MongoClient(server, port) 24 | db = connection[db_name] 25 | self.collection = db[col] 26 | 27 | @classmethod 28 | def from_settings(cls, settings): 29 | server = settings['MONGODB_SERVER'], 30 | port = settings['MONGODB_PORT'] 31 | db_name = settings['MONGODB_DB'] 32 | collection_name = settings['MONGODB_COLLECTION'] 33 | return cls(server, port, db_name, collection_name) 34 | 35 | def process_item(self, item, spider): 36 | self.collection.insert(dict(item)) 37 | logger.debug("成功将数据插入至MongoDB",extra={'spider':spider}) 38 | spider.crawler.stats.inc_value( 39 | 'mongodb/inserted', spider=spider) 40 | return item 41 | -------------------------------------------------------------------------------- /scrapy_plus/spiders/taobao.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider, Request 3 | import urllib 4 | from ..items import ProductItem 5 | from scrapy.loader import ItemLoader 6 | 7 | 8 | class TaobaoSpider(Spider): 9 | name = 'taobao' 10 | allowed_domains = ['s.taobao.com'] 11 | base_url = 'https://s.taobao.com/search?q=%s' 12 | 13 | def start_requests(self): 14 | keywords = self.gen_keywords() 15 | 16 | for kw in keywords: 17 | url = self.base_url % urllib.parse.quote(kw.encode('utf-8')) 18 | yield Request(url, self.parse, meta={'kw': kw}) 19 | 20 | def gen_keywords(self): 21 | raise NotImplemented 22 | 23 | def parse(self, response): 24 | 25 | products = response.css('#mainsrp-itemlist .items .item') 26 | 27 | for product in products: 28 | loader = ItemLoader(item=ProductItem(), selector=product) 29 | loader.add_css('price', '.price>strong::text') 30 | loader.add_css('name', 'div.title>a::text') 31 | loader.add_css('shop', '.shopname>span::text') 32 | loader.add_css('image_url', '.pic img::attr(data-src)') 33 | loader.add_css('deal', '.deal-cnt::text') 34 | loader.add_css('location', '.location::text') 35 | loader.add_css('link', 'div.title>a::attr(href)') 36 | loader.add_css('free_shipping', '.icon-service-free') 37 | 38 | yield loader.load_item() 39 | -------------------------------------------------------------------------------- /scrapy_plus/pipelines/images.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.pipelines.images import ImagesPipeline as _ImagesPipeline 4 | from scrapy.pipelines.files import FSFilesStore, S3FilesStore, GCSFilesStore 5 | from .files import OSSFilesStore 6 | 7 | class ImagesPipeline(_ImagesPipeline): 8 | STORE_SCHEMES = { 9 | '': FSFilesStore, 10 | 'file': FSFilesStore, 11 | 's3': S3FilesStore, 12 | 'gs': GCSFilesStore, 13 | 'oss':OSSFilesStore 14 | } 15 | 16 | def __init__(self, store_uri, download_func=None, settings=None): 17 | super(ImagesPipeline, self).__init__(store_uri, settings=settings, 18 | download_func=download_func) 19 | 20 | @classmethod 21 | def from_settings(cls, settings): 22 | s3store = cls.STORE_SCHEMES['s3'] 23 | s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID'] 24 | s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY'] 25 | s3store.POLICY = settings['IMAGES_STORE_S3_ACL'] 26 | 27 | gcs_store = cls.STORE_SCHEMES['gs'] 28 | gcs_store.GCS_PROJECT_ID = settings['GCS_PROJECT_ID'] 29 | gcs_store.POLICY = settings['IMAGES_STORE_GCS_ACL'] or None 30 | 31 | ossStore = cls.STORE_SCHEMES['oss'] 32 | ossStore.OSS_ACCESS_KEY = settings['OSS_ACCESS_KEY'] 33 | ossStore.OSS_ACCESS_SECRET = settings['OSS_ACCESS_SECRET'] 34 | 35 | store_uri = settings['IMAGES_STORE'] 36 | return cls(store_uri, settings=settings) -------------------------------------------------------------------------------- /scrapy_plus/spiders/netease.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.linkextractors import LinkExtractor 3 | from scrapy.spiders import CrawlSpider, Rule 4 | from ..items import NewsItem 5 | from scrapy.loader import ItemLoader 6 | 7 | 8 | class NeteaseSpider(CrawlSpider): 9 | name = 'netease' 10 | allowed_domains = ['163.com'] 11 | urls = 'https://www.163.com/' 12 | start_urls = urls.split(',') 13 | 14 | rules = ( 15 | Rule(LinkExtractor(allow=r'(\w+):\/\/([^/:]+)\/(\d{2})+\/(\d{4})+\/(\d{2})+\/([^#]*)'), 16 | callback='parse_item', follow=True), 17 | ) 18 | 19 | def parse_item(self, response): 20 | loader = ItemLoader(item=NewsItem(), response=response) 21 | loader.add_css('title', '#epContentLeft>h1::text') 22 | loader.add_css('pub_date', '#epContentLeft .post_time_source::text') 23 | loader.add_css('desc', '#epContentLeft .post_desc::text') 24 | 25 | # 游戏栏目 play.163.com 26 | loader.add_css('title', 'h1.article-h1::text') 27 | loader.add_css('desc', '.artical-summary::text') 28 | 29 | # 人间栏目 renjian.163.com 30 | loader.add_css('title', '.bannertext>.daxie_sub_title::text') 31 | loader.add_css('pub_date', '.sub_title>.pub_time::text') 32 | 33 | # 体育 sports.163.com 34 | loader.add_css('title', '.m-article .article-top>.article-title::text') 35 | loader.add_xpath('body', '//div[@class=".article-details"]') 36 | 37 | loader.add_xpath('body', '//div[@id="endText"]') 38 | loader.add_value('link', response.url) 39 | return loader.load_item() 40 | -------------------------------------------------------------------------------- /scrapy_plus/middlewares/autologin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy import signals 4 | from scrapy.exceptions import IgnoreRequest 5 | from scrapy.http import HtmlResponse, FormRequest 6 | from logging import getLogger 7 | 8 | 9 | class LoginMiddleWare(): 10 | """ 11 | 预登录表单中间件 12 | """ 13 | @classmethod 14 | def 
from_crawler(cls, crawler): 15 | return cls(login_url=crawler.settings.get('LOGIN_URL'), 16 | user_name=crawler.settings.get('LOGIN_USR'), 17 | password=crawler.settings.get('LOGIN_PWD'), 18 | user_ele=crawler.settings.get('LOGIN_USR_FIELD'), 19 | pwd_ele=crawler.settings.get('LOGIN_PWD_FIELD')) 20 | 21 | def __init__(self, login_url, user_name, password, user_ele='username', pwd_ele='password'): 22 | self.logger = getLogger(__name__) # 打开日志 23 | self.login_url = login_url 24 | self.user_name = user_name 25 | self.password = password 26 | self.user_ele = user_ele 27 | self.pwd_ele = pwd_ele 28 | 29 | def process_request(self, request, spider): 30 | cookies = request.headers.getlist('Cookie') 31 | if cookies is None or len(cookies)==0: 32 | return FormRequest(url=self.login_url, 33 | formdata={self.user_ele: self.user_name, self.pwd_ele: self.password}) 34 | return request 35 | 36 | def process_response(self, request, response, spider): 37 | if "authentication failed" in response.body: 38 | return IgnoreRequest() 39 | 40 | def process_exception(self, request, exception, spider): 41 | self.logger.error("登录失败") 42 | -------------------------------------------------------------------------------- /tests/test_sqlfeedstorage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from scrapy_plus.extensions import SQLFeedStorage 5 | from scrapy_plus.extensions.sql import EntityFileFaker 6 | 7 | from sqlalchemy.engine import create_engine 8 | from sqlalchemy.orm import sessionmaker 9 | 10 | 11 | class SQLFeedStorageTestCase(unittest.TestCase): 12 | """ 13 | SQLFeedStorage单元测试 14 | """ 15 | 16 | def setUp(self): 17 | self.book = { 18 | 'id': 1, 19 | 'name': "Vue2实践揭秘", 20 | 'alias': "Vue2实践揭秘 - 电子工业出版社出版", 21 | 'summary': "这是一本关于Vue2实践的书籍,由浅入深层层揭显Vue2中的隐秘。" 22 | } 23 | 24 | self.connection_str = "sqlite:///test.db" 25 | 26 | def test_entity_faker_should_banch_update(self): 27 | from tests.test_entities import Base, Book 28 | 29 | engine = create_engine(self.connection_str) 30 | Base.metadata.bind = engine 31 | DBSession = sessionmaker(bind=engine) 32 | 33 | Base.metadata.create_all() 34 | faker = EntityFileFaker(DBSession(), Book) 35 | 36 | faker.write(self.book.keys(), self.book) 37 | faker.close() 38 | 39 | session = DBSession() 40 | books = session.query(Book).all() 41 | self.assertEqual(books.__len__(), 1) 42 | faker.close() 43 | 44 | Base.metadata.drop_all() 45 | 46 | def test_sql_feed_storage_should_create_database(self): 47 | storage = SQLFeedStorage('sqlite:///test1.db', 48 | 'tests.test_entities', 'Base', 'Book') 49 | file = storage.open(None) 50 | file.write(self.book.keys(), self.book) 51 | storage.store(file) 52 | 53 | 54 | if __name__ == '__main__': 55 | unittest.main() 56 | -------------------------------------------------------------------------------- /scrapy_plus/utils/starturls.py: -------------------------------------------------------------------------------- 1 | import re 2 | import six 3 | from datetime import datetime 4 | from itertools import product 5 | from scrapy import Request 6 | 7 | 8 | 9 | class FragmentGenerator(object): 10 | def _process_fixed(self, fragment): 11 | return [fragment] 12 | 13 | def _process_list(self, fragment): 14 | return fragment.split(' ') 15 | 16 | def _process_date(self, fragment): 17 | now = datetime.now() 18 | return [now.strftime(fragment)] 19 | 20 | def _process_range(self, fragment): 21 | a, b = fragment.split('-') 22 | 23 | if a.isalpha() and b.isalpha(): 
24 | a, b = [ord(w.lower()) for w in [a, b]] 25 | return (chr(w) for w in six.moves.range(a, b + 1)) 26 | else: 27 | a, b = int(a), int(b) 28 | return (str(i) for i in six.moves.range(a, b + 1)) 29 | 30 | def _process_fragment(self, fragment): 31 | processor = getattr(self, '_process_{}'.format(fragment['type'])) 32 | return processor(fragment['value']) 33 | 34 | def process_fragments(self, spec): 35 | return map(self._process_fragment, spec['fragments']) 36 | 37 | def __call__(self, spec): 38 | generated = product(*self.process_fragments(spec)) 39 | for fragment_list in generated: 40 | yield ''.join(fragment_list) 41 | _NEWLINE_RE = re.compile('[\r\n]') 42 | 43 | 44 | class FeedGenerator(object): 45 | def __init__(self, callback): 46 | self.callback = callback 47 | 48 | def __call__(self, url): 49 | return Request(url, callback=self.parse_urls) 50 | 51 | def parse_urls(self, response): 52 | newline_urls = _NEWLINE_RE.split(response.text) 53 | urls = [url for url in newline_urls if url] 54 | for url in urls: 55 | yield Request(url, callback=self.callback) -------------------------------------------------------------------------------- /scrapy_plus/items/BookItem.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from scrapy import Item, Field 4 | from ..processors import Number, Date, Price, Text, CleanText 5 | from scrapy.loader.processors import TakeFirst, Join 6 | 7 | 8 | class BookItem(Item): 9 | # 书名 10 | name = Field(input_processor=CleanText(), 11 | output_processor=TakeFirst()) 12 | # 作者 13 | authors = Field(input_processor=CleanText(), 14 | output_processor=TakeFirst()) 15 | # 出版社 16 | publishing_house = Field(input_processor=CleanText(), 17 | output_processor=TakeFirst()) 18 | # 出品方 19 | publisher = Field(input_processor=CleanText(), 20 | output_processor=TakeFirst()) 21 | # 原名 22 | origin_name = Field(input_processor=CleanText(), 23 | output_processor=TakeFirst()) 24 | # 译者 25 | translators = Field(input_processor=CleanText(), 26 | output_processor=TakeFirst()) 27 | # 出版时间 28 | pub_date = Field(input_processor=Date(), 29 | output_processor=TakeFirst()) 30 | # 页数 31 | pages = Field(input_processor=Number(), 32 | output_processor=TakeFirst()) 33 | # 定价 34 | price = Field(input_processor=Price(), 35 | output_processor=TakeFirst()) 36 | # ISBN 37 | isbn = Field(input_processor=CleanText(), 38 | output_processor=TakeFirst()) 39 | # 豆瓣评分 40 | rates = Field(input_processor=Number(), 41 | output_processor=TakeFirst()) 42 | # 评价数 43 | rating_count = Field(input_processor=Number(), 44 | output_processor=TakeFirst()) 45 | # 简介 46 | summary = Field(input_processor=Text(), 47 | output_processor=Join()) 48 | # 作者简介 49 | about_authors = Field(input_processor=CleanText(), 50 | output_processor=TakeFirst()) 51 | -------------------------------------------------------------------------------- /scrapy_plus/extensions/oss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy.extensions.feedexport import BlockingFeedStorage 4 | import oss2 5 | import os 6 | from urllib.parse import urlparse 7 | 8 | 9 | class OSSFeedStorage(BlockingFeedStorage): 10 | """ 11 | 阿里云OSS存储后端 12 | """ 13 | 14 | def __init__(self, uri): 15 | # < Schema >: // < Bucket >.< 外网Endpoint > / < Object > 16 | u = urlparse(uri) 17 | self.uri = uri 18 | self.bucket_name = u.hostname.splite('.')[0] 19 | self.endpoint = '.'.join(u.hostname.splite('.')[1:]) 20 | self.path = u.path 21 | 22 | def 
open(self, spider): 23 | access_key = spider.crawler.settings.get('OSS_ACCESS_KEY') 24 | access_secret = spider.crawler.settings.get('OSS_ACCESS_SECRET') 25 | 26 | # 创建Bucket对象,所有Object相关的接口都可以通过Bucket对象来进行 27 | self.bucket = oss2.Bucket(oss2.Auth(access_key, access_secret), 28 | self.endpoint, self.bucket_name) 29 | 30 | def _store_in_thread(self, file): 31 | # 首先可以用帮助函数设定分片大小,设我们期望的分片大小为128KB 32 | total_size = os.path.getsize(file) 33 | part_size = oss2.determine_part_size(total_size, preferred_size=128 * 1024) 34 | 35 | # 初始化分片上传,得到Upload ID。接下来的接口都要用到这个Upload ID。 36 | key = file.replace('../', '') 37 | upload_id = self.bucket.init_multipart_upload(key).upload_id 38 | 39 | # 逐个上传分片 40 | # 其中oss2.SizedFileAdapter()把fileobj转换为一个新的文件对象,新的文件对象可读的长度等于size_to_upload 41 | with open(file, 'rb') as fileobj: 42 | parts = [] 43 | part_number = 1 44 | offset = 0 45 | while offset < total_size: 46 | size_to_upload = min(part_size, total_size - offset) 47 | result = self.bucket.upload_part(key, upload_id, part_number, 48 | oss2.SizedFileAdapter(fileobj, size_to_upload)) 49 | parts.append(oss2.models.PartInfo(part_number, result.etag, size=size_to_upload, part_crc=result.crc)) 50 | 51 | offset += size_to_upload 52 | part_number += 1 53 | 54 | # 完成分片上传 55 | self.bucket.complete_multipart_upload(key, upload_id, parts) 56 | 57 | # 验证一下 58 | with open(file, 'rb') as fileobj: 59 | assert self.bucket.get_object(key).read() == fileobj.read() 60 | -------------------------------------------------------------------------------- /scrapy_plus/spiders/douban.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.spiders import CrawlSpider, Rule 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.loader import ItemLoader 5 | from ..items import BookItem 6 | 7 | 8 | class BookSpider(CrawlSpider): 9 | name = "doubanbook" 10 | start_urls = ['https://book.douban.com/tag/'] 11 | rules = (Rule(LinkExtractor(allow=('\/tag\/(.*?)')), follow=True), 12 | Rule(LinkExtractor(allow=('\/tag\/(.*?)\?start\='), 13 | tags=('link'), attrs=('href')), follow=True), 14 | Rule(LinkExtractor(allow=('\/subject\/.*'), ), follow=False, callback='parse_item')) 15 | 16 | def parse_item(self, response): 17 | loader = ItemLoader(item=BookItem(), response=response) 18 | loader.add_css('name',"h1 span::text") # 标题 19 | loader.add_css('summary','.related_info #link-report .intro p::text') # 简介 20 | loader.add_xpath('authors', u'//span[.//text()[normalize-space(.)="作者:"]]/following::text()[1]') 21 | loader.add_xpath('authors', u'//span[.//text()[normalize-space(.)="作者:"]]/following::text()[2]') 22 | loader.add_xpath('publishing_house', u'//span[.//text()[normalize-space(.)="出版社:"]]/following::text()[1]') 23 | loader.add_xpath('publisher', u'//span[.//text()[normalize-space(.)="出品方:"]]/following::text()[1]') 24 | loader.add_xpath('publisher', u'//span[.//text()[normalize-space(.)="出品方:"]]/following::text()[2]') 25 | loader.add_xpath('origin_name', u'//span[.//text()[normalize-space(.)="原作名:"]]/following::text()[1]') 26 | loader.add_xpath('translators', u'//span[.//text()[normalize-space(.)="译者:"]]/following::text()[1]') 27 | loader.add_xpath('translators', u'//span[.//text()[normalize-space(.)="译者"]]/following::text()[2]') 28 | loader.add_xpath('pub_date', u'//span[.//text()[normalize-space(.)="出版年:"]]/following::text()[1]') 29 | loader.add_xpath('pages', u'//span[.//text()[normalize-space(.)="页数:"]]/following::text()[1]') 30 | loader.add_xpath('price', 
u'//span[.//text()[normalize-space(.)="定价:"]]/following::text()[1]') 31 | loader.add_xpath('isbn', u'//span[.//text()[normalize-space(.)="ISBN:"]]/following::text()[1]') 32 | loader.add_css('rates',".rating_num::text") # 得分 33 | loader.add_css('rating_count', ".rating_people>span::text") #投票 34 | return loader.load_item() 35 | 36 | -------------------------------------------------------------------------------- /scrapy_plus/middlewares/chrome.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy import signals 4 | from scrapy.exceptions import IgnoreRequest 5 | from selenium import webdriver 6 | from selenium.common.exceptions import TimeoutException 7 | from selenium.webdriver.common.by import By 8 | from selenium.webdriver.support.ui import WebDriverWait 9 | from selenium.webdriver.support import expected_conditions as EC 10 | from scrapy.http import HtmlResponse 11 | from logging import getLogger 12 | import random 13 | 14 | 15 | class ChromeMiddleware(): 16 | """ 17 | Chrome 无头浏览器仿真中间件。 18 | """ 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'), 22 | exec_path=crawler.settings.get('CHROMEDRIVER')) 23 | 24 | def __init__(self, timeout=None, exec_path=''): 25 | self.logger = getLogger(__name__) # 打开日志 26 | self.timeout = timeout 27 | options = webdriver.ChromeOptions() 28 | options.add_argument('headless') # 采用无头浏览器 29 | self.browser = webdriver.Chrome( 30 | executable_path=exec_path, chrome_options=options) 31 | 32 | self.browser.set_window_size(1400, 700) # 设置浏览窗口 33 | self.browser.set_page_load_timeout(self.timeout) # 设置浏览器加载网页的超时时间 34 | self.wait = WebDriverWait(self.browser, self.timeout) 35 | 36 | def __del__(self): 37 | self.browser.close() # 释构时关闭浏览器实例 38 | 39 | def process_request(self, request, spider): 40 | """ 41 | 用Chrome抓取页面 42 | :param request: Request对象 43 | :param spider: Spider对象 44 | :return: HtmlResponse 45 | """ 46 | self.logger.debug(u'启动Chrome...') 47 | # page = request.meta.get('sn', 1) 48 | 49 | try: 50 | self.browser.get(request.url) 51 | 52 | # 等待页面的宝贝全部加载完成 53 | self.wait.until(EC.presence_of_element_located( 54 | (By.CSS_SELECTOR, '.m-itemlist .items .item'))) 55 | 56 | return HtmlResponse(url=request.url, 57 | body=self.browser.page_source, 58 | request=request, 59 | encoding='utf-8', 60 | status=200) 61 | 62 | except TimeoutException: 63 | # 超时抛出异常 64 | return HtmlResponse(url=request.url, status=500, request=request) 65 | 66 | -------------------------------------------------------------------------------- /scrapy_plus/pipelines/files.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import oss2 4 | from urllib.parse import urlparse 5 | 6 | from twisted.internet import threads 7 | 8 | 9 | class OSSFilesStore(object): 10 | OSS_ACCESS_KEY = "" 11 | OSS_ACCESS_SECRET = "" 12 | 13 | def __init__(self, uri): 14 | # < Schema >: // < Bucket >.< 外网Endpoint > / < Object > 15 | u = urlparse(uri) 16 | self.uri = uri 17 | self.bucket_name = u.hostname.splite('.')[0] 18 | self.endpoint = '.'.join(u.hostname.splite('.')[1:]) 19 | self.objectPath = u.path 20 | self.bucket = oss2.Bucket(oss2.Auth(self.OSS_ACCESS_KEY, self.OSS_ACCESS_SECRET), 21 | self.endpoint, self.bucket_name) 22 | 23 | def stat_file(self, path, info): 24 | 25 | def _onsuccess(meta): 26 | checksum = meta.headers['ETag'] 27 | last_modified = meta.headers['Last-Modifie'] 28 | return 
{'checksum': checksum, 'last_modified': last_modified} 29 | 30 | return threads.deferToThread(self.bucket.get_object_meta, path).addCallback(_onsuccess) 31 | 32 | 33 | 34 | def persist_file(self, path, buf, info, meta=None, headers=None): 35 | # 首先可以用帮助函数设定分片大小,设我们期望的分片大小为128KB 36 | total_size = len(buf) 37 | part_size = oss2.determine_part_size(total_size, preferred_size=128 * 1024) 38 | 39 | # 初始化分片上传,得到Upload ID。接下来的接口都要用到这个Upload ID。 40 | key = os.path.join(self.objectPath, info) 41 | upload_id = self.bucket.init_multipart_upload(key).upload_id 42 | 43 | # 逐个上传分片 44 | # 其中oss2.SizedFileAdapter()把fileobj转换为一个新的文件对象,新的文件对象可读的长度等于size_to_upload 45 | parts = [] 46 | part_number = 1 47 | offset = 0 48 | while offset < total_size: 49 | size_to_upload = min(part_size, total_size - offset) 50 | result = self.bucket.upload_part(key, upload_id, part_number, 51 | oss2.SizedFileAdapter(buf, size_to_upload)) 52 | parts.append(oss2.models.PartInfo(part_number, 53 | result.etag, 54 | size=size_to_upload, 55 | part_crc=result.crc)) 56 | 57 | offset += size_to_upload 58 | part_number += 1 59 | 60 | # 完成分片上传 61 | self.bucket.complete_multipart_upload(key, upload_id, parts) 62 | 63 | -------------------------------------------------------------------------------- /scrapy_plus/middlewares/tor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 洋葱头代理 4 | """ 5 | # from scrapy import signals 6 | from logging import getLogger 7 | from stem.control import Controller 8 | import stem 9 | # import random 10 | import time 11 | 12 | logger = getLogger(__name__) 13 | 14 | class TorProxyMiddleware(object): 15 | """ 16 | 洋葱头代理中间件 17 | 18 | ## settings.py 中的配置说明 19 | 20 | - HTTP_PROXY - 本机的代理端口 21 | - TOR_CTRL_PORT - 本机Tor的控制端口 22 | - TOR_PASSWORD - 登入Tor的密码 23 | """ 24 | 25 | def __init__(self, tor_proxy='127.0.0.1:8118', tor_control_port=9051, tor_password=None, after_times=50): 26 | 27 | if not tor_proxy: 28 | raise Exception('http proxy setting should not be empty') 29 | 30 | if not tor_control_port: 31 | raise Exception('tor control port setting should not be empty') 32 | 33 | if not tor_password: 34 | raise Exception('tor password setting should not be empty') 35 | 36 | self.http_proxy = tor_proxy 37 | self.tor_control_port = tor_control_port 38 | self.tor_password = tor_password 39 | self.count = 0 40 | self.times = after_times 41 | 42 | @classmethod 43 | def from_crawler(cls, crawler): 44 | tor_proxy = crawler.settings.get('TOR_PROXY') 45 | tor_control_port = crawler.settings.getint('TOR_CTRL_PORT') # 默认为9051 46 | tor_password = crawler.settings.get('TOR_PASSWORD') 47 | after_times = crawler.settings.get('TOR_CHANGE_AFTER_TIMES') 48 | return cls(tor_proxy, tor_control_port, tor_password, after_times) 49 | 50 | def process_request(self, request, spider): 51 | # 当启用Retry中间件,并且曾经出现2次的Retry就应该尝试更换IP 52 | retry_times = request.meta.get('retry_times', 0) 53 | 54 | if (self.count > 0 and self.count % self.times == 0) or retry_times>= 2: 55 | logger.debug("正在更换新的IP地址") 56 | self.ip_renew(spider) 57 | 58 | self.count += 1 59 | 60 | request.meta['proxy'] = self.http_proxy 61 | 62 | def ip_renew(self,spider): 63 | """access tor ControlPort to signal tor get a new IP 64 | """ 65 | with Controller.from_port(port=self.tor_control_port) as controller: 66 | controller.authenticate(password=self.tor_password) 67 | controller.signal(stem.Signal.NEWNYM) 68 | time.sleep(controller.get_newnym_wait()) 69 | controller.close() 70 | 
spider.crawler.stats.inc_value('renew_ip/count') 71 | -------------------------------------------------------------------------------- /scrapy_plus/middlewares/ua.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import random 4 | 5 | _agents = [ 6 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36', 7 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 8 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16', 9 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', 10 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36', 11 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', 12 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)' 13 | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0', 14 | 'Mozilla/5.0 (Linux; U; Android 2.2) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1', 15 | 'Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1', 16 | 'Mozilla/5.0 (Linux; Android 6.0.1; SM-G532G Build/MMB29T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.83 Mobile Safari/537.36', 17 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.5.6 (KHTML, like Gecko) Version/11.0.3 Safari/604.5.6', 18 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2', 19 | 'MAC:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36', 20 | 'Windows:Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', 21 | 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5', 22 | 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5', 23 | 'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5', 24 | ] 25 | 26 | 27 | class RandomUserAgentMiddleware(object): 28 | """ 29 | 随机User Agent 中间件 30 | """ 31 | 32 | @classmethod 33 | def from_crawler(cls, crawler): 34 | return cls(user_agents=crawler.settings.getlist('USER_AGENTS', None)) 35 | 36 | def __init__(self, user_agents=None): 37 | self.user_agents = user_agents if user_agents is not None else _agents 38 | 39 | def process_request(self, request, spider): 40 | if self.user_agents != None and len(self.user_agents) > 0: 41 | request.headers.setdefault( 42 | b'User-Agent', random.choice(self.user_agents)) 43 | -------------------------------------------------------------------------------- /scrapy_plus/dupefilters/redisbloom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from scrapy.utils.request import request_fingerprint 4 | from redis import Redis 5 | from hashlib import md5 6 | from scrapy.dupefilters import BaseDupeFilter 7 | 8 | BLOOMFILTER_HASH_NUMBER = 6 9 | BLOOMFILTER_BIT = 30 10 | 11 | 12 | class SimpleHash(object): 13 | def __init__(self, cap, 
seed): 14 | self.cap = cap 15 | self.seed = seed 16 | 17 | def hash(self, value): 18 | ret = 0 19 | for i in range(len(value)): 20 | ret += self.seed * ret + ord(value[i]) 21 | return (self.cap - 1) & ret 22 | 23 | 24 | class RedisBloomDupeFilter(BaseDupeFilter): 25 | 26 | def __init__(self, host='localhost', port=6379, db=0, blockNum=1, key='bloomfilter'): 27 | self.redis = Redis(host=host, port=port, db=db) 28 | 29 | self.bit_size = 1 << 31 # Redis的String类型最大容量为512M,现使用256M 30 | self.seeds = [5, 7, 11, 13, 31, 37, 61] 31 | self.key = key 32 | self.blockNum = blockNum 33 | self.hashfunc = [] 34 | for seed in self.seeds: 35 | self.hashfunc.append(SimpleHash(self.bit_size, seed)) 36 | 37 | self.logger = logging.getLogger(__name__) 38 | 39 | @classmethod 40 | def from_settings(cls, settings): 41 | _port = settings.getint('REDIS_PORT', 6379) 42 | _host = settings.get('REDIS_HOST', '127.0.0.1') 43 | _db = settings.get('REDIS_DUP_DB', 0) 44 | key = settings.get('BLOOMFILTER_REDIS_KEY', 'bloomfilter') 45 | block_number = settings.getint( 46 | 'BLOOMFILTER_BLOCK_NUMBER', 1) 47 | 48 | return cls(_host, _port, _db, blockNum=block_number, key=key) 49 | 50 | def request_seen(self, request): 51 | fp = request_fingerprint(request) 52 | if self.exists(fp): 53 | return True 54 | 55 | self.insert(fp) 56 | return False 57 | 58 | def exists(self, str_input): 59 | if not str_input: 60 | return False 61 | m5 = md5() 62 | m5.update(str(str_input).encode('utf-8')) 63 | _input = m5.hexdigest() 64 | ret = True 65 | name = self.key + str(int(_input[0:2], 16) % self.blockNum) 66 | for f in self.hashfunc: 67 | loc = f.hash(_input) 68 | ret = ret & self.redis.getbit(name, loc) 69 | return ret 70 | 71 | def insert(self, str_input): 72 | m5 = md5() 73 | m5.update(str(str_input).encode('utf-8')) 74 | _input = m5.hexdigest() 75 | name = self.key + str(int(_input[0:2], 16) % self.blockNum) 76 | for f in self.hashfunc: 77 | loc = f.hash(_input) 78 | self.redis.setbit(name, loc, 1) 79 | 80 | def log(self, request, spider): 81 | msg = ("已过滤的重复请求: %(request)s") 82 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 83 | spider.crawler.stats.inc_value( 84 | 'redisbloomfilter/filtered', spider=spider) -------------------------------------------------------------------------------- /scrapy_plus/utils/spiders.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import CrawlSpider 2 | from scrapy.loader import ItemLoader 3 | from scrapy.utils.response import get_base_url 4 | 5 | from .starturls import FeedGenerator, FragmentGenerator 6 | 7 | 8 | class RequiredFieldMissing(Exception): 9 | def __init__(self, msg): 10 | self.msg = msg 11 | 12 | def __str__(self): 13 | return self.msg 14 | 15 | 16 | class PortiaItemLoader(ItemLoader): 17 | def get_value(self, value, *processors, **kw): 18 | required = kw.pop('required', False) 19 | val = super(PortiaItemLoader, self).get_value(value, *processors, **kw) 20 | if required and not val: 21 | raise RequiredFieldMissing( 22 | 'Missing required field "{value}" for "{item}"'.format( 23 | value=value, item=self.item.__class__.__name__)) 24 | return val 25 | 26 | 27 | class BasePortiaSpider(CrawlSpider): 28 | items = [] 29 | 30 | def start_requests(self): 31 | for url in self.start_urls: 32 | if isinstance(url, dict): 33 | type_ = url['type'] 34 | if type_ == 'generated': 35 | for generated_url in FragmentGenerator()(url): 36 | yield self.make_requests_from_url(generated_url) 37 | elif type_ == 'feed': 38 | yield 
FeedGenerator(self.parse)(url) 39 | else: 40 | yield self.make_requests_from_url(url) 41 | 42 | def parse_item(self, response): 43 | for sample in self.items: 44 | items = [] 45 | try: 46 | for definition in sample: 47 | items.extend( 48 | [i for i in self.load_item(definition, response)] 49 | ) 50 | except RequiredFieldMissing as exc: 51 | self.logger.warning(str(exc)) 52 | if items: 53 | for item in items: 54 | yield item 55 | break 56 | 57 | def load_item(self, definition, response): 58 | query = response.xpath if definition.type == 'xpath' else response.css 59 | selectors = query(definition.selector) 60 | for selector in selectors: 61 | selector = selector if selector else None 62 | ld = PortiaItemLoader( 63 | item=definition.item(), 64 | selector=selector, 65 | response=response, 66 | baseurl=get_base_url(response) 67 | ) 68 | for field in definition.fields: 69 | if hasattr(field, 'fields'): 70 | if field.name is not None: 71 | ld.add_value(field.name, 72 | self.load_item(field, selector)) 73 | elif field.type == 'xpath': 74 | ld.add_xpath(field.name, field.selector, *field.processors, 75 | required=field.required) 76 | else: 77 | ld.add_css(field.name, field.selector, *field.processors, 78 | required=field.required) 79 | yield ld.load_item() 80 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | ### Eclipse template 108 | 109 | .metadata 110 | bin/ 111 | tmp/ 112 | *.tmp 113 | *.bak 114 | *.swp 115 | *~.nib 116 | local.properties 117 | .settings/ 118 | .loadpath 119 | .recommenders 120 | 121 | # External tool builders 122 | .externalToolBuilders/ 123 | 124 | # Locally stored "Eclipse launch configurations" 125 | *.launch 126 | 127 | # PyDev specific (Python IDE for Eclipse) 128 | *.pydevproject 129 | 130 | # CDT-specific (C/C++ Development Tooling) 131 | .cproject 132 | 133 | # CDT- autotools 134 | .autotools 135 | 136 | # Java annotation processor (APT) 137 | .factorypath 138 | 139 | # PDT-specific (PHP Development Tools) 140 | .buildpath 141 | 142 | # sbteclipse plugin 143 | .target 144 | 145 | # Tern plugin 146 | .tern-project 147 | 148 | # TeXlipse plugin 149 | .texlipse 150 | 151 | # STS (Spring Tool Suite) 152 | .springBeans 153 | 154 | # Code Recommenders 155 | .recommenders/ 156 | 157 | # Annotation Processing 158 | .apt_generated/ 159 | 160 | # Scala IDE specific (Scala & Java development for Eclipse) 161 | .cache-main 162 | .scala_dependencies 163 | .worksheet 164 | ### VirtualEnv template 165 | # Virtualenv 166 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 167 | .Python 168 | [Bb]in 169 | [Ii]nclude 170 | [Ll]ib 171 | [Ll]ib64 172 | [Ll]ocal 173 | [Ss]cripts 174 | pyvenv.cfg 175 | .venv 176 | pip-selfcheck.json 177 | 178 | build 179 | .vscode/ 180 | .idea/ -------------------------------------------------------------------------------- /scrapy_plus/extensions/sql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from zope.interface import Interface, implementer 3 | from scrapy.extensions.feedexport import IFeedStorage 4 | from scrapy.exceptions import NotConfigured 5 | from sqlalchemy.engine import create_engine 6 | from sqlalchemy.orm import sessionmaker 7 | from importlib import import_module 8 | from scrapy.exporters import BaseItemExporter 9 | from logging import getLogger 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | class EntityFileFaker(): 15 | 16 | def __init__(self, session, entity_cls): 17 | self.session = session 18 | if entity_cls is None: 19 | raise NotConfigured 20 | 21 | self.entity_cls = entity_cls 22 | 23 | def write(self, keys, values): 24 | """ 25 | 将值写入到 26 | :param key: 实体的成员变量 27 | :param value: 实体字段值 28 | """ 29 | entity = self.entity_cls() 30 
| for key in keys: 31 | val = values.get(key) 32 | if val is not None: 33 | entity.__setattr__(key, val) 34 | self.session.add(entity) 35 | 36 | def close(self): 37 | self.session.commit() 38 | self.session.close() 39 | 40 | 41 | @implementer(IFeedStorage) 42 | class SQLFeedStorage(): 43 | """ 44 | SQL的存储后端 45 | @uri - SQL的连接字符串 46 | """ 47 | 48 | @classmethod 49 | def from_crawler(cls, crawler, uri): 50 | return cls(uri, 51 | crawler.settings.get('ORM_MODULE'), 52 | crawler.settings.get('ORM_METABASE'), 53 | crawler.settings.get('ORM_ENTITY')) 54 | 55 | def __init__(self, uri, mod_name=None, metabase_name=None, entity_name=None): 56 | """ 57 | 初始化SQL的存储后端 58 | FEED_URI 作为连接字符串使用 59 | """ 60 | self.connection_str = uri 61 | self.mod_name = mod_name 62 | self.metabase = metabase_name 63 | self.entity_name = entity_name 64 | 65 | def open(self, spider): 66 | """ 67 | 通过连接字符串打开SQL数据库并返回生成的数据库上下文 68 | """ 69 | engine = create_engine(self.connection_str) 70 | 71 | # 动态载入MetaData 72 | mod = import_module(self.mod_name) 73 | metabase = getattr(mod, self.metabase) 74 | entity_cls = getattr(mod, self.entity_name) 75 | metabase.metadata.bind = engine 76 | metabase.metadata.create_all() 77 | 78 | DBSession = sessionmaker(bind=engine) 79 | return EntityFileFaker(session=DBSession(), entity_cls=entity_cls) 80 | 81 | def store(self, file): 82 | """ 83 | 向数据提提交更改并关闭数据库 84 | """ 85 | file.close() 86 | 87 | 88 | class SQLItemExporter(BaseItemExporter): 89 | """ 90 | 将Item中的数据写入转换成为实体 91 | """ 92 | 93 | def __init__(self, file, **kwargs): 94 | self.file = file 95 | self._configure(kwargs, dont_fail=True) 96 | 97 | def export_item(self, item): 98 | """ 99 | 将Item插入到数据库 100 | 可以通过FEED_EXPORT_FIELDS设置要从Item中序列化至数据库的字段 101 | """ 102 | 103 | self.file.write(self.fields_to_export if self.fields_to_export is not None and self.fields_to_export.__len__() else item.fields.keys(), 104 | item) 105 | 106 | # TODO:要进行数据实体的转换就涉及数据类型转换问题,Item就需要进行序列化控制 107 | -------------------------------------------------------------------------------- /scrapy_plus/middlewares/huaban.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from selenium import webdriver 3 | from selenium.common.exceptions import TimeoutException 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.ui import WebDriverWait 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from scrapy.http import HtmlResponse 8 | from logging import getLogger 9 | 10 | 11 | class HuabanMiddleware(): 12 | 13 | @classmethod 14 | def from_crawler(cls, crawler): 15 | return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'), 16 | exec_path=crawler.settings.get('CHROMEDRIVER'), 17 | username=crawler.settings.get('HUABAN_USR'), 18 | password=crawler.settings.get('HUABAN_PWD')) 19 | 20 | def __init__(self, timeout=None, exec_path='', username='', password=''): 21 | self.logger = getLogger(__name__) # 打开日志 22 | self.timeout = timeout 23 | self.usr = username 24 | self.pwd = password 25 | options = webdriver.ChromeOptions() 26 | options.add_argument('headless') # 采用无头浏览器 27 | self.browser = webdriver.Chrome(executable_path=exec_path, 28 | options=options) 29 | 30 | self.browser.set_window_size(1400, 700) # 设置浏览窗口 31 | self.browser.set_page_load_timeout(self.timeout) # 设置浏览器加载网页的超时时间 32 | self.wait = WebDriverWait(self.browser, self.timeout) 33 | 34 | def __del__(self): 35 | self.browser.close() # 释构时关闭浏览器实例 36 | 37 | def login(self): 38 | 39 | login_button = 
self.browser.find_element_by_css_selector('.login.btn') 40 | login_button.click() 41 | form = self.browser.find_element_by_css_selector('form.mail-login') 42 | email_input = form.find_element_by_name('email') 43 | password_input = form.find_element_by_name('password') 44 | email_input.send_keys(self.usr) 45 | password_input.send_keys(self.pwd) 46 | form.submit() 47 | self._wait() 48 | 49 | def _wait(self): 50 | self.wait.until(EC.presence_of_element_located( 51 | (By.CSS_SELECTOR, '#index_footer'))) 52 | 53 | def process_request(self, request, spider): 54 | """ 55 | 用Chrome抓取页面 56 | :param request: Request对象 57 | :param spider: Spider对象 58 | :return: HtmlResponse 59 | """ 60 | self.logger.debug(u'启动Chrome...') 61 | 62 | try: 63 | self.browser.get(request.url) 64 | # 等待页脚被渲染完成 65 | self.browser.implicitly_wait(3) 66 | 67 | cookies = self.browser.get_cookies() 68 | is_login = False 69 | for cookie in cookies: 70 | if cookie['name'] == 'sid': 71 | is_login = True 72 | break 73 | 74 | if not is_login: 75 | self.login() 76 | self.browser.get(request.url) 77 | self.browser.implicitly_wait(3) 78 | 79 | return HtmlResponse(url=request.url, 80 | body=self.browser.page_source, 81 | request=request, 82 | encoding='utf-8', 83 | status=200) 84 | 85 | except TimeoutException: 86 | # 超时抛出异常 87 | return HtmlResponse(url=request.url, status=500, request=request) 88 | -------------------------------------------------------------------------------- /scrapy_plus/middlewares/splash.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import signals 3 | from scrapy_splash import SplashRequest 4 | 5 | class SplashSpiderMiddleware(): 6 | """ 7 | Splash 中间件,可将请求转发至指定的Splash服务,使蜘蛛具有浏览器仿真功能。 8 | """ 9 | # 以下的Lua脚本会生一个等待指定元素选择器加载完成的函数 10 | lua_source = """ 11 | function wait_for_element(splash, css, maxwait) 12 | -- Wait until a selector matches an element 13 | -- in the page. Return an error if waited more 14 | -- than maxwait seconds. 15 | if maxwait == nil then 16 | maxwait = 10 17 | end 18 | return splash:wait_for_resume(string.format([[ 19 | function main(splash) { 20 | var selector = '%s'; 21 | var maxwait = %s; 22 | var end = Date.now() + maxwait*1000; 23 | 24 | function check() { 25 | if(document.querySelector(selector)) { 26 | splash.resume('Element found'); 27 | } else if(Date.now() >= end) { 28 | var err = 'Timeout waiting for element'; 29 | splash.error(err + " " + selector); 30 | } else { 31 | setTimeout(check, 200); 32 | } 33 | } 34 | check(); 35 | } 36 | ]], css, maxwait)) 37 | end 38 | 39 | function main(splash, args) 40 | splash:go(args.url) 41 | wait_for_element(splash, args.wait_for_element) 42 | return splash:html() 43 | end 44 | """ 45 | 46 | @classmethod 47 | def from_crawler(cls, crawler): 48 | # This method is used by Scrapy to create your spiders. 49 | s = cls(wait_for_element=crawler.settings.get('WAIT_FOR_ELEMENT')) 50 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 51 | return s 52 | 53 | def __init__(self, wait_for_element=None): 54 | self.wait_for_element = wait_for_element 55 | 56 | def process_spider_input(self, response, spider): 57 | # Called for each response that goes through the spider 58 | # middleware and into the spider. 59 | 60 | # Should return None or raise an exception. 61 | return None 62 | 63 | def process_spider_output(self, response, result, spider): 64 | # Called with the results returned from the Spider, after 65 | # it has processed the response. 
66 | 67 | # Must return an iterable of Request, dict or Item objects. 68 | for i in result: 69 | yield i 70 | 71 | def process_spider_exception(self, response, exception, spider): 72 | # Called when a spider or process_spider_input() method 73 | # (from other spider middleware) raises an exception. 74 | 75 | # Should return either None or an iterable of Response, dict 76 | # or Item objects. 77 | pass 78 | 79 | def process_start_requests(self, start_requests, spider): 80 | # Called with the start requests of the spider, and works 81 | # similarly to the process_spider_output() method, except 82 | # that it doesn’t have a response associated. 83 | 84 | # Must return only requests (not items). 85 | for request in start_requests: 86 | yield SplashRequest(request.url, 87 | request.callback, 88 | endpoint='execute', 89 | meta=dict(request.meta), 90 | args={ 91 | 'lua_source': self.lua_source, 92 | 'wait_for_element': self.wait_for_element, 93 | 'wait': 3} 94 | ) 95 | 96 | def spider_opened(self, spider): 97 | spider.logger.info('Spider opened: %s' % spider.name) 98 | -------------------------------------------------------------------------------- /scrapy_plus/utils/parser.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | try: 3 | from HTMLParser import HTMLParser 4 | except ImportError: 5 | from html.parser import HTMLParser 6 | 7 | ALLOWED_TAGS = frozenset({ 8 | 'abbr', 'acronym', 'address', 'bdo', 'big', 'blockquote', 'br', 'cite', 9 | 'code', 'dd', 'del', 'dfn', 'dl', 'dt', 'em', 'ins', 'kbd', 'li', 10 | 'listing', 'ol', 'p', 'plaintext', 'pre', 'q', 'samp', 'small', 'strong', 11 | 'sub', 'sup', 'table', 'tbody', 'td', 'th', 'time', 'tr', 'tt', 'ul', 'var' 12 | }) 13 | REPLACE_TAGS = { 14 | 'b': 'strong', 15 | 'h1': 'strong', 16 | 'h2': 'strong', 17 | 'h3': 'strong', 18 | 'h4': 'strong', 19 | 'h5': 'strong', 20 | 'h6': 'strong', 21 | 'i': 'em' 22 | } 23 | PURGE_TAGS = ('script', 'img', 'input', 'style') 24 | ALLOWED_ATTRS = frozenset({ 25 | 'height', 'width', 'colspan', 'cellspacing', 'callpadding', 'border', 26 | 'bgcolor', 'alt', 'align', 'valign', 'dir', 'headers', 'reversed', 27 | 'rows', 'rowspan', 'scope', 'span', 'start', 'summary', 'title', 'value' 28 | }) 29 | class AllowAll(object): 30 | def __contains__(self, value): 31 | return True 32 | 33 | 34 | class SafeHtmlParser(HTMLParser): 35 | """Parser for making raw html safe for displaying. 36 | 37 | HTML is made safe by the removal of some tags and the replacement of 38 | others. The HTML generated should be safe for display and shouldn't cause 39 | formatting problems. 40 | 41 | Behaviour can be customized through the following keyword arguments: 42 | allowed_tags is a set of tags that are allowed 43 | replace_tags is a mapping of tags to alternative tags to substitute. 44 | tags_to_purge are tags that, if encountered, all content between the 45 | opening and closing tag is removed. 46 | 47 | For example: 48 | >>> t = SafeHtmlParser().feed 49 | >>> t(u'test test') 50 | u'test test' 51 | 52 | Some tags, like script, are completely removed 53 | >>> t(u'test') 54 | u'test' 55 | 56 | replace_tags defines tags that are converted. By default all headers, bold 57 | and indenting are converted to strong and em. 58 | >>> t(u'

<h2>header</h2> test <b>bold</b> <i>indent</i>') 59 | u'<strong>header</strong> test <strong>bold</strong> <em>indent</em>' 60 | 61 | tags_to_purge defines the tags that have enclosing content removed: 62 | >>> t(u'<p>test <script>alert("x")</script></p>') 63 | u'<p>test </p>' 64 | 65 | Comments are stripped, but entities are not converted 66 | >>> t(u'<!-- comment --> only &pound;42') 67 | u'only &pound;42' 68 | 69 | Paired tags are closed 70 | >>> t(u'<p>test') 71 | u'<p>test</p>' 72 | 73 | >>> t(u'<p>test<p>test</p>') 74 | u'<p>test<p>test</p></p>
' 75 | 76 | """ 77 | def __init__(self, allowed_tags=ALLOWED_TAGS, replace_tags=REPLACE_TAGS, 78 | tags_to_purge=PURGE_TAGS, allowed_attrs=ALLOWED_ATTRS): 79 | self.reset() 80 | self._body = [] 81 | self.skip = False 82 | self._unclosed = deque() 83 | if allowed_tags is None: 84 | allowed_tags = AllowAll() 85 | if allowed_attrs is None: 86 | allowed_attrs = AllowAll() 87 | self.allowed_tags = allowed_tags 88 | self.replace_tags = replace_tags 89 | self.tags_to_purge = tags_to_purge 90 | self.allowed_attrs = allowed_attrs 91 | super(SafeHtmlParser, self).__init__() 92 | 93 | def feed(self, data): 94 | self._body, self._unclosed, self.skip = [], deque(), False 95 | self.rawdata = self.rawdata + data 96 | self.goahead(0) 97 | self._close_remaining_tags() 98 | return ''.join(self._body).strip() 99 | 100 | def handle_starttag(self, tag, attrs): 101 | self._handle_open(tag, attrs) 102 | self._unclosed.appendleft(tag) 103 | 104 | def handle_startendtag(self, tag, attrs): 105 | self._handle_open(tag, attrs, closed=True) 106 | 107 | def handle_endtag(self, tag): 108 | tag = tag.lower() 109 | try: 110 | last_opened = self._unclosed.popleft() 111 | while last_opened != tag: 112 | self._body.append(self._build_close_tag(last_opened)) 113 | last_opened = self._unclosed.popleft() 114 | except IndexError: 115 | return 116 | if self.skip and tag in self.tags_to_purge: 117 | self.skip = False 118 | return 119 | if tag not in self.allowed_tags and tag not in self.replace_tags: 120 | return 121 | self._body.append(self._build_close_tag(tag)) 122 | 123 | def handle_data(self, data): 124 | if self.skip: 125 | return 126 | self._body.append(data) 127 | 128 | def handle_entityref(self, name): 129 | self._body.append('&{};'.format(name)) 130 | 131 | def _handle_open(self, tag, attrs, closed=False): 132 | tag = tag.lower() 133 | if tag in self.tags_to_purge: 134 | if not closed: 135 | self.skip = True 136 | return 137 | if tag not in self.allowed_tags and tag not in self.replace_tags: 138 | return 139 | self._body.append(self._build_open_tag(tag, attrs)) 140 | 141 | def _build_open_tag(self, tag, attrs): 142 | tag = self.replace_tags.get(tag, tag) 143 | attrs = [(k, v) for k, v in attrs if k.lower() in self.allowed_attrs] 144 | return '<{tag}{has_attrs}{attrs}>'.format( 145 | tag=tag, 146 | has_attrs=' ' * bool(attrs), 147 | attrs=(' '.join('{}="{}"#'.format(*a) for a in attrs) 148 | if attrs else '') 149 | ) 150 | 151 | def _build_close_tag(self, tag): 152 | tag = self.replace_tags.get(tag, tag) 153 | return ''.format(tag) 154 | 155 | def _close_remaining_tags(self): 156 | for tag in self._unclosed: 157 | self._body.append(self._build_close_tag(tag)) 158 | -------------------------------------------------------------------------------- /scrapy_plus/processors.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import re 3 | import six 4 | 5 | from urllib.parse import urljoin, urlparse, urlunparse 6 | 7 | from copy import deepcopy 8 | from itertools import chain 9 | try: 10 | from itertools import izip_longest 11 | except ImportError: 12 | from itertools import zip_longest as izip_longest 13 | 14 | from dateparser.date import DateDataParser 15 | from scrapy.loader.processors import Identity as _Identity 16 | from scrapy.utils.markup import unquote_markup 17 | from w3lib.html import remove_tags 18 | from .utils.parser import SafeHtmlParser 19 | 20 | 21 | # Regeps from Scrapely_CSS_IMAGERE.pattern 22 | _CSS_IMAGERE = 
re.compile(r'background(?:-image)?\s*:\s*url\((.*?)\)') 23 | _GENERIC_PATH_RE = re.compile('/?(?:[^/]+/)*(?:.+)') 24 | _IMAGE_PATH_RE = re.compile(r'/?(?:[^/]+/)*(?:.+\.(?:mng|pct|bmp|gif|jpg|jpeg|' 25 | r'png|pst|psp|tif|tiff|ai|drw|dxf|eps|ps|svg))') 26 | _NUMERIC_ENTITIES = re.compile(r'&#([0-9]+)(?:;|\s)', re.U) 27 | _PRICE_NUMBER_RE = re.compile(r'(?:^|[^a-zA-Z0-9])(\d+(?:\.\d+)?)' 28 | r'(?:$|[^a-zA-Z0-9])') 29 | _NUMBER_RE = re.compile(r'(-?\d+(?:\.\d+)?)') 30 | _DECIMAL_RE = re.compile(r'(\d[\d\,]*(?:(?:\.\d+)|(?:)))', re.U | re.M) 31 | _VALPARTS_RE = re.compile(r'([\.,]?\d+)') 32 | _SENTINEL = object() 33 | 34 | 35 | def _strip_url(text): 36 | if text: 37 | return text.strip("\t\r\n '\"") 38 | 39 | 40 | def extract_image_url(text): 41 | text = _strip_url(text) 42 | imgurl = None 43 | if text: 44 | # check if the text is style content 45 | match = _CSS_IMAGERE.search(text) 46 | text = match.groups()[0] if match else text 47 | parsed = urlparse(text) 48 | path = None 49 | match = _IMAGE_PATH_RE.search(parsed.path) 50 | if match: 51 | path = match.group() 52 | elif parsed.query: 53 | match = _GENERIC_PATH_RE.search(parsed.path) 54 | if match: 55 | path = match.group() 56 | if path is not None: 57 | parsed = list(parsed) 58 | parsed[2] = path 59 | imgurl = urlunparse(parsed) 60 | if not imgurl: 61 | imgurl = text 62 | return imgurl 63 | 64 | class Text(): 65 | def __call__(self, values): 66 | return [remove_tags(v).strip() 67 | if v and isinstance(v, six.string_types) else v 68 | for v in values] 69 | 70 | 71 | class Number(): 72 | def __call__(self, values): 73 | numbers = [] 74 | for value in values: 75 | if isinstance(value, (dict, list)): 76 | numbers.append(value) 77 | txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), 78 | value) 79 | numbers.append(_NUMBER_RE.findall(txt)) 80 | return list(chain(*numbers)) 81 | 82 | 83 | class Price(): 84 | def __call__(self, values): 85 | prices = [] 86 | for value in values: 87 | if isinstance(value, (dict, list)): 88 | prices.append(value) 89 | txt = _NUMERIC_ENTITIES.sub(lambda m: unichr(int(m.groups()[0])), 90 | value) 91 | m = _DECIMAL_RE.search(txt) 92 | if m: 93 | value = m.group(1) 94 | parts = _VALPARTS_RE.findall(value) 95 | decimalpart = parts.pop(-1) 96 | if decimalpart[0] == "," and len(decimalpart) <= 3: 97 | decimalpart = decimalpart.replace(",", ".") 98 | value = "".join(parts + [decimalpart]).replace(",", "") 99 | prices.append(value) 100 | return prices 101 | 102 | 103 | class Date(Text): 104 | def __init__(self, format='%Y-%m-%dT%H:%M:%S'): 105 | self.format = format 106 | 107 | def __call__(self, values): 108 | values = super(Date, self).__call__(values) 109 | dates = [] 110 | for text in values: 111 | if isinstance(text, (dict, list)): 112 | dates.append(text) 113 | try: 114 | date = DateDataParser().get_date_data(text)['date_obj'] 115 | dates.append(date.strftime(self.format)) 116 | except ValueError: 117 | pass 118 | return dates 119 | 120 | 121 | class Url(Text): 122 | def __call__(self, values, loader_context=None): 123 | values = super(Url, self).__call__(values) 124 | urls = [] 125 | for value in values: 126 | if isinstance(value, (dict, list)): 127 | urls.append(value) 128 | value = _strip_url(unquote_markup(value)) 129 | base = loader_context.get('baseurl', '') 130 | urls.append(urljoin(base, value)) 131 | return urls 132 | 133 | class CleanText(): 134 | def __call__(self, values): 135 | return [(lambda v: v.replace('\n', '').replace(' ', '').strip())(v) for v in values] 136 | 137 | class 
Image(Text): 138 | def __call__(self, values): 139 | return super(Image, self).__call__([ 140 | val if isinstance(val, (dict, list)) else extract_image_url(val) 141 | for val in values 142 | ]) 143 | 144 | 145 | class SafeHtml(Text): 146 | 147 | def __init__(self, parser=None): 148 | if parser is None: 149 | parser = SafeHtmlParser() 150 | self.parser = parser 151 | 152 | def __call__(self, values): 153 | results = [] 154 | for val in values: 155 | if isinstance(val, (dict, list)): 156 | results.append(val) 157 | results.append(self.parser.feed(str(val))) 158 | return results 159 | 160 | 161 | class Regex(): 162 | def __init__(self, regexp): 163 | if isinstance(regexp, six.string_types): 164 | regexp = re.compile(regexp) 165 | self.regexp = regexp.pattern 166 | self._regexp = regexp 167 | 168 | def __call__(self, values): 169 | results = [] 170 | for value in values: 171 | if isinstance(value, (dict, list)): 172 | results.append(value) 173 | if not value: 174 | continue 175 | match = self._regexp.search(value) 176 | if not match: 177 | continue 178 | results.append( 179 | u"".join([g for g in match.groups() or match.group() if g]) 180 | ) 181 | return results 182 | 183 | def __deepcopy__(self, memo): 184 | """Overwrite deepcopy so that the regexp is recalculated.""" 185 | return type(self)(deepcopy(self.regexp, memo)) 186 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapy+ 2 | 3 | scrapy-plus是一个scrapy的辅助扩展工具包,提供了一系列在开发爬虫过程中需要的一些扩展插件,便于大家在以后的爬虫项目中快速开发。 4 | 5 | > 更多详细的使用方法请移玉步到专栏内参考 [《从0学爬虫专栏》](https://www.imooc.com/read/34) 6 | 7 | ``` 8 | $ pip install scrapy_plus 9 | ``` 10 | 11 | Scrapy+提供以下的内容 12 | 13 | - 过滤器 14 | - Redis 去重过滤器 15 | - Redis 布隆去重过滤器 16 | - 中间件 17 | - 自登录中间件 18 | - 花瓣网专用中间件 19 | - Chrome通用中间件 20 | - Splash渲染中间件 21 | - Tor中间件 22 | - 随机UA中间件 23 | - 随机代理中间件 24 | - 管道 25 | - MongoDB数据存储管道 26 | - 可支持阿里云的OSS图片管道 27 | - SQL存储端 28 | - 输入/输出处理器 29 | - 蜘蛛 30 | - `BookSpider` 31 | - `NeteaseSpider` 32 | - `TaobaoSpider` 33 | 34 | 35 | ## 过滤器 36 | 37 | 根据本专栏第4章网易爬虫的优化——大规模数据处理技术中所介绍的Redis去重过滤器与高效的布隆过滤器的内容进行抽象与优化。 38 | 39 | 所有的过滤器都放置于`scrapy_plus.dupefilters`下。 40 | 41 | ### Redis 去重过滤器 42 | 43 | `RedisDupeFilter`的模块位置: 44 | 45 | ```python 46 | scrapy_plus.dupefilters.RedisDupeFilter 47 | ``` 48 | 49 | 基于Redis使用`Set`存储曾访问过的URL。 50 | 51 | **使用方法** 52 | 53 | 首先要安装Redis或者起动一个Redis的容器,具体做法请参考本专栏的第4章第2节去重处理——高性能爬虫调优技术中的Redis安装介绍。 54 | 55 | `RedisDupeFilter`的使用极其简单,只需要在配置文件中做以相应的修改即可。具体做法是在`settings`文件内引入以下的内容: 56 | 57 | ```py 58 | # 覆盖原有的去重过滤器 59 | DUPEFILTER_CLASS = 'scrapy_plus.dupefilters.RedisDupeFilter' 60 | REDIS_PORT = 6379 # REDIS服务器端口 61 | REDIS_HOST = '127.0.0.1' # REDIS服务器地址 62 | REDIS_DB = 0 # 数据库名 63 | ``` 64 | 65 | **默认配置** 66 | 67 | ```py 68 | REDIS_PORT = 6379 # REDIS服务器端口 69 | REDIS_HOST = '127.0.0.1' # REDIS服务器地址 70 | REDIS_DB = 0 # 数据库名 71 | ``` 72 | 73 | 如果你不修改Redis的安装配置可以只在`settings.py`文件中加入以下这行即可: 74 | 75 | ``` 76 | DUPEFILTER_CLASS = 'scrapy_plus.dupefilters.RedisDupeFilter' 77 | ``` 78 | 79 | 80 | 81 | ### Redis 布隆去重过滤器 82 | 83 | 这是基于`RedisDupeFilter`并加入布隆算法后的最高效的去重过滤器。 84 | 85 | `RedisBloomDupeFilter`的模块位置: 86 | 87 | ``` 88 | scrapy_plus.dupefilters.RedisBloomDupeFilter 89 | ``` 90 | 91 | **使用方法** 92 | 93 | 使用方法与`RedisDupeFilter`相同在`settings`文件内引入以下的内容: 94 | 95 | ```py 96 | # 覆盖原有的去重过滤器 97 | DUPEFILTER_CLASS = 'scrapy_plus.dupefilters.RedisBloomDupeFilter' 98 | REDIS_PORT = 6379 # REDIS服务器端口 99 | REDIS_HOST = '127.0.0.1' # 
REDIS服务器地址 100 | REDIS_DB = 0 # 数据库名 101 | ``` 102 | 103 | **默认配置** 104 | 105 | ``` 106 | REDIS_PORT = 6379 # REDIS服务器端口 107 | REDIS_HOST = '127.0.0.1' # REDIS服务器地址 108 | REDIS_DB = 0 # 数据库名 109 | BLOOMFILTER_REDIS_KEY = 'bloomfilter' # 去重键名 110 | BLOOMFILTER_BLOCK_NUMBER = 1 # 块大小 111 | ``` 112 | 113 | 与`RedisDupeFilter`不同的是`RedisBloomDupeFilter`增加了两个配置项: 114 | 115 | - `BLOOMFILTER_REDIS_KEY` - 设置Redis中去重键的名称。 116 | - `BLOOMFILTER_BLOCK_NUMBER` - 设置布隆算法块的大小。 117 | 118 | 这两个选项推荐使用默认值,也可以根据你项目的实际情况进行调节。 119 | 120 | 121 | 122 | ## 中间件 123 | 124 | Scrapy+的中间件放置于`scrapy_plus.middlewares`包内。 125 | 126 | ### 自登录中间件 127 | 128 | 这是一个通用的中间件,可以应用于所有能提供登陆URL的网站,`LoginMiddleWare`会判断是否已登录,如果已登录则不会进行重复登录。 129 | 130 | 使用这个中间件时需要在`settings.py`配置文件中的`COOKIES_ENABLED`打开 131 | 132 | 例如在网页中找到一个`
<form>`元素: 133 | 134 | ```html 135 | <form action="/login" method="post"> 136 |   <input type="text" name="username"> 137 |   <input type="password" name="password"> 138 |   <button type="submit">登录</button>
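<!-- Illustrative markup only: the input "name" values above are whatever you configure as LOGIN_USR_FIELD / LOGIN_PWD_FIELD -->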
139 | ``` 140 | 那就完全可以应用`LoginMiddleware`完成自动登录。 141 | 142 | **模块位置** 143 | 144 | ``` 145 | scrapy_plus.middlewares.LoginMiddleWare 146 | ``` 147 | 148 | 以下是自动登录中间件的配置: 149 | 150 | ```python 151 | COOKIES_ENABLED=True 152 | LOGIN_URL = '网站登录地址' 153 | LOGIN_USR = '用户名' 154 | LOGIN_PWD = '密码' 155 | LOGIN_USR_FIELD = '用户名input元素名称(name)' 156 | LOGIN_PWD_FIELD = '密码input元素名称(name)' 157 | DOWNLOADER_MIDDLEWARES = { 158 | 'scrapyplus.middlewares.LoginMiddleWare': 330 159 | } 160 | ``` 161 | 162 | 163 | 164 | ### 花瓣网专用中间件 165 | 166 | 这是一个基于Chrome无头浏览器,可以自动登录花瓣网并能正确渲染花瓣网javascript页面的中间件。 167 | 168 | 169 | 170 | **模块位置** 171 | 172 | ```python 173 | scrapy_plus.middlewares.HuabanMiddleware 174 | ``` 175 | 176 | 177 | 178 | **使用方法** 179 | 180 | 首先,你需要安装chromedriver,具体做法请参考本专栏第6章第4节用Chrome无头浏览器处理js网页关于安装chromedriver的相关内容。 181 | 182 | 其次,你需要拥有一个花瓣网的注册账号。 183 | 184 | 最后,在`settings.py`配置文件内加入以下的配置项: 185 | 186 | ```python 187 | SELENIUM_TIMEOUT = 30 # 设置页面打开的超时秒数 188 | CHROMEDRIVER = "/path/to/chrome" # Chrome浏览器驱动地址 189 | # 以下的macOS上示例: 190 | # CHROMEDRIVER = "/usr/local/Caskroom/chromedriver/75.0.3770.90/chromedriver" 191 | DOWNLOADER_MIDDLEWARES = { 192 | 'scrapyplus.middlewares.HuabanMiddleware': 100 193 | } 194 | HUABAN_USR="您在花瓣网上的用户名" 195 | HUABAN_PWD="你在花瓣网上注册的用户密码" 196 | ``` 197 | 198 | 有了这个中间件你就可以像写普通蜘蛛一样来编写花瓣蜘蛛的逻辑。 199 | 200 | 201 | 202 | ### Chrome通用中间件 203 | 204 | Chrome 无头浏览器仿真中间件。让爬虫用Chrome来访问目标URL,完美解决富JS页面的问题。 205 | 206 | > 但仅可以对不需要进行登录的网站应用此中间件。 207 | 208 | ```python 209 | SELENIUM_TIMEOUT = 30 # 设置页面打开的超时秒数 210 | CHROMEDRIVER = "/path/to/chrome" # Chrome浏览器驱动地址 211 | DOWNLOADER_MIDDLEWARES = { 212 | 'scrapyplus.middlewares.ChromeMiddleware': 800 213 | } 214 | ``` 215 | 216 | 217 | 218 | ### Splash渲染中间件 219 | 220 | 基于scrappy_splash进行扩展,简化splash的用法,使蜘蛛可以不发出`SplashRequest`就能使用splash进行javascript页面的渲染。 221 | 222 | **模块位置** 223 | 224 | ```python 225 | scrapy_plus.middlewares.SplashSpiderMiddleware 226 | ``` 227 | 228 | 229 | Splash 中间件,可将请求转发至指定的Splash服务,使蜘蛛具有浏览器仿真功能。 230 | 231 | ```python 232 | WAIT_FOR_ELEMENT = "选择器" # 等待该元素被加载成功才认为页面加载完成 233 | DOWNLOADER_MIDDLEWARES = { 234 | } 235 | 236 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 237 | 238 | DOWNLOADER_MIDDLEWARES = { 239 | 'scrapy_splash.SplashCookiesMiddleware': 723, 240 | 'scrapy_splash.SplashMiddleware': 725, 241 | 'scrapy_plus.middlewares.SplashSpiderMiddleware': 800, 242 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810 243 | } 244 | 245 | SPIDER_MIDDLEWARES = { 246 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 247 | } 248 | ``` 249 | 250 | 251 | 252 | ### 随机UA中间件 253 | 254 | 为爬取请求随机分配UA。具体原理请参考本专栏第5章第5节反爬初步之客户端仿真。 255 | 256 | > 使用前需要将`scrapy.downloadermiddlewares.useragent.UserAgentMiddleware`禁用。 257 | 258 | **模块位置** 259 | 260 | 261 | ```python 262 | scrapyplus.middlewares.RandomUserAgentMiddleware 263 | ``` 264 | 265 | **使用方法** 266 | 267 | 在`settings.py`添加以下的配置: 268 | 269 | ```python 270 | DOWNLOADER_MIDDLEWARES = { 271 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 272 | 'scrapyplus.middlewares.RandomUserAgentMiddleware': 500 273 | } 274 | ``` 275 | 276 | 在默认情况下RandomUserAgentMiddleware会提供一系列常用的UA,还可以在settings.py文件内配置`USER_AGENTS`添加自定义的UA。 277 | 如下所示: 278 | 279 | ```python 280 | ## 可随机增加更多的UA,中间件会进行自动随机选择 281 | USER_AGENTS = [ 282 | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0', 283 | 'Mozilla/5.0 (Linux; U; Android 2.2) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1', 284 | 'Mozilla/5.0 (Windows NT 
5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1', 285 | 'Mozilla/5.0 (Linux; Android 6.0.1; SM-G532G Build/MMB29T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.83 Mobile Safari/537.36', 286 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.5.6 (KHTML, like Gecko) Version/11.0.3 Safari/604.5.6' 287 | ] 288 | ``` 289 | 290 | ### 随机代理中间件 291 | 292 | 当爬虫向目标网站发出请求时在配置文件的`HTTP_PROXIES`列表中随机选择一个地址。 293 | 294 | **模块位置** 295 | 296 | ```python 297 | scrapyplus.middlewares.RandomProxyMiddleware 298 | ``` 299 | 300 | 301 | 302 | 在`settings.py`文件内添加以下的配置: 303 | 304 | ```python 305 | DOWNLOADER_MIDDLEWARES = { 306 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': None, 307 | 'scrapyplus.middlewares.RandomProxyMiddleware': 750 308 | } 309 | # 以下为代理列表 310 | HTTP_PROXIES=[ 311 | '203.11.43.22:8080' 312 | ] 313 | ``` 314 | 315 | 316 | 317 | ### Tor 中间件 318 | 319 | **模块位置** 320 | 321 | ```python 322 | scrapyplus.middlewares.TorProxyMiddleware 323 | ``` 324 | 325 | 洋葱头代理中间件,让你的蜘蛛不停地更换IP地址,化身万千。需要先安装 tor 与 privoxy 具体配置方法请参考《虫术——python绝技》 326 | 327 | ```py 328 | # Tor代理 329 | TOR_PROXY = 'http://127.0.0.1:8118' # 8118是Privoxy的默认代理端口 330 | TOR_CTRL_PORT = 9051 331 | TOR_PASSWORD = 'mypassword' 332 | TOR_CHANGE_AFTER_TIMES = 50 # 在发出多少次请求之后更换IP地址。 333 | ``` 334 | 335 | 336 | 337 | ## 管道 338 | 339 | Scrapy+的中间件放置于`scrapy_plus.pipelines`包内。 340 | 341 | ### MongoDB数据存储管道 342 | 343 | 将`Item`数据项目直接写入到MongoDB。使用此管道前需要先安装MongoDB或起动MongoDB的Docker实例。 344 | 345 | **模块位置** 346 | 347 | ```python 348 | scrapy_plus.piplines.MongoDBPipeline 349 | ``` 350 | 351 | 可以将Item直接写入MongoDB数据库中。 352 | 353 | **使用方法** 354 | 355 | 在`settings.py`文件内加入以下配置项: 356 | 357 | ```py 358 | ITEM_PIPELINES = {'scrapy_plus.pipelines.MongoDBPipeline':2} 359 | 360 | MONGODB_SERVER = "localhost" # mongodb服务器地址 361 | MONGODB_PORT = 27017 # mongodb服务端口 362 | MONGODB_DB = "数据库名" # 数据库名 363 | MONGODB_COLLECTION = "表名" # 表名 364 | ``` 365 | 366 | ### 可支持阿里云的OSS图片管道 367 | 368 | scrapy 搭载的图片管道`ImagesPipeline`只能将图片保存到本地或者S3、Google这些中国不能用的云储存,在第6章第5节将花瓣爬虫采访的图片存储于阿里云一节中我就介绍过这个管道的实现过程与原理。 369 | 370 | **模块位置** 371 | 372 | ``` 373 | scrapy_plus.pipelines.ImagesPipeline 374 | ``` 375 | 376 | **使用方法** 377 | 378 | 需要在配置文件中`settings.py`加入以下的配置项 379 | 380 | ``` 381 | IMAGE_STORE='oss://.<外网EndPoint>.aliyuncs.com/<子目录>' 382 | OSS_ACCESS_KEY = 'OSS上访问公钥' 383 | OSS_ACCESS_SECRET = 'OSS的访问私钥' 384 | # 在Item中存储目标图片下载地址的字段名 385 | IMAGES_URLS_FIELD = 'img_urls' 386 | # 当下载完后将下载结果对象写入到Item对象的字段名 387 | IMAGES_RESULT_FIELD = 'img_files' 388 | # 加载图片存储管道 389 | ITEM_PIPELINES = { 390 | 'huaban.pipelines.images.ImagesPipeline': 2 391 | } 392 | 393 | ``` 394 | 395 | 396 | 397 | 398 | 399 | ## 存储端 400 | 401 | ### SQL存储端 402 | 403 | 将数据一次性地写入SQL数据库存储端。可以支持sqlite, postgresql和mysql多种SQL类型的数据库。 404 | 405 | **模块位置** 406 | 407 | ``` 408 | scrapy_plus.extensions.SQLFeedStorage 409 | ``` 410 | 411 | **使用方法** 412 | 413 | `SQLFeedStorage`的使用方法有点复杂,在使用前需要基于SQLAlchemy进行数据建模,具体方法可以参考本专栏第5章第4节基于SQL的数据导出机制。 414 | 415 | 在`settings.py`需要加入以下的配置项 416 | 417 | ```py 418 | # 数据存储 419 | ORM_MODULE = 'movies.entities' 420 | ORM_METABASE = 'Base' 421 | ORM_ENTITY = 'Movie' 422 | 423 | FEED_FORMAT = 'entity' 424 | FEED_EXPORTERS = { 425 | 'entity': 'scrapyplus.extensions.SQLItemExporter' 426 | } 427 | 428 | FEED_URI = 'dialect+driver://username:password@host:port/database' # 默认后端存储文件的名称 429 | FEED_STORAGES = { 430 | 'sqlite': 'scrapyplus.extensions.SQLFeedStorage', 431 | 'postgresql': 'scrapyplus.extensions.SQLFeedStorage', 432 | 'mysql': 'scrapyplus.extensions.SQLFeedStorage' 
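    # The scheme of FEED_URI ('sqlite', 'postgresql' or 'mysql' above) selects which storage entry is used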
433 | } 434 | ``` 435 | 436 | 437 | 438 | ## 输入/输出处理器 439 | 440 | Scrapy+的中间件放置于`scrapy_plus.processors`包内。我在本专栏第五章第一、第二节都有介绍过输入输出处理器的用法以及如何开发自定义的输入/输出处理器。在Scrapy+中则更进一步提供了8个最常用的输入/输出处理器。他们分别是: 441 | 442 | - `Text` - 提取并输出文字字符串 443 | - `Number` - 提取并输出数字格式字符串 444 | - `Price` - 提取出价格格式的字符串 445 | - `Date` - 提取日期格式的字符串 446 | - `Url` - 提取并输出URL格式字符串 447 | - `Image` - 提取并输出图片地址格式字符串 448 | - `SafeHtml` - 提取并输出移除所有可被执行的HTML代码后的HTML格式化字符串 449 | - `Regex` - 提取并输出符合正规表达式的字符串 450 | 451 | **模块位置** 452 | 453 | ```python 454 | scrapy_plus.processors 455 | ``` 456 | 457 | 458 | 459 | ## 蜘蛛 460 | 461 | Scrapy+将本专栏中两个蜘蛛放到了`scrapy_plus.spiders`包内,方便读者可以在不编写蜘蛛的情况下只配`settings.py`就可以直接使用。 462 | 463 | Scrapy+总共提供了三个蜘蛛类,分别是: 464 | 465 | 类名 | 蜘蛛名 | 说明 466 | --|--|-- 467 | `BookSpider` | `'doubanbook'` |用于拉取豆瓣图书的专用蜘蛛,该蜘蛛将会返回`scrapy_plus.items.BookItem`的数据项对象。 468 | `NeteaseSpider` | `'netease'`| 用于拉取网易新闻的专用蜘蛛,该蜘蛛将会返回`scrapy_plus.items.NewsItem`数据项对象 469 | `TaobaoSpider`| `'taobao'`| 用于拉取淘宝网搜索页面的专用蜘蛛,该蜘蛛将会返回`scrapy_plus.items.ProductItem`数据项对象 470 | 471 | 它们的使方法如下,在`settings.py`内修改 472 | 473 | ``` 474 | BOT_NAME = '蜘蛛名' 475 | SPIDER_MODULES = ['scrapy_plus.spiders'] # 指定爬虫类所在的包 476 | NEWSPIDER_MODULE = 'scrapy_plus.spiders' # 指定爬虫模块 477 | ``` 478 | 479 | 淘宝蜘蛛是在《Python绝技——虫术》一书中的一个经典例子,要使用这个蜘蛛需要使用慢速爬虫与自动登录的Chrome中间件,同时这个蜘蛛需要被继承并重新实现`gen_keywords`方法后才能使用: 480 | 481 | ```python 482 | from scrapy_plus.spider import TaobaoSpider 483 | 484 | class MyTaobaoSpider(TaobaoSpider): 485 | def gen_keywords(self): 486 | return ["小米","红酒"] 487 | ``` 488 | 489 | 这个蜘蛛是利用淘宝搜索页进行爬取的,所以你需要返回需要搜索的关键字,该蜘蛛就会对这些关键字下搜索出来的产品进行爬取。 --------------------------------------------------------------------------------
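
For reference, below is a minimal, illustrative `settings.py` sketch for running the bundled `TaobaoSpider` together with the Chrome middleware described above. The delay/throttle values and the `chromedriver` path are assumptions to adjust for your own environment.

```python
# settings.py — illustrative sketch only; numbers and paths are assumptions
BOT_NAME = 'taobao'
SPIDER_MODULES = ['scrapy_plus.spiders']    # use the bundled spiders
NEWSPIDER_MODULE = 'scrapy_plus.spiders'

# crawl slowly, as recommended for the Taobao spider
DOWNLOAD_DELAY = 3              # assumed value
AUTOTHROTTLE_ENABLED = True     # assumed value

# render the search pages with the headless-Chrome middleware
SELENIUM_TIMEOUT = 30
CHROMEDRIVER = '/path/to/chromedriver'      # path to your chromedriver binary
DOWNLOADER_MIDDLEWARES = {
    'scrapy_plus.middlewares.ChromeMiddleware': 800,
}
```

The input/output processors listed in the 输入/输出处理器 section can be wired into an `ItemLoader` like any other Scrapy processors. A hedged sketch follows; the item fields, spider name, URL and CSS selectors are made up for illustration.

```python
# Illustrative only: fields, selectors and URL are assumptions.
import scrapy
from scrapy import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
from scrapy_plus.processors import Text, Price, Date


class ArticleItem(Item):
    # input processors clean the extracted values, TakeFirst keeps the first one
    title = Field(input_processor=Text(), output_processor=TakeFirst())
    price = Field(input_processor=Price(), output_processor=TakeFirst())
    pub_date = Field(input_processor=Date('%Y-%m-%d'), output_processor=TakeFirst())


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com/']

    def parse(self, response):
        loader = ItemLoader(item=ArticleItem(), response=response)
        loader.add_css('title', 'h1::text')
        loader.add_css('price', '.price::text')
        loader.add_css('pub_date', '.date::text')
        yield loader.load_item()
```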